// © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2013, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  normalizer2.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov22
16 *   created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
22 /**
23  * \file
24  * \brief C++ API: New API for Unicode Normalization.
25  */
26 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
42 /**
43  * Unicode normalization functionality for standard Unicode normalization or
44  * for using custom mapping tables.
45  * All instances of this class are unmodifiable/immutable.
46  * Instances returned by getInstance() are singletons that must not be deleted by the caller.
47  * The Normalizer2 class is not intended for public subclassing.
48  *
49  * The primary functions are to produce a normalized string and to detect whether
50  * a string is already normalized.
51  * The most commonly used normalization forms are those defined in
52  * http://www.unicode.org/unicode/reports/tr15/
53  * However, this API supports additional normalization forms for specialized purposes.
54  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
55  * and can be used in implementations of UTS #46.
56  *
57  * Not only are the standard compose and decompose modes supplied,
58  * but additional modes are provided as documented in the Mode enum.
59  *
60  * Some of the functions in this class identify normalization boundaries.
61  * At a normalization boundary, the portions of the string
62  * before it and starting from it do not interact and can be handled independently.
63  *
64  * The spanQuickCheckYes() stops at a normalization boundary.
65  * When the goal is a normalized string, then the text before the boundary
66  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67  *
68  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69  * a character is guaranteed to be at a normalization boundary,
70  * regardless of context.
71  * This is used for moving from one normalization boundary to the next
72  * or preceding boundary, and for performing iterative normalization.
73  *
74  * Iterative normalization is useful when only a small portion of a
75  * longer string needs to be processed.
76  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78  * (to process only the substring for which sort key bytes are computed).
79  *
80  * The set of normalization boundaries returned by these functions may not be
81  * complete: There may be more boundaries that could be returned.
82  * Different functions may return different boundaries.
83  * @stable ICU 4.4
84  */
85 class U_COMMON_API Normalizer2 : public UObject {
86 public:
87     /**
88      * Destructor.
89      * @stable ICU 4.4
90      */
91     ~Normalizer2();
92 
93     /**
94      * Returns a Normalizer2 instance for Unicode NFC normalization.
95      * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
96      * Returns an unmodifiable singleton instance. Do not delete it.
97      * @param errorCode Standard ICU error code. Its input value must
98      *                  pass the U_SUCCESS() test, or else the function returns
99      *                  immediately. Check for U_FAILURE() on output or use with
100      *                  function chaining. (See User Guide for details.)
101      * @return the requested Normalizer2, if successful
102      * @stable ICU 49
103      */
104     static const Normalizer2 *
105     getNFCInstance(UErrorCode &errorCode);
106 
107     /**
108      * Returns a Normalizer2 instance for Unicode NFD normalization.
109      * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
110      * Returns an unmodifiable singleton instance. Do not delete it.
111      * @param errorCode Standard ICU error code. Its input value must
112      *                  pass the U_SUCCESS() test, or else the function returns
113      *                  immediately. Check for U_FAILURE() on output or use with
114      *                  function chaining. (See User Guide for details.)
115      * @return the requested Normalizer2, if successful
116      * @stable ICU 49
117      */
118     static const Normalizer2 *
119     getNFDInstance(UErrorCode &errorCode);
120 
121     /**
122      * Returns a Normalizer2 instance for Unicode NFKC normalization.
123      * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
124      * Returns an unmodifiable singleton instance. Do not delete it.
125      * @param errorCode Standard ICU error code. Its input value must
126      *                  pass the U_SUCCESS() test, or else the function returns
127      *                  immediately. Check for U_FAILURE() on output or use with
128      *                  function chaining. (See User Guide for details.)
129      * @return the requested Normalizer2, if successful
130      * @stable ICU 49
131      */
132     static const Normalizer2 *
133     getNFKCInstance(UErrorCode &errorCode);
134 
135     /**
136      * Returns a Normalizer2 instance for Unicode NFKD normalization.
137      * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
138      * Returns an unmodifiable singleton instance. Do not delete it.
139      * @param errorCode Standard ICU error code. Its input value must
140      *                  pass the U_SUCCESS() test, or else the function returns
141      *                  immediately. Check for U_FAILURE() on output or use with
142      *                  function chaining. (See User Guide for details.)
143      * @return the requested Normalizer2, if successful
144      * @stable ICU 49
145      */
146     static const Normalizer2 *
147     getNFKDInstance(UErrorCode &errorCode);
148 
149     /**
150      * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
151      * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
152      * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
153      *
154      * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
155      * Returns an unmodifiable singleton instance. Do not delete it.
156      * @param errorCode Standard ICU error code. Its input value must
157      *                  pass the U_SUCCESS() test, or else the function returns
158      *                  immediately. Check for U_FAILURE() on output or use with
159      *                  function chaining. (See User Guide for details.)
160      * @return the requested Normalizer2, if successful
161      * @stable ICU 49
162      */
163     static const Normalizer2 *
164     getNFKCCasefoldInstance(UErrorCode &errorCode);
165 
166     /**
167      * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
168      * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
169      * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
170      *
171      * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
172      * Returns an unmodifiable singleton instance. Do not delete it.
173      * @param errorCode Standard ICU error code. Its input value must
174      *                  pass the U_SUCCESS() test, or else the function returns
175      *                  immediately. Check for U_FAILURE() on output or use with
176      *                  function chaining. (See User Guide for details.)
177      * @return the requested Normalizer2, if successful
178      * @stable ICU 74
179      */
180     static const Normalizer2 *
181     getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
182 
183     /**
184      * Returns a Normalizer2 instance which uses the specified data file
185      * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
186      * and which composes or decomposes text according to the specified mode.
187      * Returns an unmodifiable singleton instance. Do not delete it.
188      *
189      * Use packageName=nullptr for data files that are part of ICU's own data.
190      * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
191      * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
192      * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
193      *
194      * @param packageName nullptr for ICU built-in data, otherwise application data package name
195      * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
196      * @param mode normalization mode (compose or decompose etc.)
197      * @param errorCode Standard ICU error code. Its input value must
198      *                  pass the U_SUCCESS() test, or else the function returns
199      *                  immediately. Check for U_FAILURE() on output or use with
200      *                  function chaining. (See User Guide for details.)
201      * @return the requested Normalizer2, if successful
202      * @stable ICU 4.4
203      */
204     static const Normalizer2 *
205     getInstance(const char *packageName,
206                 const char *name,
207                 UNormalization2Mode mode,
208                 UErrorCode &errorCode);
209 
210     /**
211      * Returns the normalized form of the source string.
212      * @param src source string
213      * @param errorCode Standard ICU error code. Its input value must
214      *                  pass the U_SUCCESS() test, or else the function returns
215      *                  immediately. Check for U_FAILURE() on output or use with
216      *                  function chaining. (See User Guide for details.)
217      * @return normalized src
218      * @stable ICU 4.4
219      */
220     UnicodeString
normalize(const UnicodeString & src,UErrorCode & errorCode)221     normalize(const UnicodeString &src, UErrorCode &errorCode) const {
222         UnicodeString result;
223         normalize(src, result, errorCode);
224         return result;
225     }
226     /**
227      * Writes the normalized form of the source string to the destination string
228      * (replacing its contents) and returns the destination string.
229      * The source and destination strings must be different objects.
230      * @param src source string
231      * @param dest destination string; its contents is replaced with normalized src
232      * @param errorCode Standard ICU error code. Its input value must
233      *                  pass the U_SUCCESS() test, or else the function returns
234      *                  immediately. Check for U_FAILURE() on output or use with
235      *                  function chaining. (See User Guide for details.)
236      * @return dest
237      * @stable ICU 4.4
238      */
239     virtual UnicodeString &
240     normalize(const UnicodeString &src,
241               UnicodeString &dest,
242               UErrorCode &errorCode) const = 0;
243 
244     /**
245      * Normalizes a UTF-8 string and optionally records how source substrings
246      * relate to changed and unchanged result substrings.
247      *
248      * Implemented completely for all built-in modes except for FCD.
249      * The base class implementation converts to & from UTF-16 and does not support edits.
250      *
251      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
252      * @param src       Source UTF-8 string.
253      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
254      *                  sink.Flush() is called at the end.
255      * @param edits     Records edits for index mapping, working with styled text,
256      *                  and getting only changes (if any).
257      *                  The Edits contents is undefined if any error occurs.
258      *                  This function calls edits->reset() first unless
259      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
260      * @param errorCode Standard ICU error code. Its input value must
261      *                  pass the U_SUCCESS() test, or else the function returns
262      *                  immediately. Check for U_FAILURE() on output or use with
263      *                  function chaining. (See User Guide for details.)
264      * @stable ICU 60
265      */
266     virtual void
267     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
268                   Edits *edits, UErrorCode &errorCode) const;
269 
270     /**
271      * Appends the normalized form of the second string to the first string
272      * (merging them at the boundary) and returns the first string.
273      * The result is normalized if the first string was normalized.
274      * The first and second strings must be different objects.
275      * @param first string, should be normalized
276      * @param second string, will be normalized
277      * @param errorCode Standard ICU error code. Its input value must
278      *                  pass the U_SUCCESS() test, or else the function returns
279      *                  immediately. Check for U_FAILURE() on output or use with
280      *                  function chaining. (See User Guide for details.)
281      * @return first
282      * @stable ICU 4.4
283      */
284     virtual UnicodeString &
285     normalizeSecondAndAppend(UnicodeString &first,
286                              const UnicodeString &second,
287                              UErrorCode &errorCode) const = 0;
288     /**
289      * Appends the second string to the first string
290      * (merging them at the boundary) and returns the first string.
291      * The result is normalized if both the strings were normalized.
292      * The first and second strings must be different objects.
293      * @param first string, should be normalized
294      * @param second string, should be normalized
295      * @param errorCode Standard ICU error code. Its input value must
296      *                  pass the U_SUCCESS() test, or else the function returns
297      *                  immediately. Check for U_FAILURE() on output or use with
298      *                  function chaining. (See User Guide for details.)
299      * @return first
300      * @stable ICU 4.4
301      */
302     virtual UnicodeString &
303     append(UnicodeString &first,
304            const UnicodeString &second,
305            UErrorCode &errorCode) const = 0;
306 
307     /**
308      * Gets the decomposition mapping of c.
309      * Roughly equivalent to normalizing the String form of c
310      * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
311      * returns false and does not write a string
312      * if c does not have a decomposition mapping in this instance's data.
313      * This function is independent of the mode of the Normalizer2.
314      * @param c code point
315      * @param decomposition String object which will be set to c's
316      *                      decomposition mapping, if there is one.
317      * @return true if c has a decomposition, otherwise false
318      * @stable ICU 4.6
319      */
320     virtual UBool
321     getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
322 
323     /**
324      * Gets the raw decomposition mapping of c.
325      *
326      * This is similar to the getDecomposition() method but returns the
327      * raw decomposition mapping as specified in UnicodeData.txt or
328      * (for custom data) in the mapping files processed by the gennorm2 tool.
329      * By contrast, getDecomposition() returns the processed,
330      * recursively-decomposed version of this mapping.
331      *
332      * When used on a standard NFKC Normalizer2 instance,
333      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
334      *
335      * When used on a standard NFC Normalizer2 instance,
336      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
337      * in this case, the result contains either one or two code points (=1..4 char16_ts).
338      *
339      * This function is independent of the mode of the Normalizer2.
340      * The default implementation returns false.
341      * @param c code point
342      * @param decomposition String object which will be set to c's
343      *                      raw decomposition mapping, if there is one.
344      * @return true if c has a decomposition, otherwise false
345      * @stable ICU 49
346      */
347     virtual UBool
348     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
349 
350     /**
351      * Performs pairwise composition of a & b and returns the composite if there is one.
352      *
353      * Returns a composite code point c only if c has a two-way mapping to a+b.
354      * In standard Unicode normalization, this means that
355      * c has a canonical decomposition to a+b
356      * and c does not have the Full_Composition_Exclusion property.
357      *
358      * This function is independent of the mode of the Normalizer2.
359      * The default implementation returns a negative value.
360      * @param a A (normalization starter) code point.
361      * @param b Another code point.
362      * @return The non-negative composite code point if there is one; otherwise a negative value.
363      * @stable ICU 49
364      */
365     virtual UChar32
366     composePair(UChar32 a, UChar32 b) const;
367 
368     /**
369      * Gets the combining class of c.
370      * The default implementation returns 0
371      * but all standard implementations return the Unicode Canonical_Combining_Class value.
372      * @param c code point
373      * @return c's combining class
374      * @stable ICU 49
375      */
376     virtual uint8_t
377     getCombiningClass(UChar32 c) const;
378 
379     /**
380      * Tests if the string is normalized.
381      * Internally, in cases where the quickCheck() method would return "maybe"
382      * (which is only possible for the two COMPOSE modes) this method
383      * resolves to "yes" or "no" to provide a definitive result,
384      * at the cost of doing more work in those cases.
385      * @param s input string
386      * @param errorCode Standard ICU error code. Its input value must
387      *                  pass the U_SUCCESS() test, or else the function returns
388      *                  immediately. Check for U_FAILURE() on output or use with
389      *                  function chaining. (See User Guide for details.)
390      * @return true if s is normalized
391      * @stable ICU 4.4
392      */
393     virtual UBool
394     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
395     /**
396      * Tests if the UTF-8 string is normalized.
397      * Internally, in cases where the quickCheck() method would return "maybe"
398      * (which is only possible for the two COMPOSE modes) this method
399      * resolves to "yes" or "no" to provide a definitive result,
400      * at the cost of doing more work in those cases.
401      *
402      * This works for all normalization modes.
403      * It is optimized for UTF-8 for all built-in modes except for FCD.
404      * The base class implementation converts to UTF-16 and calls isNormalized().
405      *
406      * @param s UTF-8 input string
407      * @param errorCode Standard ICU error code. Its input value must
408      *                  pass the U_SUCCESS() test, or else the function returns
409      *                  immediately. Check for U_FAILURE() on output or use with
410      *                  function chaining. (See User Guide for details.)
411      * @return true if s is normalized
412      * @stable ICU 60
413      */
414     virtual UBool
415     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
416 
417 
418     /**
419      * Tests if the string is normalized.
420      * For the two COMPOSE modes, the result could be "maybe" in cases that
421      * would take a little more work to resolve definitively.
422      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
423      * combination of quick check + normalization, to avoid
424      * re-checking the "yes" prefix.
425      * @param s input string
426      * @param errorCode Standard ICU error code. Its input value must
427      *                  pass the U_SUCCESS() test, or else the function returns
428      *                  immediately. Check for U_FAILURE() on output or use with
429      *                  function chaining. (See User Guide for details.)
430      * @return UNormalizationCheckResult
431      * @stable ICU 4.4
432      */
433     virtual UNormalizationCheckResult
434     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
435 
436     /**
437      * Returns the end of the normalized substring of the input string.
438      * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
439      * the substring <code>UnicodeString(s, 0, end)</code>
440      * will pass the quick check with a "yes" result.
441      *
442      * The returned end index is usually one or more characters before the
443      * "no" or "maybe" character: The end index is at a normalization boundary.
444      * (See the class documentation for more about normalization boundaries.)
445      *
446      * When the goal is a normalized string and most input strings are expected
447      * to be normalized already, then call this method,
448      * and if it returns a prefix shorter than the input string,
449      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
450      * @param s input string
451      * @param errorCode Standard ICU error code. Its input value must
452      *                  pass the U_SUCCESS() test, or else the function returns
453      *                  immediately. Check for U_FAILURE() on output or use with
454      *                  function chaining. (See User Guide for details.)
455      * @return "yes" span end index
456      * @stable ICU 4.4
457      */
458     virtual int32_t
459     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
460 
461     /**
462      * Tests if the character always has a normalization boundary before it,
463      * regardless of context.
464      * If true, then the character does not normalization-interact with
465      * preceding characters.
466      * In other words, a string containing this character can be normalized
467      * by processing portions before this character and starting from this
468      * character independently.
469      * This is used for iterative normalization. See the class documentation for details.
470      * @param c character to test
471      * @return true if c has a normalization boundary before it
472      * @stable ICU 4.4
473      */
474     virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
475 
476     /**
477      * Tests if the character always has a normalization boundary after it,
478      * regardless of context.
479      * If true, then the character does not normalization-interact with
480      * following characters.
481      * In other words, a string containing this character can be normalized
482      * by processing portions up to this character and after this
483      * character independently.
484      * This is used for iterative normalization. See the class documentation for details.
485      * Note that this operation may be significantly slower than hasBoundaryBefore().
486      * @param c character to test
487      * @return true if c has a normalization boundary after it
488      * @stable ICU 4.4
489      */
490     virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
491 
492     /**
493      * Tests if the character is normalization-inert.
494      * If true, then the character does not change, nor normalization-interact with
495      * preceding or following characters.
496      * In other words, a string containing this character can be normalized
497      * by processing portions before this character and after this
498      * character independently.
499      * This is used for iterative normalization. See the class documentation for details.
500      * Note that this operation may be significantly slower than hasBoundaryBefore().
501      * @param c character to test
502      * @return true if c is normalization-inert
503      * @stable ICU 4.4
504      */
505     virtual UBool isInert(UChar32 c) const = 0;
506 };
507 
508 /**
509  * Normalization filtered by a UnicodeSet.
510  * Normalizes portions of the text contained in the filter set and leaves
511  * portions not contained in the filter set unchanged.
512  * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
513  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
514  * This class implements all of (and only) the Normalizer2 API.
515  * An instance of this class is unmodifiable/immutable but is constructed and
516  * must be destructed by the owner.
517  * @stable ICU 4.4
518  */
519 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
520 public:
521     /**
522      * Constructs a filtered normalizer wrapping any Normalizer2 instance
523      * and a filter set.
524      * Both are aliased and must not be modified or deleted while this object
525      * is used.
526      * The filter set should be frozen; otherwise the performance will suffer greatly.
527      * @param n2 wrapped Normalizer2 instance
528      * @param filterSet UnicodeSet which determines the characters to be normalized
529      * @stable ICU 4.4
530      */
FilteredNormalizer2(const Normalizer2 & n2,const UnicodeSet & filterSet)531     FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
532             norm2(n2), set(filterSet) {}
533 
534     /**
535      * Destructor.
536      * @stable ICU 4.4
537      */
538     ~FilteredNormalizer2();
539 
540     /**
541      * Writes the normalized form of the source string to the destination string
542      * (replacing its contents) and returns the destination string.
543      * The source and destination strings must be different objects.
544      * @param src source string
545      * @param dest destination string; its contents is replaced with normalized src
546      * @param errorCode Standard ICU error code. Its input value must
547      *                  pass the U_SUCCESS() test, or else the function returns
548      *                  immediately. Check for U_FAILURE() on output or use with
549      *                  function chaining. (See User Guide for details.)
550      * @return dest
551      * @stable ICU 4.4
552      */
553     virtual UnicodeString &
554     normalize(const UnicodeString &src,
555               UnicodeString &dest,
556               UErrorCode &errorCode) const override;
557 
558     /**
559      * Normalizes a UTF-8 string and optionally records how source substrings
560      * relate to changed and unchanged result substrings.
561      *
562      * Implemented completely for most built-in modes except for FCD.
563      * The base class implementation converts to & from UTF-16 and does not support edits.
564      *
565      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
566      * @param src       Source UTF-8 string.
567      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
568      *                  sink.Flush() is called at the end.
569      * @param edits     Records edits for index mapping, working with styled text,
570      *                  and getting only changes (if any).
571      *                  The Edits contents is undefined if any error occurs.
572      *                  This function calls edits->reset() first unless
573      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
574      * @param errorCode Standard ICU error code. Its input value must
575      *                  pass the U_SUCCESS() test, or else the function returns
576      *                  immediately. Check for U_FAILURE() on output or use with
577      *                  function chaining. (See User Guide for details.)
578      * @stable ICU 60
579      */
580     virtual void
581     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
582                   Edits *edits, UErrorCode &errorCode) const override;
583 
584     /**
585      * Appends the normalized form of the second string to the first string
586      * (merging them at the boundary) and returns the first string.
587      * The result is normalized if the first string was normalized.
588      * The first and second strings must be different objects.
589      * @param first string, should be normalized
590      * @param second string, will be normalized
591      * @param errorCode Standard ICU error code. Its input value must
592      *                  pass the U_SUCCESS() test, or else the function returns
593      *                  immediately. Check for U_FAILURE() on output or use with
594      *                  function chaining. (See User Guide for details.)
595      * @return first
596      * @stable ICU 4.4
597      */
598     virtual UnicodeString &
599     normalizeSecondAndAppend(UnicodeString &first,
600                              const UnicodeString &second,
601                              UErrorCode &errorCode) const override;
602     /**
603      * Appends the second string to the first string
604      * (merging them at the boundary) and returns the first string.
605      * The result is normalized if both the strings were normalized.
606      * The first and second strings must be different objects.
607      * @param first string, should be normalized
608      * @param second string, should be normalized
609      * @param errorCode Standard ICU error code. Its input value must
610      *                  pass the U_SUCCESS() test, or else the function returns
611      *                  immediately. Check for U_FAILURE() on output or use with
612      *                  function chaining. (See User Guide for details.)
613      * @return first
614      * @stable ICU 4.4
615      */
616     virtual UnicodeString &
617     append(UnicodeString &first,
618            const UnicodeString &second,
619            UErrorCode &errorCode) const override;
620 
621     /**
622      * Gets the decomposition mapping of c.
623      * For details see the base class documentation.
624      *
625      * This function is independent of the mode of the Normalizer2.
626      * @param c code point
627      * @param decomposition String object which will be set to c's
628      *                      decomposition mapping, if there is one.
629      * @return true if c has a decomposition, otherwise false
630      * @stable ICU 4.6
631      */
632     virtual UBool
633     getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
634 
635     /**
636      * Gets the raw decomposition mapping of c.
637      * For details see the base class documentation.
638      *
639      * This function is independent of the mode of the Normalizer2.
640      * @param c code point whose raw decomposition mapping is requested
641      * @param decomposition String object which will be set to c's
642      *                      raw decomposition mapping, if there is one.
643      * @return true if c has a decomposition (see the decomposition parameter), otherwise false
644      * @stable ICU 49
645      */
646     virtual UBool
647     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
648 
649     /**
650      * Performs pairwise composition of a and b, and returns the composite if there is one.
651      * For details see the base class documentation.
652      *
653      * This function is independent of the mode of the Normalizer2.
654      * @param a A (normalization starter) code point.
655      * @param b Another code point.
656      * @return The non-negative composite code point if there is one; otherwise a negative value.
657      * @stable ICU 49
658      */
659     virtual UChar32
660     composePair(UChar32 a, UChar32 b) const override;
661 
662     /**
663      * Gets the combining class of c. For details see the base class documentation.
664      * The default implementation returns 0
665      * but all standard implementations return the Unicode Canonical_Combining_Class value.
666      * @param c code point
667      * @return c's combining class, as an unsigned 8-bit value
668      * @stable ICU 49
669      */
670     virtual uint8_t
671     getCombiningClass(UChar32 c) const override;
672 
673     /**
674      * Tests if the string is normalized; unlike quickCheck(), the answer is a plain true/false.
675      * For details see the Normalizer2 base class documentation.
676      * @param s input string (for UTF-8 input see isNormalizedUTF8())
677      * @param errorCode Standard ICU error code. Its input value must
678      *                  pass the U_SUCCESS() test, or else the function returns
679      *                  immediately. Check for U_FAILURE() on output or use with
680      *                  function chaining. (See User Guide for details.)
681      * @return true if s is normalized, otherwise false
682      * @stable ICU 4.4
683      */
684     virtual UBool
685     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
686     /**
687      * Tests if the UTF-8 string is normalized.
688      * Internally, in cases where the quickCheck() method would return "maybe"
689      * (which is only possible for the two COMPOSE modes), this method
690      * resolves to "yes" or "no" to provide a definitive result,
691      * at the cost of doing more work in those cases.
692      *
693      * This works for all normalization modes.
694      * It is optimized for UTF-8 for all built-in modes except for FCD.
695      * The base class implementation converts to UTF-16 and calls isNormalized().
696      *
697      * @param s UTF-8 input string, passed by value as a StringPiece
698      * @param errorCode Standard ICU error code. Its input value must
699      *                  pass the U_SUCCESS() test, or else the function returns
700      *                  immediately. Check for U_FAILURE() on output or use with
701      *                  function chaining. (See User Guide for details.)
702      * @return true if s is normalized, otherwise false
703      * @stable ICU 60
704      */
705     virtual UBool
706     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
707     /**
708      * Performs a quick check of whether the string is normalized; the answer can be "maybe".
709      * For details see the Normalizer2 base class documentation.
710      * @param s input string
711      * @param errorCode Standard ICU error code. Its input value must
712      *                  pass the U_SUCCESS() test, or else the function returns
713      *                  immediately. Check for U_FAILURE() on output or use with
714      *                  function chaining. (See User Guide for details.)
715      * @return UNormalizationCheckResult: "yes", "no", or "maybe" (use isNormalized() for a definitive answer)
716      * @stable ICU 4.4
717      */
718     virtual UNormalizationCheckResult
719     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
720     /**
721      * Returns the end index of the leading substring of the input string that passes the quick check.
722      * For details see the Normalizer2 base class documentation.
723      * @param s input string
724      * @param errorCode Standard ICU error code. Its input value must
725      *                  pass the U_SUCCESS() test, or else the function returns
726      *                  immediately. Check for U_FAILURE() on output or use with
727      *                  function chaining. (See User Guide for details.)
728      * @return end index of the initial span of s whose quick check result is "yes"
729      * @stable ICU 4.4
730      */
731     virtual int32_t
732     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
733 
734     /**
735      * Tests if the character always has a normalization boundary before it,
736      * regardless of context.
737      * For details see the Normalizer2 base class documentation.
738      * @param c code point to test
739      * @return true if c has a normalization boundary before it, otherwise false
740      * @stable ICU 4.4
741      */
742     virtual UBool hasBoundaryBefore(UChar32 c) const override;
743 
744     /**
745      * Tests if the character always has a normalization boundary after it,
746      * regardless of context.
747      * For details see the Normalizer2 base class documentation.
748      * @param c code point to test
749      * @return true if c has a normalization boundary after it, otherwise false
750      * @stable ICU 4.4
751      */
752     virtual UBool hasBoundaryAfter(UChar32 c) const override;
753 
754     /**
755      * Tests if the character is normalization-inert.
756      * For details see the Normalizer2 base class documentation.
757      * @param c code point to test
758      * @return true if c is normalization-inert, otherwise false
759      * @stable ICU 4.4
760      */
761     virtual UBool isInert(UChar32 c) const override;
762 private:
763     UnicodeString &  // private, non-virtual helper; NOTE(review): presumably returns dest -- confirm in the implementation
764     normalize(const UnicodeString &src,
765               UnicodeString &dest,
766               USetSpanCondition spanCondition,  // NOTE(review): presumably how to span the filter set at the start of src -- confirm
767               UErrorCode &errorCode) const;
768 
769     void  // private, non-virtual UTF-8 helper; returns nothing. NOTE(review): output presumably goes to sink -- confirm
770     normalizeUTF8(uint32_t options, const char *src, int32_t length,
771                   ByteSink &sink, Edits *edits,  // NOTE(review): edits is a pointer, presumably optional (may be null) -- confirm
772                   USetSpanCondition spanCondition,  // NOTE(review): presumably the initial span condition for the filter set -- confirm
773                   UErrorCode &errorCode) const;
774 
775     UnicodeString &  // private overload of the public normalizeSecondAndAppend(), with an extra doNormalize flag
776     normalizeSecondAndAppend(UnicodeString &first,
777                              const UnicodeString &second,
778                              UBool doNormalize,  // NOTE(review): presumably true for normalizeSecondAndAppend(), false for append() -- confirm
779                              UErrorCode &errorCode) const;
780 
781     const Normalizer2 &norm2;  // held by reference, not copied; NOTE(review): presumably the wrapped normalizer -- confirm
782     const UnicodeSet &set;  // held by reference, not copied; NOTE(review): presumably the filter deciding where norm2 applies -- confirm
783 };
784 
785 U_NAMESPACE_END
786 
787 #endif  // !UCONFIG_NO_NORMALIZATION
788 
789 #endif /* U_SHOW_CPLUSPLUS_API */
790 
791 #endif  // __NORMALIZER2_H__
792