1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2013, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __NORMALIZER2_H__ 20 #define __NORMALIZER2_H__ 21 22 /** 23 * \file 24 * \brief C++ API: New API for Unicode Normalization. 25 */ 26 27 #include "unicode/utypes.h" 28 29 #if U_SHOW_CPLUSPLUS_API 30 31 #if !UCONFIG_NO_NORMALIZATION 32 33 #include "unicode/stringpiece.h" 34 #include "unicode/uniset.h" 35 #include "unicode/unistr.h" 36 #include "unicode/unorm2.h" 37 38 U_NAMESPACE_BEGIN 39 40 class ByteSink; 41 42 /** 43 * Unicode normalization functionality for standard Unicode normalization or 44 * for using custom mapping tables. 45 * All instances of this class are unmodifiable/immutable. 46 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 47 * The Normalizer2 class is not intended for public subclassing. 48 * 49 * The primary functions are to produce a normalized string and to detect whether 50 * a string is already normalized. 51 * The most commonly used normalization forms are those defined in 52 * http://www.unicode.org/unicode/reports/tr15/ 53 * However, this API supports additional normalization forms for specialized purposes. 54 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 55 * and can be used in implementations of UTS #46. 56 * 57 * Not only are the standard compose and decompose modes supplied, 58 * but additional modes are provided as documented in the Mode enum. 59 * 60 * Some of the functions in this class identify normalization boundaries. 61 * At a normalization boundary, the portions of the string 62 * before it and starting from it do not interact and can be handled independently. 63 * 64 * The spanQuickCheckYes() stops at a normalization boundary. 65 * When the goal is a normalized string, then the text before the boundary 66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 67 * 68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 69 * a character is guaranteed to be at a normalization boundary, 70 * regardless of context. 71 * This is used for moving from one normalization boundary to the next 72 * or preceding boundary, and for performing iterative normalization. 73 * 74 * Iterative normalization is useful when only a small portion of a 75 * longer string needs to be processed. 76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 78 * (to process only the substring for which sort key bytes are computed). 79 * 80 * The set of normalization boundaries returned by these functions may not be 81 * complete: There may be more boundaries that could be returned. 82 * Different functions may return different boundaries. 83 * @stable ICU 4.4 84 */ 85 class U_COMMON_API Normalizer2 : public UObject { 86 public: 87 /** 88 * Destructor. 89 * @stable ICU 4.4 90 */ 91 ~Normalizer2(); 92 93 /** 94 * Returns a Normalizer2 instance for Unicode NFC normalization. 95 * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode). 96 * Returns an unmodifiable singleton instance. Do not delete it. 97 * @param errorCode Standard ICU error code. Its input value must 98 * pass the U_SUCCESS() test, or else the function returns 99 * immediately. Check for U_FAILURE() on output or use with 100 * function chaining. (See User Guide for details.) 101 * @return the requested Normalizer2, if successful 102 * @stable ICU 49 103 */ 104 static const Normalizer2 * 105 getNFCInstance(UErrorCode &errorCode); 106 107 /** 108 * Returns a Normalizer2 instance for Unicode NFD normalization. 109 * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode). 110 * Returns an unmodifiable singleton instance. Do not delete it. 111 * @param errorCode Standard ICU error code. Its input value must 112 * pass the U_SUCCESS() test, or else the function returns 113 * immediately. Check for U_FAILURE() on output or use with 114 * function chaining. (See User Guide for details.) 115 * @return the requested Normalizer2, if successful 116 * @stable ICU 49 117 */ 118 static const Normalizer2 * 119 getNFDInstance(UErrorCode &errorCode); 120 121 /** 122 * Returns a Normalizer2 instance for Unicode NFKC normalization. 123 * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode). 124 * Returns an unmodifiable singleton instance. Do not delete it. 125 * @param errorCode Standard ICU error code. Its input value must 126 * pass the U_SUCCESS() test, or else the function returns 127 * immediately. Check for U_FAILURE() on output or use with 128 * function chaining. (See User Guide for details.) 129 * @return the requested Normalizer2, if successful 130 * @stable ICU 49 131 */ 132 static const Normalizer2 * 133 getNFKCInstance(UErrorCode &errorCode); 134 135 /** 136 * Returns a Normalizer2 instance for Unicode NFKD normalization. 137 * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode). 138 * Returns an unmodifiable singleton instance. Do not delete it. 139 * @param errorCode Standard ICU error code. Its input value must 140 * pass the U_SUCCESS() test, or else the function returns 141 * immediately. Check for U_FAILURE() on output or use with 142 * function chaining. (See User Guide for details.) 143 * @return the requested Normalizer2, if successful 144 * @stable ICU 49 145 */ 146 static const Normalizer2 * 147 getNFKDInstance(UErrorCode &errorCode); 148 149 /** 150 * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization 151 * which is equivalent to applying the NFKC_Casefold mappings and then NFC. 152 * See https://www.unicode.org/reports/tr44/#NFKC_Casefold 153 * 154 * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode). 155 * Returns an unmodifiable singleton instance. Do not delete it. 156 * @param errorCode Standard ICU error code. Its input value must 157 * pass the U_SUCCESS() test, or else the function returns 158 * immediately. Check for U_FAILURE() on output or use with 159 * function chaining. (See User Guide for details.) 160 * @return the requested Normalizer2, if successful 161 * @stable ICU 49 162 */ 163 static const Normalizer2 * 164 getNFKCCasefoldInstance(UErrorCode &errorCode); 165 166 /** 167 * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization 168 * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC. 169 * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold 170 * 171 * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode). 172 * Returns an unmodifiable singleton instance. Do not delete it. 173 * @param errorCode Standard ICU error code. Its input value must 174 * pass the U_SUCCESS() test, or else the function returns 175 * immediately. Check for U_FAILURE() on output or use with 176 * function chaining. (See User Guide for details.) 177 * @return the requested Normalizer2, if successful 178 * @stable ICU 74 179 */ 180 static const Normalizer2 * 181 getNFKCSimpleCasefoldInstance(UErrorCode &errorCode); 182 183 /** 184 * Returns a Normalizer2 instance which uses the specified data file 185 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 186 * and which composes or decomposes text according to the specified mode. 187 * Returns an unmodifiable singleton instance. Do not delete it. 188 * 189 * Use packageName=nullptr for data files that are part of ICU's own data. 190 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 191 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 192 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 193 * 194 * @param packageName nullptr for ICU built-in data, otherwise application data package name 195 * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file 196 * @param mode normalization mode (compose or decompose etc.) 197 * @param errorCode Standard ICU error code. Its input value must 198 * pass the U_SUCCESS() test, or else the function returns 199 * immediately. Check for U_FAILURE() on output or use with 200 * function chaining. (See User Guide for details.) 201 * @return the requested Normalizer2, if successful 202 * @stable ICU 4.4 203 */ 204 static const Normalizer2 * 205 getInstance(const char *packageName, 206 const char *name, 207 UNormalization2Mode mode, 208 UErrorCode &errorCode); 209 210 /** 211 * Returns the normalized form of the source string. 212 * @param src source string 213 * @param errorCode Standard ICU error code. Its input value must 214 * pass the U_SUCCESS() test, or else the function returns 215 * immediately. Check for U_FAILURE() on output or use with 216 * function chaining. (See User Guide for details.) 217 * @return normalized src 218 * @stable ICU 4.4 219 */ 220 UnicodeString normalize(const UnicodeString & src,UErrorCode & errorCode)221 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 222 UnicodeString result; 223 normalize(src, result, errorCode); 224 return result; 225 } 226 /** 227 * Writes the normalized form of the source string to the destination string 228 * (replacing its contents) and returns the destination string. 229 * The source and destination strings must be different objects. 230 * @param src source string 231 * @param dest destination string; its contents is replaced with normalized src 232 * @param errorCode Standard ICU error code. Its input value must 233 * pass the U_SUCCESS() test, or else the function returns 234 * immediately. Check for U_FAILURE() on output or use with 235 * function chaining. (See User Guide for details.) 236 * @return dest 237 * @stable ICU 4.4 238 */ 239 virtual UnicodeString & 240 normalize(const UnicodeString &src, 241 UnicodeString &dest, 242 UErrorCode &errorCode) const = 0; 243 244 /** 245 * Normalizes a UTF-8 string and optionally records how source substrings 246 * relate to changed and unchanged result substrings. 247 * 248 * Implemented completely for all built-in modes except for FCD. 249 * The base class implementation converts to & from UTF-16 and does not support edits. 250 * 251 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 252 * @param src Source UTF-8 string. 253 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 254 * sink.Flush() is called at the end. 255 * @param edits Records edits for index mapping, working with styled text, 256 * and getting only changes (if any). 257 * The Edits contents is undefined if any error occurs. 258 * This function calls edits->reset() first unless 259 * options includes U_EDITS_NO_RESET. edits can be nullptr. 260 * @param errorCode Standard ICU error code. Its input value must 261 * pass the U_SUCCESS() test, or else the function returns 262 * immediately. Check for U_FAILURE() on output or use with 263 * function chaining. (See User Guide for details.) 264 * @stable ICU 60 265 */ 266 virtual void 267 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 268 Edits *edits, UErrorCode &errorCode) const; 269 270 /** 271 * Appends the normalized form of the second string to the first string 272 * (merging them at the boundary) and returns the first string. 273 * The result is normalized if the first string was normalized. 274 * The first and second strings must be different objects. 275 * @param first string, should be normalized 276 * @param second string, will be normalized 277 * @param errorCode Standard ICU error code. Its input value must 278 * pass the U_SUCCESS() test, or else the function returns 279 * immediately. Check for U_FAILURE() on output or use with 280 * function chaining. (See User Guide for details.) 281 * @return first 282 * @stable ICU 4.4 283 */ 284 virtual UnicodeString & 285 normalizeSecondAndAppend(UnicodeString &first, 286 const UnicodeString &second, 287 UErrorCode &errorCode) const = 0; 288 /** 289 * Appends the second string to the first string 290 * (merging them at the boundary) and returns the first string. 291 * The result is normalized if both the strings were normalized. 292 * The first and second strings must be different objects. 293 * @param first string, should be normalized 294 * @param second string, should be normalized 295 * @param errorCode Standard ICU error code. Its input value must 296 * pass the U_SUCCESS() test, or else the function returns 297 * immediately. Check for U_FAILURE() on output or use with 298 * function chaining. (See User Guide for details.) 299 * @return first 300 * @stable ICU 4.4 301 */ 302 virtual UnicodeString & 303 append(UnicodeString &first, 304 const UnicodeString &second, 305 UErrorCode &errorCode) const = 0; 306 307 /** 308 * Gets the decomposition mapping of c. 309 * Roughly equivalent to normalizing the String form of c 310 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 311 * returns false and does not write a string 312 * if c does not have a decomposition mapping in this instance's data. 313 * This function is independent of the mode of the Normalizer2. 314 * @param c code point 315 * @param decomposition String object which will be set to c's 316 * decomposition mapping, if there is one. 317 * @return true if c has a decomposition, otherwise false 318 * @stable ICU 4.6 319 */ 320 virtual UBool 321 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 322 323 /** 324 * Gets the raw decomposition mapping of c. 325 * 326 * This is similar to the getDecomposition() method but returns the 327 * raw decomposition mapping as specified in UnicodeData.txt or 328 * (for custom data) in the mapping files processed by the gennorm2 tool. 329 * By contrast, getDecomposition() returns the processed, 330 * recursively-decomposed version of this mapping. 331 * 332 * When used on a standard NFKC Normalizer2 instance, 333 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 334 * 335 * When used on a standard NFC Normalizer2 instance, 336 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 337 * in this case, the result contains either one or two code points (=1..4 char16_ts). 338 * 339 * This function is independent of the mode of the Normalizer2. 340 * The default implementation returns false. 341 * @param c code point 342 * @param decomposition String object which will be set to c's 343 * raw decomposition mapping, if there is one. 344 * @return true if c has a decomposition, otherwise false 345 * @stable ICU 49 346 */ 347 virtual UBool 348 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; 349 350 /** 351 * Performs pairwise composition of a & b and returns the composite if there is one. 352 * 353 * Returns a composite code point c only if c has a two-way mapping to a+b. 354 * In standard Unicode normalization, this means that 355 * c has a canonical decomposition to a+b 356 * and c does not have the Full_Composition_Exclusion property. 357 * 358 * This function is independent of the mode of the Normalizer2. 359 * The default implementation returns a negative value. 360 * @param a A (normalization starter) code point. 361 * @param b Another code point. 362 * @return The non-negative composite code point if there is one; otherwise a negative value. 363 * @stable ICU 49 364 */ 365 virtual UChar32 366 composePair(UChar32 a, UChar32 b) const; 367 368 /** 369 * Gets the combining class of c. 370 * The default implementation returns 0 371 * but all standard implementations return the Unicode Canonical_Combining_Class value. 372 * @param c code point 373 * @return c's combining class 374 * @stable ICU 49 375 */ 376 virtual uint8_t 377 getCombiningClass(UChar32 c) const; 378 379 /** 380 * Tests if the string is normalized. 381 * Internally, in cases where the quickCheck() method would return "maybe" 382 * (which is only possible for the two COMPOSE modes) this method 383 * resolves to "yes" or "no" to provide a definitive result, 384 * at the cost of doing more work in those cases. 385 * @param s input string 386 * @param errorCode Standard ICU error code. Its input value must 387 * pass the U_SUCCESS() test, or else the function returns 388 * immediately. Check for U_FAILURE() on output or use with 389 * function chaining. (See User Guide for details.) 390 * @return true if s is normalized 391 * @stable ICU 4.4 392 */ 393 virtual UBool 394 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 395 /** 396 * Tests if the UTF-8 string is normalized. 397 * Internally, in cases where the quickCheck() method would return "maybe" 398 * (which is only possible for the two COMPOSE modes) this method 399 * resolves to "yes" or "no" to provide a definitive result, 400 * at the cost of doing more work in those cases. 401 * 402 * This works for all normalization modes. 403 * It is optimized for UTF-8 for all built-in modes except for FCD. 404 * The base class implementation converts to UTF-16 and calls isNormalized(). 405 * 406 * @param s UTF-8 input string 407 * @param errorCode Standard ICU error code. Its input value must 408 * pass the U_SUCCESS() test, or else the function returns 409 * immediately. Check for U_FAILURE() on output or use with 410 * function chaining. (See User Guide for details.) 411 * @return true if s is normalized 412 * @stable ICU 60 413 */ 414 virtual UBool 415 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; 416 417 418 /** 419 * Tests if the string is normalized. 420 * For the two COMPOSE modes, the result could be "maybe" in cases that 421 * would take a little more work to resolve definitively. 422 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 423 * combination of quick check + normalization, to avoid 424 * re-checking the "yes" prefix. 425 * @param s input string 426 * @param errorCode Standard ICU error code. Its input value must 427 * pass the U_SUCCESS() test, or else the function returns 428 * immediately. Check for U_FAILURE() on output or use with 429 * function chaining. (See User Guide for details.) 430 * @return UNormalizationCheckResult 431 * @stable ICU 4.4 432 */ 433 virtual UNormalizationCheckResult 434 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 435 436 /** 437 * Returns the end of the normalized substring of the input string. 438 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 439 * the substring <code>UnicodeString(s, 0, end)</code> 440 * will pass the quick check with a "yes" result. 441 * 442 * The returned end index is usually one or more characters before the 443 * "no" or "maybe" character: The end index is at a normalization boundary. 444 * (See the class documentation for more about normalization boundaries.) 445 * 446 * When the goal is a normalized string and most input strings are expected 447 * to be normalized already, then call this method, 448 * and if it returns a prefix shorter than the input string, 449 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 450 * @param s input string 451 * @param errorCode Standard ICU error code. Its input value must 452 * pass the U_SUCCESS() test, or else the function returns 453 * immediately. Check for U_FAILURE() on output or use with 454 * function chaining. (See User Guide for details.) 455 * @return "yes" span end index 456 * @stable ICU 4.4 457 */ 458 virtual int32_t 459 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 460 461 /** 462 * Tests if the character always has a normalization boundary before it, 463 * regardless of context. 464 * If true, then the character does not normalization-interact with 465 * preceding characters. 466 * In other words, a string containing this character can be normalized 467 * by processing portions before this character and starting from this 468 * character independently. 469 * This is used for iterative normalization. See the class documentation for details. 470 * @param c character to test 471 * @return true if c has a normalization boundary before it 472 * @stable ICU 4.4 473 */ 474 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 475 476 /** 477 * Tests if the character always has a normalization boundary after it, 478 * regardless of context. 479 * If true, then the character does not normalization-interact with 480 * following characters. 481 * In other words, a string containing this character can be normalized 482 * by processing portions up to this character and after this 483 * character independently. 484 * This is used for iterative normalization. See the class documentation for details. 485 * Note that this operation may be significantly slower than hasBoundaryBefore(). 486 * @param c character to test 487 * @return true if c has a normalization boundary after it 488 * @stable ICU 4.4 489 */ 490 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 491 492 /** 493 * Tests if the character is normalization-inert. 494 * If true, then the character does not change, nor normalization-interact with 495 * preceding or following characters. 496 * In other words, a string containing this character can be normalized 497 * by processing portions before this character and after this 498 * character independently. 499 * This is used for iterative normalization. See the class documentation for details. 500 * Note that this operation may be significantly slower than hasBoundaryBefore(). 501 * @param c character to test 502 * @return true if c is normalization-inert 503 * @stable ICU 4.4 504 */ 505 virtual UBool isInert(UChar32 c) const = 0; 506 }; 507 508 /** 509 * Normalization filtered by a UnicodeSet. 510 * Normalizes portions of the text contained in the filter set and leaves 511 * portions not contained in the filter set unchanged. 512 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 513 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 514 * This class implements all of (and only) the Normalizer2 API. 515 * An instance of this class is unmodifiable/immutable but is constructed and 516 * must be destructed by the owner. 517 * @stable ICU 4.4 518 */ 519 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 520 public: 521 /** 522 * Constructs a filtered normalizer wrapping any Normalizer2 instance 523 * and a filter set. 524 * Both are aliased and must not be modified or deleted while this object 525 * is used. 526 * The filter set should be frozen; otherwise the performance will suffer greatly. 527 * @param n2 wrapped Normalizer2 instance 528 * @param filterSet UnicodeSet which determines the characters to be normalized 529 * @stable ICU 4.4 530 */ FilteredNormalizer2(const Normalizer2 & n2,const UnicodeSet & filterSet)531 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 532 norm2(n2), set(filterSet) {} 533 534 /** 535 * Destructor. 536 * @stable ICU 4.4 537 */ 538 ~FilteredNormalizer2(); 539 540 /** 541 * Writes the normalized form of the source string to the destination string 542 * (replacing its contents) and returns the destination string. 543 * The source and destination strings must be different objects. 544 * @param src source string 545 * @param dest destination string; its contents is replaced with normalized src 546 * @param errorCode Standard ICU error code. Its input value must 547 * pass the U_SUCCESS() test, or else the function returns 548 * immediately. Check for U_FAILURE() on output or use with 549 * function chaining. (See User Guide for details.) 550 * @return dest 551 * @stable ICU 4.4 552 */ 553 virtual UnicodeString & 554 normalize(const UnicodeString &src, 555 UnicodeString &dest, 556 UErrorCode &errorCode) const override; 557 558 /** 559 * Normalizes a UTF-8 string and optionally records how source substrings 560 * relate to changed and unchanged result substrings. 561 * 562 * Implemented completely for most built-in modes except for FCD. 563 * The base class implementation converts to & from UTF-16 and does not support edits. 564 * 565 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 566 * @param src Source UTF-8 string. 567 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 568 * sink.Flush() is called at the end. 569 * @param edits Records edits for index mapping, working with styled text, 570 * and getting only changes (if any). 571 * The Edits contents is undefined if any error occurs. 572 * This function calls edits->reset() first unless 573 * options includes U_EDITS_NO_RESET. edits can be nullptr. 574 * @param errorCode Standard ICU error code. Its input value must 575 * pass the U_SUCCESS() test, or else the function returns 576 * immediately. Check for U_FAILURE() on output or use with 577 * function chaining. (See User Guide for details.) 578 * @stable ICU 60 579 */ 580 virtual void 581 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 582 Edits *edits, UErrorCode &errorCode) const override; 583 584 /** 585 * Appends the normalized form of the second string to the first string 586 * (merging them at the boundary) and returns the first string. 587 * The result is normalized if the first string was normalized. 588 * The first and second strings must be different objects. 589 * @param first string, should be normalized 590 * @param second string, will be normalized 591 * @param errorCode Standard ICU error code. Its input value must 592 * pass the U_SUCCESS() test, or else the function returns 593 * immediately. Check for U_FAILURE() on output or use with 594 * function chaining. (See User Guide for details.) 595 * @return first 596 * @stable ICU 4.4 597 */ 598 virtual UnicodeString & 599 normalizeSecondAndAppend(UnicodeString &first, 600 const UnicodeString &second, 601 UErrorCode &errorCode) const override; 602 /** 603 * Appends the second string to the first string 604 * (merging them at the boundary) and returns the first string. 605 * The result is normalized if both the strings were normalized. 606 * The first and second strings must be different objects. 607 * @param first string, should be normalized 608 * @param second string, should be normalized 609 * @param errorCode Standard ICU error code. Its input value must 610 * pass the U_SUCCESS() test, or else the function returns 611 * immediately. Check for U_FAILURE() on output or use with 612 * function chaining. (See User Guide for details.) 613 * @return first 614 * @stable ICU 4.4 615 */ 616 virtual UnicodeString & 617 append(UnicodeString &first, 618 const UnicodeString &second, 619 UErrorCode &errorCode) const override; 620 621 /** 622 * Gets the decomposition mapping of c. 623 * For details see the base class documentation. 624 * 625 * This function is independent of the mode of the Normalizer2. 626 * @param c code point 627 * @param decomposition String object which will be set to c's 628 * decomposition mapping, if there is one. 629 * @return true if c has a decomposition, otherwise false 630 * @stable ICU 4.6 631 */ 632 virtual UBool 633 getDecomposition(UChar32 c, UnicodeString &decomposition) const override; 634 635 /** 636 * Gets the raw decomposition mapping of c. 637 * For details see the base class documentation. 638 * 639 * This function is independent of the mode of the Normalizer2. 640 * @param c code point 641 * @param decomposition String object which will be set to c's 642 * raw decomposition mapping, if there is one. 643 * @return true if c has a decomposition, otherwise false 644 * @stable ICU 49 645 */ 646 virtual UBool 647 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override; 648 649 /** 650 * Performs pairwise composition of a & b and returns the composite if there is one. 651 * For details see the base class documentation. 652 * 653 * This function is independent of the mode of the Normalizer2. 654 * @param a A (normalization starter) code point. 655 * @param b Another code point. 656 * @return The non-negative composite code point if there is one; otherwise a negative value. 657 * @stable ICU 49 658 */ 659 virtual UChar32 660 composePair(UChar32 a, UChar32 b) const override; 661 662 /** 663 * Gets the combining class of c. 664 * The default implementation returns 0 665 * but all standard implementations return the Unicode Canonical_Combining_Class value. 666 * @param c code point 667 * @return c's combining class 668 * @stable ICU 49 669 */ 670 virtual uint8_t 671 getCombiningClass(UChar32 c) const override; 672 673 /** 674 * Tests if the string is normalized. 675 * For details see the Normalizer2 base class documentation. 676 * @param s input string 677 * @param errorCode Standard ICU error code. Its input value must 678 * pass the U_SUCCESS() test, or else the function returns 679 * immediately. Check for U_FAILURE() on output or use with 680 * function chaining. (See User Guide for details.) 681 * @return true if s is normalized 682 * @stable ICU 4.4 683 */ 684 virtual UBool 685 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override; 686 /** 687 * Tests if the UTF-8 string is normalized. 688 * Internally, in cases where the quickCheck() method would return "maybe" 689 * (which is only possible for the two COMPOSE modes) this method 690 * resolves to "yes" or "no" to provide a definitive result, 691 * at the cost of doing more work in those cases. 692 * 693 * This works for all normalization modes. 694 * It is optimized for UTF-8 for all built-in modes except for FCD. 695 * The base class implementation converts to UTF-16 and calls isNormalized(). 696 * 697 * @param s UTF-8 input string 698 * @param errorCode Standard ICU error code. Its input value must 699 * pass the U_SUCCESS() test, or else the function returns 700 * immediately. Check for U_FAILURE() on output or use with 701 * function chaining. (See User Guide for details.) 702 * @return true if s is normalized 703 * @stable ICU 60 704 */ 705 virtual UBool 706 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override; 707 /** 708 * Tests if the string is normalized. 709 * For details see the Normalizer2 base class documentation. 710 * @param s input string 711 * @param errorCode Standard ICU error code. Its input value must 712 * pass the U_SUCCESS() test, or else the function returns 713 * immediately. Check for U_FAILURE() on output or use with 714 * function chaining. (See User Guide for details.) 715 * @return UNormalizationCheckResult 716 * @stable ICU 4.4 717 */ 718 virtual UNormalizationCheckResult 719 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override; 720 /** 721 * Returns the end of the normalized substring of the input string. 722 * For details see the Normalizer2 base class documentation. 723 * @param s input string 724 * @param errorCode Standard ICU error code. Its input value must 725 * pass the U_SUCCESS() test, or else the function returns 726 * immediately. Check for U_FAILURE() on output or use with 727 * function chaining. (See User Guide for details.) 728 * @return "yes" span end index 729 * @stable ICU 4.4 730 */ 731 virtual int32_t 732 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override; 733 734 /** 735 * Tests if the character always has a normalization boundary before it, 736 * regardless of context. 737 * For details see the Normalizer2 base class documentation. 738 * @param c character to test 739 * @return true if c has a normalization boundary before it 740 * @stable ICU 4.4 741 */ 742 virtual UBool hasBoundaryBefore(UChar32 c) const override; 743 744 /** 745 * Tests if the character always has a normalization boundary after it, 746 * regardless of context. 747 * For details see the Normalizer2 base class documentation. 748 * @param c character to test 749 * @return true if c has a normalization boundary after it 750 * @stable ICU 4.4 751 */ 752 virtual UBool hasBoundaryAfter(UChar32 c) const override; 753 754 /** 755 * Tests if the character is normalization-inert. 756 * For details see the Normalizer2 base class documentation. 757 * @param c character to test 758 * @return true if c is normalization-inert 759 * @stable ICU 4.4 760 */ 761 virtual UBool isInert(UChar32 c) const override; 762 private: 763 UnicodeString & 764 normalize(const UnicodeString &src, 765 UnicodeString &dest, 766 USetSpanCondition spanCondition, 767 UErrorCode &errorCode) const; 768 769 void 770 normalizeUTF8(uint32_t options, const char *src, int32_t length, 771 ByteSink &sink, Edits *edits, 772 USetSpanCondition spanCondition, 773 UErrorCode &errorCode) const; 774 775 UnicodeString & 776 normalizeSecondAndAppend(UnicodeString &first, 777 const UnicodeString &second, 778 UBool doNormalize, 779 UErrorCode &errorCode) const; 780 781 const Normalizer2 &norm2; 782 const UnicodeSet &set; 783 }; 784 785 U_NAMESPACE_END 786 787 #endif // !UCONFIG_NO_NORMALIZATION 788 789 #endif /* U_SHOW_CPLUSPLUS_API */ 790 791 #endif // __NORMALIZER2_H__ 792