1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2013, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __NORMALIZER2_H__ 20 #define __NORMALIZER2_H__ 21 22 /** 23 * \file 24 * \brief C++ API: New API for Unicode Normalization. 25 */ 26 27 #include "unicode/utypes.h" 28 29 #if U_SHOW_CPLUSPLUS_API 30 31 #if !UCONFIG_NO_NORMALIZATION 32 33 #include "unicode/stringpiece.h" 34 #include "unicode/uniset.h" 35 #include "unicode/unistr.h" 36 #include "unicode/unorm2.h" 37 38 U_NAMESPACE_BEGIN 39 40 class ByteSink; 41 42 /** 43 * Unicode normalization functionality for standard Unicode normalization or 44 * for using custom mapping tables. 45 * All instances of this class are unmodifiable/immutable. 46 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 47 * The Normalizer2 class is not intended for public subclassing. 48 * 49 * The primary functions are to produce a normalized string and to detect whether 50 * a string is already normalized. 51 * The most commonly used normalization forms are those defined in 52 * http://www.unicode.org/unicode/reports/tr15/ 53 * However, this API supports additional normalization forms for specialized purposes. 54 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 55 * and can be used in implementations of UTS #46. 56 * 57 * Not only are the standard compose and decompose modes supplied, 58 * but additional modes are provided as documented in the Mode enum. 59 * 60 * Some of the functions in this class identify normalization boundaries. 61 * At a normalization boundary, the portions of the string 62 * before it and starting from it do not interact and can be handled independently. 63 * 64 * The spanQuickCheckYes() stops at a normalization boundary. 65 * When the goal is a normalized string, then the text before the boundary 66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 67 * 68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 69 * a character is guaranteed to be at a normalization boundary, 70 * regardless of context. 71 * This is used for moving from one normalization boundary to the next 72 * or preceding boundary, and for performing iterative normalization. 73 * 74 * Iterative normalization is useful when only a small portion of a 75 * longer string needs to be processed. 76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 78 * (to process only the substring for which sort key bytes are computed). 79 * 80 * The set of normalization boundaries returned by these functions may not be 81 * complete: There may be more boundaries that could be returned. 82 * Different functions may return different boundaries. 83 * @stable ICU 4.4 84 */ 85 class U_COMMON_API Normalizer2 : public UObject { 86 public: 87 /** 88 * Destructor. 89 * @stable ICU 4.4 90 */ 91 ~Normalizer2(); 92 93 /** 94 * Returns a Normalizer2 instance for Unicode NFC normalization. 95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). 96 * Returns an unmodifiable singleton instance. Do not delete it. 97 * @param errorCode Standard ICU error code. Its input value must 98 * pass the U_SUCCESS() test, or else the function returns 99 * immediately. Check for U_FAILURE() on output or use with 100 * function chaining. (See User Guide for details.) 101 * @return the requested Normalizer2, if successful 102 * @stable ICU 49 103 */ 104 static const Normalizer2 * 105 getNFCInstance(UErrorCode &errorCode); 106 107 /** 108 * Returns a Normalizer2 instance for Unicode NFD normalization. 109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode). 110 * Returns an unmodifiable singleton instance. Do not delete it. 111 * @param errorCode Standard ICU error code. Its input value must 112 * pass the U_SUCCESS() test, or else the function returns 113 * immediately. Check for U_FAILURE() on output or use with 114 * function chaining. (See User Guide for details.) 115 * @return the requested Normalizer2, if successful 116 * @stable ICU 49 117 */ 118 static const Normalizer2 * 119 getNFDInstance(UErrorCode &errorCode); 120 121 /** 122 * Returns a Normalizer2 instance for Unicode NFKC normalization. 123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). 124 * Returns an unmodifiable singleton instance. Do not delete it. 125 * @param errorCode Standard ICU error code. Its input value must 126 * pass the U_SUCCESS() test, or else the function returns 127 * immediately. Check for U_FAILURE() on output or use with 128 * function chaining. (See User Guide for details.) 129 * @return the requested Normalizer2, if successful 130 * @stable ICU 49 131 */ 132 static const Normalizer2 * 133 getNFKCInstance(UErrorCode &errorCode); 134 135 /** 136 * Returns a Normalizer2 instance for Unicode NFKD normalization. 137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode). 138 * Returns an unmodifiable singleton instance. Do not delete it. 139 * @param errorCode Standard ICU error code. Its input value must 140 * pass the U_SUCCESS() test, or else the function returns 141 * immediately. Check for U_FAILURE() on output or use with 142 * function chaining. (See User Guide for details.) 143 * @return the requested Normalizer2, if successful 144 * @stable ICU 49 145 */ 146 static const Normalizer2 * 147 getNFKDInstance(UErrorCode &errorCode); 148 149 /** 150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. 151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). 152 * Returns an unmodifiable singleton instance. Do not delete it. 153 * @param errorCode Standard ICU error code. Its input value must 154 * pass the U_SUCCESS() test, or else the function returns 155 * immediately. Check for U_FAILURE() on output or use with 156 * function chaining. (See User Guide for details.) 157 * @return the requested Normalizer2, if successful 158 * @stable ICU 49 159 */ 160 static const Normalizer2 * 161 getNFKCCasefoldInstance(UErrorCode &errorCode); 162 163 /** 164 * Returns a Normalizer2 instance which uses the specified data file 165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 166 * and which composes or decomposes text according to the specified mode. 167 * Returns an unmodifiable singleton instance. Do not delete it. 168 * 169 * Use packageName=NULL for data files that are part of ICU's own data. 170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 173 * 174 * @param packageName NULL for ICU built-in data, otherwise application data package name 175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file 176 * @param mode normalization mode (compose or decompose etc.) 177 * @param errorCode Standard ICU error code. Its input value must 178 * pass the U_SUCCESS() test, or else the function returns 179 * immediately. Check for U_FAILURE() on output or use with 180 * function chaining. (See User Guide for details.) 181 * @return the requested Normalizer2, if successful 182 * @stable ICU 4.4 183 */ 184 static const Normalizer2 * 185 getInstance(const char *packageName, 186 const char *name, 187 UNormalization2Mode mode, 188 UErrorCode &errorCode); 189 190 /** 191 * Returns the normalized form of the source string. 192 * @param src source string 193 * @param errorCode Standard ICU error code. Its input value must 194 * pass the U_SUCCESS() test, or else the function returns 195 * immediately. Check for U_FAILURE() on output or use with 196 * function chaining. (See User Guide for details.) 197 * @return normalized src 198 * @stable ICU 4.4 199 */ 200 UnicodeString normalize(const UnicodeString & src,UErrorCode & errorCode)201 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 202 UnicodeString result; 203 normalize(src, result, errorCode); 204 return result; 205 } 206 /** 207 * Writes the normalized form of the source string to the destination string 208 * (replacing its contents) and returns the destination string. 209 * The source and destination strings must be different objects. 210 * @param src source string 211 * @param dest destination string; its contents is replaced with normalized src 212 * @param errorCode Standard ICU error code. Its input value must 213 * pass the U_SUCCESS() test, or else the function returns 214 * immediately. Check for U_FAILURE() on output or use with 215 * function chaining. (See User Guide for details.) 216 * @return dest 217 * @stable ICU 4.4 218 */ 219 virtual UnicodeString & 220 normalize(const UnicodeString &src, 221 UnicodeString &dest, 222 UErrorCode &errorCode) const = 0; 223 224 /** 225 * Normalizes a UTF-8 string and optionally records how source substrings 226 * relate to changed and unchanged result substrings. 227 * 228 * Currently implemented completely only for "compose" modes, 229 * such as for NFC, NFKC, and NFKC_Casefold 230 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 231 * Otherwise currently converts to & from UTF-16 and does not support edits. 232 * 233 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 234 * @param src Source UTF-8 string. 235 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 236 * sink.Flush() is called at the end. 237 * @param edits Records edits for index mapping, working with styled text, 238 * and getting only changes (if any). 239 * The Edits contents is undefined if any error occurs. 240 * This function calls edits->reset() first unless 241 * options includes U_EDITS_NO_RESET. edits can be nullptr. 242 * @param errorCode Standard ICU error code. Its input value must 243 * pass the U_SUCCESS() test, or else the function returns 244 * immediately. Check for U_FAILURE() on output or use with 245 * function chaining. (See User Guide for details.) 246 * @stable ICU 60 247 */ 248 virtual void 249 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 250 Edits *edits, UErrorCode &errorCode) const; 251 252 /** 253 * Appends the normalized form of the second string to the first string 254 * (merging them at the boundary) and returns the first string. 255 * The result is normalized if the first string was normalized. 256 * The first and second strings must be different objects. 257 * @param first string, should be normalized 258 * @param second string, will be normalized 259 * @param errorCode Standard ICU error code. Its input value must 260 * pass the U_SUCCESS() test, or else the function returns 261 * immediately. Check for U_FAILURE() on output or use with 262 * function chaining. (See User Guide for details.) 263 * @return first 264 * @stable ICU 4.4 265 */ 266 virtual UnicodeString & 267 normalizeSecondAndAppend(UnicodeString &first, 268 const UnicodeString &second, 269 UErrorCode &errorCode) const = 0; 270 /** 271 * Appends the second string to the first string 272 * (merging them at the boundary) and returns the first string. 273 * The result is normalized if both the strings were normalized. 274 * The first and second strings must be different objects. 275 * @param first string, should be normalized 276 * @param second string, should be normalized 277 * @param errorCode Standard ICU error code. Its input value must 278 * pass the U_SUCCESS() test, or else the function returns 279 * immediately. Check for U_FAILURE() on output or use with 280 * function chaining. (See User Guide for details.) 281 * @return first 282 * @stable ICU 4.4 283 */ 284 virtual UnicodeString & 285 append(UnicodeString &first, 286 const UnicodeString &second, 287 UErrorCode &errorCode) const = 0; 288 289 /** 290 * Gets the decomposition mapping of c. 291 * Roughly equivalent to normalizing the String form of c 292 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 293 * returns FALSE and does not write a string 294 * if c does not have a decomposition mapping in this instance's data. 295 * This function is independent of the mode of the Normalizer2. 296 * @param c code point 297 * @param decomposition String object which will be set to c's 298 * decomposition mapping, if there is one. 299 * @return TRUE if c has a decomposition, otherwise FALSE 300 * @stable ICU 4.6 301 */ 302 virtual UBool 303 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 304 305 /** 306 * Gets the raw decomposition mapping of c. 307 * 308 * This is similar to the getDecomposition() method but returns the 309 * raw decomposition mapping as specified in UnicodeData.txt or 310 * (for custom data) in the mapping files processed by the gennorm2 tool. 311 * By contrast, getDecomposition() returns the processed, 312 * recursively-decomposed version of this mapping. 313 * 314 * When used on a standard NFKC Normalizer2 instance, 315 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 316 * 317 * When used on a standard NFC Normalizer2 instance, 318 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 319 * in this case, the result contains either one or two code points (=1..4 char16_ts). 320 * 321 * This function is independent of the mode of the Normalizer2. 322 * The default implementation returns FALSE. 323 * @param c code point 324 * @param decomposition String object which will be set to c's 325 * raw decomposition mapping, if there is one. 326 * @return TRUE if c has a decomposition, otherwise FALSE 327 * @stable ICU 49 328 */ 329 virtual UBool 330 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; 331 332 /** 333 * Performs pairwise composition of a & b and returns the composite if there is one. 334 * 335 * Returns a composite code point c only if c has a two-way mapping to a+b. 336 * In standard Unicode normalization, this means that 337 * c has a canonical decomposition to a+b 338 * and c does not have the Full_Composition_Exclusion property. 339 * 340 * This function is independent of the mode of the Normalizer2. 341 * The default implementation returns a negative value. 342 * @param a A (normalization starter) code point. 343 * @param b Another code point. 344 * @return The non-negative composite code point if there is one; otherwise a negative value. 345 * @stable ICU 49 346 */ 347 virtual UChar32 348 composePair(UChar32 a, UChar32 b) const; 349 350 /** 351 * Gets the combining class of c. 352 * The default implementation returns 0 353 * but all standard implementations return the Unicode Canonical_Combining_Class value. 354 * @param c code point 355 * @return c's combining class 356 * @stable ICU 49 357 */ 358 virtual uint8_t 359 getCombiningClass(UChar32 c) const; 360 361 /** 362 * Tests if the string is normalized. 363 * Internally, in cases where the quickCheck() method would return "maybe" 364 * (which is only possible for the two COMPOSE modes) this method 365 * resolves to "yes" or "no" to provide a definitive result, 366 * at the cost of doing more work in those cases. 367 * @param s input string 368 * @param errorCode Standard ICU error code. Its input value must 369 * pass the U_SUCCESS() test, or else the function returns 370 * immediately. Check for U_FAILURE() on output or use with 371 * function chaining. (See User Guide for details.) 372 * @return TRUE if s is normalized 373 * @stable ICU 4.4 374 */ 375 virtual UBool 376 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 377 /** 378 * Tests if the UTF-8 string is normalized. 379 * Internally, in cases where the quickCheck() method would return "maybe" 380 * (which is only possible for the two COMPOSE modes) this method 381 * resolves to "yes" or "no" to provide a definitive result, 382 * at the cost of doing more work in those cases. 383 * 384 * This works for all normalization modes, 385 * but it is currently optimized for UTF-8 only for "compose" modes, 386 * such as for NFC, NFKC, and NFKC_Casefold 387 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 388 * For other modes it currently converts to UTF-16 and calls isNormalized(). 389 * 390 * @param s UTF-8 input string 391 * @param errorCode Standard ICU error code. Its input value must 392 * pass the U_SUCCESS() test, or else the function returns 393 * immediately. Check for U_FAILURE() on output or use with 394 * function chaining. (See User Guide for details.) 395 * @return TRUE if s is normalized 396 * @stable ICU 60 397 */ 398 virtual UBool 399 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; 400 401 402 /** 403 * Tests if the string is normalized. 404 * For the two COMPOSE modes, the result could be "maybe" in cases that 405 * would take a little more work to resolve definitively. 406 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 407 * combination of quick check + normalization, to avoid 408 * re-checking the "yes" prefix. 409 * @param s input string 410 * @param errorCode Standard ICU error code. Its input value must 411 * pass the U_SUCCESS() test, or else the function returns 412 * immediately. Check for U_FAILURE() on output or use with 413 * function chaining. (See User Guide for details.) 414 * @return UNormalizationCheckResult 415 * @stable ICU 4.4 416 */ 417 virtual UNormalizationCheckResult 418 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 419 420 /** 421 * Returns the end of the normalized substring of the input string. 422 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 423 * the substring <code>UnicodeString(s, 0, end)</code> 424 * will pass the quick check with a "yes" result. 425 * 426 * The returned end index is usually one or more characters before the 427 * "no" or "maybe" character: The end index is at a normalization boundary. 428 * (See the class documentation for more about normalization boundaries.) 429 * 430 * When the goal is a normalized string and most input strings are expected 431 * to be normalized already, then call this method, 432 * and if it returns a prefix shorter than the input string, 433 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 434 * @param s input string 435 * @param errorCode Standard ICU error code. Its input value must 436 * pass the U_SUCCESS() test, or else the function returns 437 * immediately. Check for U_FAILURE() on output or use with 438 * function chaining. (See User Guide for details.) 439 * @return "yes" span end index 440 * @stable ICU 4.4 441 */ 442 virtual int32_t 443 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 444 445 /** 446 * Tests if the character always has a normalization boundary before it, 447 * regardless of context. 448 * If true, then the character does not normalization-interact with 449 * preceding characters. 450 * In other words, a string containing this character can be normalized 451 * by processing portions before this character and starting from this 452 * character independently. 453 * This is used for iterative normalization. See the class documentation for details. 454 * @param c character to test 455 * @return TRUE if c has a normalization boundary before it 456 * @stable ICU 4.4 457 */ 458 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 459 460 /** 461 * Tests if the character always has a normalization boundary after it, 462 * regardless of context. 463 * If true, then the character does not normalization-interact with 464 * following characters. 465 * In other words, a string containing this character can be normalized 466 * by processing portions up to this character and after this 467 * character independently. 468 * This is used for iterative normalization. See the class documentation for details. 469 * Note that this operation may be significantly slower than hasBoundaryBefore(). 470 * @param c character to test 471 * @return TRUE if c has a normalization boundary after it 472 * @stable ICU 4.4 473 */ 474 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 475 476 /** 477 * Tests if the character is normalization-inert. 478 * If true, then the character does not change, nor normalization-interact with 479 * preceding or following characters. 480 * In other words, a string containing this character can be normalized 481 * by processing portions before this character and after this 482 * character independently. 483 * This is used for iterative normalization. See the class documentation for details. 484 * Note that this operation may be significantly slower than hasBoundaryBefore(). 485 * @param c character to test 486 * @return TRUE if c is normalization-inert 487 * @stable ICU 4.4 488 */ 489 virtual UBool isInert(UChar32 c) const = 0; 490 }; 491 492 /** 493 * Normalization filtered by a UnicodeSet. 494 * Normalizes portions of the text contained in the filter set and leaves 495 * portions not contained in the filter set unchanged. 496 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 497 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 498 * This class implements all of (and only) the Normalizer2 API. 499 * An instance of this class is unmodifiable/immutable but is constructed and 500 * must be destructed by the owner. 501 * @stable ICU 4.4 502 */ 503 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 504 public: 505 /** 506 * Constructs a filtered normalizer wrapping any Normalizer2 instance 507 * and a filter set. 508 * Both are aliased and must not be modified or deleted while this object 509 * is used. 510 * The filter set should be frozen; otherwise the performance will suffer greatly. 511 * @param n2 wrapped Normalizer2 instance 512 * @param filterSet UnicodeSet which determines the characters to be normalized 513 * @stable ICU 4.4 514 */ FilteredNormalizer2(const Normalizer2 & n2,const UnicodeSet & filterSet)515 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 516 norm2(n2), set(filterSet) {} 517 518 /** 519 * Destructor. 520 * @stable ICU 4.4 521 */ 522 ~FilteredNormalizer2(); 523 524 /** 525 * Writes the normalized form of the source string to the destination string 526 * (replacing its contents) and returns the destination string. 527 * The source and destination strings must be different objects. 528 * @param src source string 529 * @param dest destination string; its contents is replaced with normalized src 530 * @param errorCode Standard ICU error code. Its input value must 531 * pass the U_SUCCESS() test, or else the function returns 532 * immediately. Check for U_FAILURE() on output or use with 533 * function chaining. (See User Guide for details.) 534 * @return dest 535 * @stable ICU 4.4 536 */ 537 virtual UnicodeString & 538 normalize(const UnicodeString &src, 539 UnicodeString &dest, 540 UErrorCode &errorCode) const U_OVERRIDE; 541 542 /** 543 * Normalizes a UTF-8 string and optionally records how source substrings 544 * relate to changed and unchanged result substrings. 545 * 546 * Currently implemented completely only for "compose" modes, 547 * such as for NFC, NFKC, and NFKC_Casefold 548 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 549 * Otherwise currently converts to & from UTF-16 and does not support edits. 550 * 551 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 552 * @param src Source UTF-8 string. 553 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 554 * sink.Flush() is called at the end. 555 * @param edits Records edits for index mapping, working with styled text, 556 * and getting only changes (if any). 557 * The Edits contents is undefined if any error occurs. 558 * This function calls edits->reset() first unless 559 * options includes U_EDITS_NO_RESET. edits can be nullptr. 560 * @param errorCode Standard ICU error code. Its input value must 561 * pass the U_SUCCESS() test, or else the function returns 562 * immediately. Check for U_FAILURE() on output or use with 563 * function chaining. (See User Guide for details.) 564 * @stable ICU 60 565 */ 566 virtual void 567 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 568 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE; 569 570 /** 571 * Appends the normalized form of the second string to the first string 572 * (merging them at the boundary) and returns the first string. 573 * The result is normalized if the first string was normalized. 574 * The first and second strings must be different objects. 575 * @param first string, should be normalized 576 * @param second string, will be normalized 577 * @param errorCode Standard ICU error code. Its input value must 578 * pass the U_SUCCESS() test, or else the function returns 579 * immediately. Check for U_FAILURE() on output or use with 580 * function chaining. (See User Guide for details.) 581 * @return first 582 * @stable ICU 4.4 583 */ 584 virtual UnicodeString & 585 normalizeSecondAndAppend(UnicodeString &first, 586 const UnicodeString &second, 587 UErrorCode &errorCode) const U_OVERRIDE; 588 /** 589 * Appends the second string to the first string 590 * (merging them at the boundary) and returns the first string. 591 * The result is normalized if both the strings were normalized. 592 * The first and second strings must be different objects. 593 * @param first string, should be normalized 594 * @param second string, should be normalized 595 * @param errorCode Standard ICU error code. Its input value must 596 * pass the U_SUCCESS() test, or else the function returns 597 * immediately. Check for U_FAILURE() on output or use with 598 * function chaining. (See User Guide for details.) 599 * @return first 600 * @stable ICU 4.4 601 */ 602 virtual UnicodeString & 603 append(UnicodeString &first, 604 const UnicodeString &second, 605 UErrorCode &errorCode) const U_OVERRIDE; 606 607 /** 608 * Gets the decomposition mapping of c. 609 * For details see the base class documentation. 610 * 611 * This function is independent of the mode of the Normalizer2. 612 * @param c code point 613 * @param decomposition String object which will be set to c's 614 * decomposition mapping, if there is one. 615 * @return TRUE if c has a decomposition, otherwise FALSE 616 * @stable ICU 4.6 617 */ 618 virtual UBool 619 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; 620 621 /** 622 * Gets the raw decomposition mapping of c. 623 * For details see the base class documentation. 624 * 625 * This function is independent of the mode of the Normalizer2. 626 * @param c code point 627 * @param decomposition String object which will be set to c's 628 * raw decomposition mapping, if there is one. 629 * @return TRUE if c has a decomposition, otherwise FALSE 630 * @stable ICU 49 631 */ 632 virtual UBool 633 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; 634 635 /** 636 * Performs pairwise composition of a & b and returns the composite if there is one. 637 * For details see the base class documentation. 638 * 639 * This function is independent of the mode of the Normalizer2. 640 * @param a A (normalization starter) code point. 641 * @param b Another code point. 642 * @return The non-negative composite code point if there is one; otherwise a negative value. 643 * @stable ICU 49 644 */ 645 virtual UChar32 646 composePair(UChar32 a, UChar32 b) const U_OVERRIDE; 647 648 /** 649 * Gets the combining class of c. 650 * The default implementation returns 0 651 * but all standard implementations return the Unicode Canonical_Combining_Class value. 652 * @param c code point 653 * @return c's combining class 654 * @stable ICU 49 655 */ 656 virtual uint8_t 657 getCombiningClass(UChar32 c) const U_OVERRIDE; 658 659 /** 660 * Tests if the string is normalized. 661 * For details see the Normalizer2 base class documentation. 662 * @param s input string 663 * @param errorCode Standard ICU error code. Its input value must 664 * pass the U_SUCCESS() test, or else the function returns 665 * immediately. Check for U_FAILURE() on output or use with 666 * function chaining. (See User Guide for details.) 667 * @return TRUE if s is normalized 668 * @stable ICU 4.4 669 */ 670 virtual UBool 671 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 672 /** 673 * Tests if the UTF-8 string is normalized. 674 * Internally, in cases where the quickCheck() method would return "maybe" 675 * (which is only possible for the two COMPOSE modes) this method 676 * resolves to "yes" or "no" to provide a definitive result, 677 * at the cost of doing more work in those cases. 678 * 679 * This works for all normalization modes, 680 * but it is currently optimized for UTF-8 only for "compose" modes, 681 * such as for NFC, NFKC, and NFKC_Casefold 682 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 683 * For other modes it currently converts to UTF-16 and calls isNormalized(). 684 * 685 * @param s UTF-8 input string 686 * @param errorCode Standard ICU error code. Its input value must 687 * pass the U_SUCCESS() test, or else the function returns 688 * immediately. Check for U_FAILURE() on output or use with 689 * function chaining. (See User Guide for details.) 690 * @return TRUE if s is normalized 691 * @stable ICU 60 692 */ 693 virtual UBool 694 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE; 695 /** 696 * Tests if the string is normalized. 697 * For details see the Normalizer2 base class documentation. 698 * @param s input string 699 * @param errorCode Standard ICU error code. Its input value must 700 * pass the U_SUCCESS() test, or else the function returns 701 * immediately. Check for U_FAILURE() on output or use with 702 * function chaining. (See User Guide for details.) 703 * @return UNormalizationCheckResult 704 * @stable ICU 4.4 705 */ 706 virtual UNormalizationCheckResult 707 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 708 /** 709 * Returns the end of the normalized substring of the input string. 710 * For details see the Normalizer2 base class documentation. 711 * @param s input string 712 * @param errorCode Standard ICU error code. Its input value must 713 * pass the U_SUCCESS() test, or else the function returns 714 * immediately. Check for U_FAILURE() on output or use with 715 * function chaining. (See User Guide for details.) 716 * @return "yes" span end index 717 * @stable ICU 4.4 718 */ 719 virtual int32_t 720 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 721 722 /** 723 * Tests if the character always has a normalization boundary before it, 724 * regardless of context. 725 * For details see the Normalizer2 base class documentation. 726 * @param c character to test 727 * @return TRUE if c has a normalization boundary before it 728 * @stable ICU 4.4 729 */ 730 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE; 731 732 /** 733 * Tests if the character always has a normalization boundary after it, 734 * regardless of context. 735 * For details see the Normalizer2 base class documentation. 736 * @param c character to test 737 * @return TRUE if c has a normalization boundary after it 738 * @stable ICU 4.4 739 */ 740 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE; 741 742 /** 743 * Tests if the character is normalization-inert. 744 * For details see the Normalizer2 base class documentation. 745 * @param c character to test 746 * @return TRUE if c is normalization-inert 747 * @stable ICU 4.4 748 */ 749 virtual UBool isInert(UChar32 c) const U_OVERRIDE; 750 private: 751 UnicodeString & 752 normalize(const UnicodeString &src, 753 UnicodeString &dest, 754 USetSpanCondition spanCondition, 755 UErrorCode &errorCode) const; 756 757 void 758 normalizeUTF8(uint32_t options, const char *src, int32_t length, 759 ByteSink &sink, Edits *edits, 760 USetSpanCondition spanCondition, 761 UErrorCode &errorCode) const; 762 763 UnicodeString & 764 normalizeSecondAndAppend(UnicodeString &first, 765 const UnicodeString &second, 766 UBool doNormalize, 767 UErrorCode &errorCode) const; 768 769 const Normalizer2 &norm2; 770 const UnicodeSet &set; 771 }; 772 773 U_NAMESPACE_END 774 775 #endif // !UCONFIG_NO_NORMALIZATION 776 777 #endif /* U_SHOW_CPLUSPLUS_API */ 778 779 #endif // __NORMALIZER2_H__ 780