1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2013, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __NORMALIZER2_H__ 20 #define __NORMALIZER2_H__ 21 22 /** 23 * \file 24 * \brief C++ API: New API for Unicode Normalization. 25 */ 26 27 #include "unicode/utypes.h" 28 29 #if !UCONFIG_NO_NORMALIZATION 30 31 #include "unicode/stringpiece.h" 32 #include "unicode/uniset.h" 33 #include "unicode/unistr.h" 34 #include "unicode/unorm2.h" 35 36 U_NAMESPACE_BEGIN 37 38 class ByteSink; 39 40 /** 41 * Unicode normalization functionality for standard Unicode normalization or 42 * for using custom mapping tables. 43 * All instances of this class are unmodifiable/immutable. 44 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 45 * The Normalizer2 class is not intended for public subclassing. 46 * 47 * The primary functions are to produce a normalized string and to detect whether 48 * a string is already normalized. 49 * The most commonly used normalization forms are those defined in 50 * http://www.unicode.org/unicode/reports/tr15/ 51 * However, this API supports additional normalization forms for specialized purposes. 52 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 53 * and can be used in implementations of UTS #46. 54 * 55 * Not only are the standard compose and decompose modes supplied, 56 * but additional modes are provided as documented in the Mode enum. 57 * 58 * Some of the functions in this class identify normalization boundaries. 59 * At a normalization boundary, the portions of the string 60 * before it and starting from it do not interact and can be handled independently. 61 * 62 * The spanQuickCheckYes() stops at a normalization boundary. 63 * When the goal is a normalized string, then the text before the boundary 64 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 65 * 66 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 67 * a character is guaranteed to be at a normalization boundary, 68 * regardless of context. 69 * This is used for moving from one normalization boundary to the next 70 * or preceding boundary, and for performing iterative normalization. 71 * 72 * Iterative normalization is useful when only a small portion of a 73 * longer string needs to be processed. 74 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 75 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 76 * (to process only the substring for which sort key bytes are computed). 77 * 78 * The set of normalization boundaries returned by these functions may not be 79 * complete: There may be more boundaries that could be returned. 80 * Different functions may return different boundaries. 81 * @stable ICU 4.4 82 */ 83 class U_COMMON_API Normalizer2 : public UObject { 84 public: 85 /** 86 * Destructor. 87 * @stable ICU 4.4 88 */ 89 ~Normalizer2(); 90 91 /** 92 * Returns a Normalizer2 instance for Unicode NFC normalization. 93 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). 94 * Returns an unmodifiable singleton instance. Do not delete it. 95 * @param errorCode Standard ICU error code. Its input value must 96 * pass the U_SUCCESS() test, or else the function returns 97 * immediately. Check for U_FAILURE() on output or use with 98 * function chaining. (See User Guide for details.) 99 * @return the requested Normalizer2, if successful 100 * @stable ICU 49 101 */ 102 static const Normalizer2 * 103 getNFCInstance(UErrorCode &errorCode); 104 105 /** 106 * Returns a Normalizer2 instance for Unicode NFD normalization. 107 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode). 108 * Returns an unmodifiable singleton instance. Do not delete it. 109 * @param errorCode Standard ICU error code. Its input value must 110 * pass the U_SUCCESS() test, or else the function returns 111 * immediately. Check for U_FAILURE() on output or use with 112 * function chaining. (See User Guide for details.) 113 * @return the requested Normalizer2, if successful 114 * @stable ICU 49 115 */ 116 static const Normalizer2 * 117 getNFDInstance(UErrorCode &errorCode); 118 119 /** 120 * Returns a Normalizer2 instance for Unicode NFKC normalization. 121 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). 122 * Returns an unmodifiable singleton instance. Do not delete it. 123 * @param errorCode Standard ICU error code. Its input value must 124 * pass the U_SUCCESS() test, or else the function returns 125 * immediately. Check for U_FAILURE() on output or use with 126 * function chaining. (See User Guide for details.) 127 * @return the requested Normalizer2, if successful 128 * @stable ICU 49 129 */ 130 static const Normalizer2 * 131 getNFKCInstance(UErrorCode &errorCode); 132 133 /** 134 * Returns a Normalizer2 instance for Unicode NFKD normalization. 135 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode). 136 * Returns an unmodifiable singleton instance. Do not delete it. 137 * @param errorCode Standard ICU error code. Its input value must 138 * pass the U_SUCCESS() test, or else the function returns 139 * immediately. Check for U_FAILURE() on output or use with 140 * function chaining. (See User Guide for details.) 141 * @return the requested Normalizer2, if successful 142 * @stable ICU 49 143 */ 144 static const Normalizer2 * 145 getNFKDInstance(UErrorCode &errorCode); 146 147 /** 148 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. 149 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). 150 * Returns an unmodifiable singleton instance. Do not delete it. 151 * @param errorCode Standard ICU error code. Its input value must 152 * pass the U_SUCCESS() test, or else the function returns 153 * immediately. Check for U_FAILURE() on output or use with 154 * function chaining. (See User Guide for details.) 155 * @return the requested Normalizer2, if successful 156 * @stable ICU 49 157 */ 158 static const Normalizer2 * 159 getNFKCCasefoldInstance(UErrorCode &errorCode); 160 161 /** 162 * Returns a Normalizer2 instance which uses the specified data file 163 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 164 * and which composes or decomposes text according to the specified mode. 165 * Returns an unmodifiable singleton instance. Do not delete it. 166 * 167 * Use packageName=NULL for data files that are part of ICU's own data. 168 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 169 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 170 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 171 * 172 * @param packageName NULL for ICU built-in data, otherwise application data package name 173 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file 174 * @param mode normalization mode (compose or decompose etc.) 175 * @param errorCode Standard ICU error code. Its input value must 176 * pass the U_SUCCESS() test, or else the function returns 177 * immediately. Check for U_FAILURE() on output or use with 178 * function chaining. (See User Guide for details.) 179 * @return the requested Normalizer2, if successful 180 * @stable ICU 4.4 181 */ 182 static const Normalizer2 * 183 getInstance(const char *packageName, 184 const char *name, 185 UNormalization2Mode mode, 186 UErrorCode &errorCode); 187 188 /** 189 * Returns the normalized form of the source string. 190 * @param src source string 191 * @param errorCode Standard ICU error code. Its input value must 192 * pass the U_SUCCESS() test, or else the function returns 193 * immediately. Check for U_FAILURE() on output or use with 194 * function chaining. (See User Guide for details.) 195 * @return normalized src 196 * @stable ICU 4.4 197 */ 198 UnicodeString normalize(const UnicodeString & src,UErrorCode & errorCode)199 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 200 UnicodeString result; 201 normalize(src, result, errorCode); 202 return result; 203 } 204 /** 205 * Writes the normalized form of the source string to the destination string 206 * (replacing its contents) and returns the destination string. 207 * The source and destination strings must be different objects. 208 * @param src source string 209 * @param dest destination string; its contents is replaced with normalized src 210 * @param errorCode Standard ICU error code. Its input value must 211 * pass the U_SUCCESS() test, or else the function returns 212 * immediately. Check for U_FAILURE() on output or use with 213 * function chaining. (See User Guide for details.) 214 * @return dest 215 * @stable ICU 4.4 216 */ 217 virtual UnicodeString & 218 normalize(const UnicodeString &src, 219 UnicodeString &dest, 220 UErrorCode &errorCode) const = 0; 221 222 /** 223 * Normalizes a UTF-8 string and optionally records how source substrings 224 * relate to changed and unchanged result substrings. 225 * 226 * Currently implemented completely only for "compose" modes, 227 * such as for NFC, NFKC, and NFKC_Casefold 228 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 229 * Otherwise currently converts to & from UTF-16 and does not support edits. 230 * 231 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 232 * @param src Source UTF-8 string. 233 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 234 * sink.Flush() is called at the end. 235 * @param edits Records edits for index mapping, working with styled text, 236 * and getting only changes (if any). 237 * The Edits contents is undefined if any error occurs. 238 * This function calls edits->reset() first unless 239 * options includes U_EDITS_NO_RESET. edits can be nullptr. 240 * @param errorCode Standard ICU error code. Its input value must 241 * pass the U_SUCCESS() test, or else the function returns 242 * immediately. Check for U_FAILURE() on output or use with 243 * function chaining. (See User Guide for details.) 244 * @stable ICU 60 245 */ 246 virtual void 247 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 248 Edits *edits, UErrorCode &errorCode) const; 249 250 /** 251 * Appends the normalized form of the second string to the first string 252 * (merging them at the boundary) and returns the first string. 253 * The result is normalized if the first string was normalized. 254 * The first and second strings must be different objects. 255 * @param first string, should be normalized 256 * @param second string, will be normalized 257 * @param errorCode Standard ICU error code. Its input value must 258 * pass the U_SUCCESS() test, or else the function returns 259 * immediately. Check for U_FAILURE() on output or use with 260 * function chaining. (See User Guide for details.) 261 * @return first 262 * @stable ICU 4.4 263 */ 264 virtual UnicodeString & 265 normalizeSecondAndAppend(UnicodeString &first, 266 const UnicodeString &second, 267 UErrorCode &errorCode) const = 0; 268 /** 269 * Appends the second string to the first string 270 * (merging them at the boundary) and returns the first string. 271 * The result is normalized if both the strings were normalized. 272 * The first and second strings must be different objects. 273 * @param first string, should be normalized 274 * @param second string, should be normalized 275 * @param errorCode Standard ICU error code. Its input value must 276 * pass the U_SUCCESS() test, or else the function returns 277 * immediately. Check for U_FAILURE() on output or use with 278 * function chaining. (See User Guide for details.) 279 * @return first 280 * @stable ICU 4.4 281 */ 282 virtual UnicodeString & 283 append(UnicodeString &first, 284 const UnicodeString &second, 285 UErrorCode &errorCode) const = 0; 286 287 /** 288 * Gets the decomposition mapping of c. 289 * Roughly equivalent to normalizing the String form of c 290 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 291 * returns FALSE and does not write a string 292 * if c does not have a decomposition mapping in this instance's data. 293 * This function is independent of the mode of the Normalizer2. 294 * @param c code point 295 * @param decomposition String object which will be set to c's 296 * decomposition mapping, if there is one. 297 * @return TRUE if c has a decomposition, otherwise FALSE 298 * @stable ICU 4.6 299 */ 300 virtual UBool 301 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 302 303 /** 304 * Gets the raw decomposition mapping of c. 305 * 306 * This is similar to the getDecomposition() method but returns the 307 * raw decomposition mapping as specified in UnicodeData.txt or 308 * (for custom data) in the mapping files processed by the gennorm2 tool. 309 * By contrast, getDecomposition() returns the processed, 310 * recursively-decomposed version of this mapping. 311 * 312 * When used on a standard NFKC Normalizer2 instance, 313 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 314 * 315 * When used on a standard NFC Normalizer2 instance, 316 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 317 * in this case, the result contains either one or two code points (=1..4 char16_ts). 318 * 319 * This function is independent of the mode of the Normalizer2. 320 * The default implementation returns FALSE. 321 * @param c code point 322 * @param decomposition String object which will be set to c's 323 * raw decomposition mapping, if there is one. 324 * @return TRUE if c has a decomposition, otherwise FALSE 325 * @stable ICU 49 326 */ 327 virtual UBool 328 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; 329 330 /** 331 * Performs pairwise composition of a & b and returns the composite if there is one. 332 * 333 * Returns a composite code point c only if c has a two-way mapping to a+b. 334 * In standard Unicode normalization, this means that 335 * c has a canonical decomposition to a+b 336 * and c does not have the Full_Composition_Exclusion property. 337 * 338 * This function is independent of the mode of the Normalizer2. 339 * The default implementation returns a negative value. 340 * @param a A (normalization starter) code point. 341 * @param b Another code point. 342 * @return The non-negative composite code point if there is one; otherwise a negative value. 343 * @stable ICU 49 344 */ 345 virtual UChar32 346 composePair(UChar32 a, UChar32 b) const; 347 348 /** 349 * Gets the combining class of c. 350 * The default implementation returns 0 351 * but all standard implementations return the Unicode Canonical_Combining_Class value. 352 * @param c code point 353 * @return c's combining class 354 * @stable ICU 49 355 */ 356 virtual uint8_t 357 getCombiningClass(UChar32 c) const; 358 359 /** 360 * Tests if the string is normalized. 361 * Internally, in cases where the quickCheck() method would return "maybe" 362 * (which is only possible for the two COMPOSE modes) this method 363 * resolves to "yes" or "no" to provide a definitive result, 364 * at the cost of doing more work in those cases. 365 * @param s input string 366 * @param errorCode Standard ICU error code. Its input value must 367 * pass the U_SUCCESS() test, or else the function returns 368 * immediately. Check for U_FAILURE() on output or use with 369 * function chaining. (See User Guide for details.) 370 * @return TRUE if s is normalized 371 * @stable ICU 4.4 372 */ 373 virtual UBool 374 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 375 /** 376 * Tests if the UTF-8 string is normalized. 377 * Internally, in cases where the quickCheck() method would return "maybe" 378 * (which is only possible for the two COMPOSE modes) this method 379 * resolves to "yes" or "no" to provide a definitive result, 380 * at the cost of doing more work in those cases. 381 * 382 * This works for all normalization modes, 383 * but it is currently optimized for UTF-8 only for "compose" modes, 384 * such as for NFC, NFKC, and NFKC_Casefold 385 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 386 * For other modes it currently converts to UTF-16 and calls isNormalized(). 387 * 388 * @param s UTF-8 input string 389 * @param errorCode Standard ICU error code. Its input value must 390 * pass the U_SUCCESS() test, or else the function returns 391 * immediately. Check for U_FAILURE() on output or use with 392 * function chaining. (See User Guide for details.) 393 * @return TRUE if s is normalized 394 * @stable ICU 60 395 */ 396 virtual UBool 397 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; 398 399 400 /** 401 * Tests if the string is normalized. 402 * For the two COMPOSE modes, the result could be "maybe" in cases that 403 * would take a little more work to resolve definitively. 404 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 405 * combination of quick check + normalization, to avoid 406 * re-checking the "yes" prefix. 407 * @param s input string 408 * @param errorCode Standard ICU error code. Its input value must 409 * pass the U_SUCCESS() test, or else the function returns 410 * immediately. Check for U_FAILURE() on output or use with 411 * function chaining. (See User Guide for details.) 412 * @return UNormalizationCheckResult 413 * @stable ICU 4.4 414 */ 415 virtual UNormalizationCheckResult 416 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 417 418 /** 419 * Returns the end of the normalized substring of the input string. 420 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 421 * the substring <code>UnicodeString(s, 0, end)</code> 422 * will pass the quick check with a "yes" result. 423 * 424 * The returned end index is usually one or more characters before the 425 * "no" or "maybe" character: The end index is at a normalization boundary. 426 * (See the class documentation for more about normalization boundaries.) 427 * 428 * When the goal is a normalized string and most input strings are expected 429 * to be normalized already, then call this method, 430 * and if it returns a prefix shorter than the input string, 431 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 432 * @param s input string 433 * @param errorCode Standard ICU error code. Its input value must 434 * pass the U_SUCCESS() test, or else the function returns 435 * immediately. Check for U_FAILURE() on output or use with 436 * function chaining. (See User Guide for details.) 437 * @return "yes" span end index 438 * @stable ICU 4.4 439 */ 440 virtual int32_t 441 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 442 443 /** 444 * Tests if the character always has a normalization boundary before it, 445 * regardless of context. 446 * If true, then the character does not normalization-interact with 447 * preceding characters. 448 * In other words, a string containing this character can be normalized 449 * by processing portions before this character and starting from this 450 * character independently. 451 * This is used for iterative normalization. See the class documentation for details. 452 * @param c character to test 453 * @return TRUE if c has a normalization boundary before it 454 * @stable ICU 4.4 455 */ 456 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 457 458 /** 459 * Tests if the character always has a normalization boundary after it, 460 * regardless of context. 461 * If true, then the character does not normalization-interact with 462 * following characters. 463 * In other words, a string containing this character can be normalized 464 * by processing portions up to this character and after this 465 * character independently. 466 * This is used for iterative normalization. See the class documentation for details. 467 * Note that this operation may be significantly slower than hasBoundaryBefore(). 468 * @param c character to test 469 * @return TRUE if c has a normalization boundary after it 470 * @stable ICU 4.4 471 */ 472 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 473 474 /** 475 * Tests if the character is normalization-inert. 476 * If true, then the character does not change, nor normalization-interact with 477 * preceding or following characters. 478 * In other words, a string containing this character can be normalized 479 * by processing portions before this character and after this 480 * character independently. 481 * This is used for iterative normalization. See the class documentation for details. 482 * Note that this operation may be significantly slower than hasBoundaryBefore(). 483 * @param c character to test 484 * @return TRUE if c is normalization-inert 485 * @stable ICU 4.4 486 */ 487 virtual UBool isInert(UChar32 c) const = 0; 488 }; 489 490 /** 491 * Normalization filtered by a UnicodeSet. 492 * Normalizes portions of the text contained in the filter set and leaves 493 * portions not contained in the filter set unchanged. 494 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 495 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 496 * This class implements all of (and only) the Normalizer2 API. 497 * An instance of this class is unmodifiable/immutable but is constructed and 498 * must be destructed by the owner. 499 * @stable ICU 4.4 500 */ 501 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 502 public: 503 /** 504 * Constructs a filtered normalizer wrapping any Normalizer2 instance 505 * and a filter set. 506 * Both are aliased and must not be modified or deleted while this object 507 * is used. 508 * The filter set should be frozen; otherwise the performance will suffer greatly. 509 * @param n2 wrapped Normalizer2 instance 510 * @param filterSet UnicodeSet which determines the characters to be normalized 511 * @stable ICU 4.4 512 */ FilteredNormalizer2(const Normalizer2 & n2,const UnicodeSet & filterSet)513 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 514 norm2(n2), set(filterSet) {} 515 516 /** 517 * Destructor. 518 * @stable ICU 4.4 519 */ 520 ~FilteredNormalizer2(); 521 522 /** 523 * Writes the normalized form of the source string to the destination string 524 * (replacing its contents) and returns the destination string. 525 * The source and destination strings must be different objects. 526 * @param src source string 527 * @param dest destination string; its contents is replaced with normalized src 528 * @param errorCode Standard ICU error code. Its input value must 529 * pass the U_SUCCESS() test, or else the function returns 530 * immediately. Check for U_FAILURE() on output or use with 531 * function chaining. (See User Guide for details.) 532 * @return dest 533 * @stable ICU 4.4 534 */ 535 virtual UnicodeString & 536 normalize(const UnicodeString &src, 537 UnicodeString &dest, 538 UErrorCode &errorCode) const U_OVERRIDE; 539 540 /** 541 * Normalizes a UTF-8 string and optionally records how source substrings 542 * relate to changed and unchanged result substrings. 543 * 544 * Currently implemented completely only for "compose" modes, 545 * such as for NFC, NFKC, and NFKC_Casefold 546 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 547 * Otherwise currently converts to & from UTF-16 and does not support edits. 548 * 549 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 550 * @param src Source UTF-8 string. 551 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 552 * sink.Flush() is called at the end. 553 * @param edits Records edits for index mapping, working with styled text, 554 * and getting only changes (if any). 555 * The Edits contents is undefined if any error occurs. 556 * This function calls edits->reset() first unless 557 * options includes U_EDITS_NO_RESET. edits can be nullptr. 558 * @param errorCode Standard ICU error code. Its input value must 559 * pass the U_SUCCESS() test, or else the function returns 560 * immediately. Check for U_FAILURE() on output or use with 561 * function chaining. (See User Guide for details.) 562 * @stable ICU 60 563 */ 564 virtual void 565 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 566 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE; 567 568 /** 569 * Appends the normalized form of the second string to the first string 570 * (merging them at the boundary) and returns the first string. 571 * The result is normalized if the first string was normalized. 572 * The first and second strings must be different objects. 573 * @param first string, should be normalized 574 * @param second string, will be normalized 575 * @param errorCode Standard ICU error code. Its input value must 576 * pass the U_SUCCESS() test, or else the function returns 577 * immediately. Check for U_FAILURE() on output or use with 578 * function chaining. (See User Guide for details.) 579 * @return first 580 * @stable ICU 4.4 581 */ 582 virtual UnicodeString & 583 normalizeSecondAndAppend(UnicodeString &first, 584 const UnicodeString &second, 585 UErrorCode &errorCode) const U_OVERRIDE; 586 /** 587 * Appends the second string to the first string 588 * (merging them at the boundary) and returns the first string. 589 * The result is normalized if both the strings were normalized. 590 * The first and second strings must be different objects. 591 * @param first string, should be normalized 592 * @param second string, should be normalized 593 * @param errorCode Standard ICU error code. Its input value must 594 * pass the U_SUCCESS() test, or else the function returns 595 * immediately. Check for U_FAILURE() on output or use with 596 * function chaining. (See User Guide for details.) 597 * @return first 598 * @stable ICU 4.4 599 */ 600 virtual UnicodeString & 601 append(UnicodeString &first, 602 const UnicodeString &second, 603 UErrorCode &errorCode) const U_OVERRIDE; 604 605 /** 606 * Gets the decomposition mapping of c. 607 * For details see the base class documentation. 608 * 609 * This function is independent of the mode of the Normalizer2. 610 * @param c code point 611 * @param decomposition String object which will be set to c's 612 * decomposition mapping, if there is one. 613 * @return TRUE if c has a decomposition, otherwise FALSE 614 * @stable ICU 4.6 615 */ 616 virtual UBool 617 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; 618 619 /** 620 * Gets the raw decomposition mapping of c. 621 * For details see the base class documentation. 622 * 623 * This function is independent of the mode of the Normalizer2. 624 * @param c code point 625 * @param decomposition String object which will be set to c's 626 * raw decomposition mapping, if there is one. 627 * @return TRUE if c has a decomposition, otherwise FALSE 628 * @stable ICU 49 629 */ 630 virtual UBool 631 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; 632 633 /** 634 * Performs pairwise composition of a & b and returns the composite if there is one. 635 * For details see the base class documentation. 636 * 637 * This function is independent of the mode of the Normalizer2. 638 * @param a A (normalization starter) code point. 639 * @param b Another code point. 640 * @return The non-negative composite code point if there is one; otherwise a negative value. 641 * @stable ICU 49 642 */ 643 virtual UChar32 644 composePair(UChar32 a, UChar32 b) const U_OVERRIDE; 645 646 /** 647 * Gets the combining class of c. 648 * The default implementation returns 0 649 * but all standard implementations return the Unicode Canonical_Combining_Class value. 650 * @param c code point 651 * @return c's combining class 652 * @stable ICU 49 653 */ 654 virtual uint8_t 655 getCombiningClass(UChar32 c) const U_OVERRIDE; 656 657 /** 658 * Tests if the string is normalized. 659 * For details see the Normalizer2 base class documentation. 660 * @param s input string 661 * @param errorCode Standard ICU error code. Its input value must 662 * pass the U_SUCCESS() test, or else the function returns 663 * immediately. Check for U_FAILURE() on output or use with 664 * function chaining. (See User Guide for details.) 665 * @return TRUE if s is normalized 666 * @stable ICU 4.4 667 */ 668 virtual UBool 669 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 670 /** 671 * Tests if the UTF-8 string is normalized. 672 * Internally, in cases where the quickCheck() method would return "maybe" 673 * (which is only possible for the two COMPOSE modes) this method 674 * resolves to "yes" or "no" to provide a definitive result, 675 * at the cost of doing more work in those cases. 676 * 677 * This works for all normalization modes, 678 * but it is currently optimized for UTF-8 only for "compose" modes, 679 * such as for NFC, NFKC, and NFKC_Casefold 680 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). 681 * For other modes it currently converts to UTF-16 and calls isNormalized(). 682 * 683 * @param s UTF-8 input string 684 * @param errorCode Standard ICU error code. Its input value must 685 * pass the U_SUCCESS() test, or else the function returns 686 * immediately. Check for U_FAILURE() on output or use with 687 * function chaining. (See User Guide for details.) 688 * @return TRUE if s is normalized 689 * @stable ICU 60 690 */ 691 virtual UBool 692 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE; 693 /** 694 * Tests if the string is normalized. 695 * For details see the Normalizer2 base class documentation. 696 * @param s input string 697 * @param errorCode Standard ICU error code. Its input value must 698 * pass the U_SUCCESS() test, or else the function returns 699 * immediately. Check for U_FAILURE() on output or use with 700 * function chaining. (See User Guide for details.) 701 * @return UNormalizationCheckResult 702 * @stable ICU 4.4 703 */ 704 virtual UNormalizationCheckResult 705 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 706 /** 707 * Returns the end of the normalized substring of the input string. 708 * For details see the Normalizer2 base class documentation. 709 * @param s input string 710 * @param errorCode Standard ICU error code. Its input value must 711 * pass the U_SUCCESS() test, or else the function returns 712 * immediately. Check for U_FAILURE() on output or use with 713 * function chaining. (See User Guide for details.) 714 * @return "yes" span end index 715 * @stable ICU 4.4 716 */ 717 virtual int32_t 718 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 719 720 /** 721 * Tests if the character always has a normalization boundary before it, 722 * regardless of context. 723 * For details see the Normalizer2 base class documentation. 724 * @param c character to test 725 * @return TRUE if c has a normalization boundary before it 726 * @stable ICU 4.4 727 */ 728 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE; 729 730 /** 731 * Tests if the character always has a normalization boundary after it, 732 * regardless of context. 733 * For details see the Normalizer2 base class documentation. 734 * @param c character to test 735 * @return TRUE if c has a normalization boundary after it 736 * @stable ICU 4.4 737 */ 738 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE; 739 740 /** 741 * Tests if the character is normalization-inert. 742 * For details see the Normalizer2 base class documentation. 743 * @param c character to test 744 * @return TRUE if c is normalization-inert 745 * @stable ICU 4.4 746 */ 747 virtual UBool isInert(UChar32 c) const U_OVERRIDE; 748 private: 749 UnicodeString & 750 normalize(const UnicodeString &src, 751 UnicodeString &dest, 752 USetSpanCondition spanCondition, 753 UErrorCode &errorCode) const; 754 755 void 756 normalizeUTF8(uint32_t options, const char *src, int32_t length, 757 ByteSink &sink, Edits *edits, 758 USetSpanCondition spanCondition, 759 UErrorCode &errorCode) const; 760 761 UnicodeString & 762 normalizeSecondAndAppend(UnicodeString &first, 763 const UnicodeString &second, 764 UBool doNormalize, 765 UErrorCode &errorCode) const; 766 767 const Normalizer2 &norm2; 768 const UnicodeSet &set; 769 }; 770 771 U_NAMESPACE_END 772 773 #endif // !UCONFIG_NO_NORMALIZATION 774 #endif // __NORMALIZER2_H__ 775