1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2013, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __NORMALIZER2_H__ 20 #define __NORMALIZER2_H__ 21 22 /** 23 * \file 24 * \brief C++ API: New API for Unicode Normalization. 25 */ 26 27 #include "unicode/utypes.h" 28 29 #if U_SHOW_CPLUSPLUS_API 30 31 #if !UCONFIG_NO_NORMALIZATION 32 33 #include "unicode/stringpiece.h" 34 #include "unicode/uniset.h" 35 #include "unicode/unistr.h" 36 #include "unicode/unorm2.h" 37 38 U_NAMESPACE_BEGIN 39 40 class ByteSink; 41 42 /** 43 * Unicode normalization functionality for standard Unicode normalization or 44 * for using custom mapping tables. 45 * All instances of this class are unmodifiable/immutable. 46 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 47 * The Normalizer2 class is not intended for public subclassing. 48 * 49 * The primary functions are to produce a normalized string and to detect whether 50 * a string is already normalized. 51 * The most commonly used normalization forms are those defined in 52 * http://www.unicode.org/unicode/reports/tr15/ 53 * However, this API supports additional normalization forms for specialized purposes. 54 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 55 * and can be used in implementations of UTS #46. 56 * 57 * Not only are the standard compose and decompose modes supplied, 58 * but additional modes are provided as documented in the Mode enum. 59 * 60 * Some of the functions in this class identify normalization boundaries. 61 * At a normalization boundary, the portions of the string 62 * before it and starting from it do not interact and can be handled independently. 63 * 64 * The spanQuickCheckYes() stops at a normalization boundary. 65 * When the goal is a normalized string, then the text before the boundary 66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 67 * 68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 69 * a character is guaranteed to be at a normalization boundary, 70 * regardless of context. 71 * This is used for moving from one normalization boundary to the next 72 * or preceding boundary, and for performing iterative normalization. 73 * 74 * Iterative normalization is useful when only a small portion of a 75 * longer string needs to be processed. 76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 78 * (to process only the substring for which sort key bytes are computed). 79 * 80 * The set of normalization boundaries returned by these functions may not be 81 * complete: There may be more boundaries that could be returned. 82 * Different functions may return different boundaries. 83 * @stable ICU 4.4 84 */ 85 class U_COMMON_API Normalizer2 : public UObject { 86 public: 87 /** 88 * Destructor. 89 * @stable ICU 4.4 90 */ 91 ~Normalizer2(); 92 93 /** 94 * Returns a Normalizer2 instance for Unicode NFC normalization. 95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). 96 * Returns an unmodifiable singleton instance. Do not delete it. 97 * @param errorCode Standard ICU error code. Its input value must 98 * pass the U_SUCCESS() test, or else the function returns 99 * immediately. Check for U_FAILURE() on output or use with 100 * function chaining. (See User Guide for details.) 101 * @return the requested Normalizer2, if successful 102 * @stable ICU 49 103 */ 104 static const Normalizer2 * 105 getNFCInstance(UErrorCode &errorCode); 106 107 /** 108 * Returns a Normalizer2 instance for Unicode NFD normalization. 109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode). 110 * Returns an unmodifiable singleton instance. Do not delete it. 111 * @param errorCode Standard ICU error code. Its input value must 112 * pass the U_SUCCESS() test, or else the function returns 113 * immediately. Check for U_FAILURE() on output or use with 114 * function chaining. (See User Guide for details.) 115 * @return the requested Normalizer2, if successful 116 * @stable ICU 49 117 */ 118 static const Normalizer2 * 119 getNFDInstance(UErrorCode &errorCode); 120 121 /** 122 * Returns a Normalizer2 instance for Unicode NFKC normalization. 123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). 124 * Returns an unmodifiable singleton instance. Do not delete it. 125 * @param errorCode Standard ICU error code. Its input value must 126 * pass the U_SUCCESS() test, or else the function returns 127 * immediately. Check for U_FAILURE() on output or use with 128 * function chaining. (See User Guide for details.) 129 * @return the requested Normalizer2, if successful 130 * @stable ICU 49 131 */ 132 static const Normalizer2 * 133 getNFKCInstance(UErrorCode &errorCode); 134 135 /** 136 * Returns a Normalizer2 instance for Unicode NFKD normalization. 137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode). 138 * Returns an unmodifiable singleton instance. Do not delete it. 139 * @param errorCode Standard ICU error code. Its input value must 140 * pass the U_SUCCESS() test, or else the function returns 141 * immediately. Check for U_FAILURE() on output or use with 142 * function chaining. (See User Guide for details.) 143 * @return the requested Normalizer2, if successful 144 * @stable ICU 49 145 */ 146 static const Normalizer2 * 147 getNFKDInstance(UErrorCode &errorCode); 148 149 /** 150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. 151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). 152 * Returns an unmodifiable singleton instance. Do not delete it. 153 * @param errorCode Standard ICU error code. Its input value must 154 * pass the U_SUCCESS() test, or else the function returns 155 * immediately. Check for U_FAILURE() on output or use with 156 * function chaining. (See User Guide for details.) 157 * @return the requested Normalizer2, if successful 158 * @stable ICU 49 159 */ 160 static const Normalizer2 * 161 getNFKCCasefoldInstance(UErrorCode &errorCode); 162 163 /** 164 * Returns a Normalizer2 instance which uses the specified data file 165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 166 * and which composes or decomposes text according to the specified mode. 167 * Returns an unmodifiable singleton instance. Do not delete it. 168 * 169 * Use packageName=NULL for data files that are part of ICU's own data. 170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 173 * 174 * @param packageName NULL for ICU built-in data, otherwise application data package name 175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file 176 * @param mode normalization mode (compose or decompose etc.) 177 * @param errorCode Standard ICU error code. Its input value must 178 * pass the U_SUCCESS() test, or else the function returns 179 * immediately. Check for U_FAILURE() on output or use with 180 * function chaining. (See User Guide for details.) 181 * @return the requested Normalizer2, if successful 182 * @stable ICU 4.4 183 */ 184 static const Normalizer2 * 185 getInstance(const char *packageName, 186 const char *name, 187 UNormalization2Mode mode, 188 UErrorCode &errorCode); 189 190 /** 191 * Returns the normalized form of the source string. 192 * @param src source string 193 * @param errorCode Standard ICU error code. Its input value must 194 * pass the U_SUCCESS() test, or else the function returns 195 * immediately. Check for U_FAILURE() on output or use with 196 * function chaining. (See User Guide for details.) 197 * @return normalized src 198 * @stable ICU 4.4 199 */ 200 UnicodeString normalize(const UnicodeString & src,UErrorCode & errorCode)201 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 202 UnicodeString result; 203 normalize(src, result, errorCode); 204 return result; 205 } 206 /** 207 * Writes the normalized form of the source string to the destination string 208 * (replacing its contents) and returns the destination string. 209 * The source and destination strings must be different objects. 210 * @param src source string 211 * @param dest destination string; its contents is replaced with normalized src 212 * @param errorCode Standard ICU error code. Its input value must 213 * pass the U_SUCCESS() test, or else the function returns 214 * immediately. Check for U_FAILURE() on output or use with 215 * function chaining. (See User Guide for details.) 216 * @return dest 217 * @stable ICU 4.4 218 */ 219 virtual UnicodeString & 220 normalize(const UnicodeString &src, 221 UnicodeString &dest, 222 UErrorCode &errorCode) const = 0; 223 224 /** 225 * Normalizes a UTF-8 string and optionally records how source substrings 226 * relate to changed and unchanged result substrings. 227 * 228 * Implemented completely for all built-in modes except for FCD. 229 * The base class implementation converts to & from UTF-16 and does not support edits. 230 * 231 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 232 * @param src Source UTF-8 string. 233 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 234 * sink.Flush() is called at the end. 235 * @param edits Records edits for index mapping, working with styled text, 236 * and getting only changes (if any). 237 * The Edits contents is undefined if any error occurs. 238 * This function calls edits->reset() first unless 239 * options includes U_EDITS_NO_RESET. edits can be nullptr. 240 * @param errorCode Standard ICU error code. Its input value must 241 * pass the U_SUCCESS() test, or else the function returns 242 * immediately. Check for U_FAILURE() on output or use with 243 * function chaining. (See User Guide for details.) 244 * @stable ICU 60 245 */ 246 virtual void 247 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 248 Edits *edits, UErrorCode &errorCode) const; 249 250 /** 251 * Appends the normalized form of the second string to the first string 252 * (merging them at the boundary) and returns the first string. 253 * The result is normalized if the first string was normalized. 254 * The first and second strings must be different objects. 255 * @param first string, should be normalized 256 * @param second string, will be normalized 257 * @param errorCode Standard ICU error code. Its input value must 258 * pass the U_SUCCESS() test, or else the function returns 259 * immediately. Check for U_FAILURE() on output or use with 260 * function chaining. (See User Guide for details.) 261 * @return first 262 * @stable ICU 4.4 263 */ 264 virtual UnicodeString & 265 normalizeSecondAndAppend(UnicodeString &first, 266 const UnicodeString &second, 267 UErrorCode &errorCode) const = 0; 268 /** 269 * Appends the second string to the first string 270 * (merging them at the boundary) and returns the first string. 271 * The result is normalized if both the strings were normalized. 272 * The first and second strings must be different objects. 273 * @param first string, should be normalized 274 * @param second string, should be normalized 275 * @param errorCode Standard ICU error code. Its input value must 276 * pass the U_SUCCESS() test, or else the function returns 277 * immediately. Check for U_FAILURE() on output or use with 278 * function chaining. (See User Guide for details.) 279 * @return first 280 * @stable ICU 4.4 281 */ 282 virtual UnicodeString & 283 append(UnicodeString &first, 284 const UnicodeString &second, 285 UErrorCode &errorCode) const = 0; 286 287 /** 288 * Gets the decomposition mapping of c. 289 * Roughly equivalent to normalizing the String form of c 290 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 291 * returns false and does not write a string 292 * if c does not have a decomposition mapping in this instance's data. 293 * This function is independent of the mode of the Normalizer2. 294 * @param c code point 295 * @param decomposition String object which will be set to c's 296 * decomposition mapping, if there is one. 297 * @return true if c has a decomposition, otherwise false 298 * @stable ICU 4.6 299 */ 300 virtual UBool 301 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 302 303 /** 304 * Gets the raw decomposition mapping of c. 305 * 306 * This is similar to the getDecomposition() method but returns the 307 * raw decomposition mapping as specified in UnicodeData.txt or 308 * (for custom data) in the mapping files processed by the gennorm2 tool. 309 * By contrast, getDecomposition() returns the processed, 310 * recursively-decomposed version of this mapping. 311 * 312 * When used on a standard NFKC Normalizer2 instance, 313 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 314 * 315 * When used on a standard NFC Normalizer2 instance, 316 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 317 * in this case, the result contains either one or two code points (=1..4 char16_ts). 318 * 319 * This function is independent of the mode of the Normalizer2. 320 * The default implementation returns false. 321 * @param c code point 322 * @param decomposition String object which will be set to c's 323 * raw decomposition mapping, if there is one. 324 * @return true if c has a decomposition, otherwise false 325 * @stable ICU 49 326 */ 327 virtual UBool 328 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; 329 330 /** 331 * Performs pairwise composition of a & b and returns the composite if there is one. 332 * 333 * Returns a composite code point c only if c has a two-way mapping to a+b. 334 * In standard Unicode normalization, this means that 335 * c has a canonical decomposition to a+b 336 * and c does not have the Full_Composition_Exclusion property. 337 * 338 * This function is independent of the mode of the Normalizer2. 339 * The default implementation returns a negative value. 340 * @param a A (normalization starter) code point. 341 * @param b Another code point. 342 * @return The non-negative composite code point if there is one; otherwise a negative value. 343 * @stable ICU 49 344 */ 345 virtual UChar32 346 composePair(UChar32 a, UChar32 b) const; 347 348 /** 349 * Gets the combining class of c. 350 * The default implementation returns 0 351 * but all standard implementations return the Unicode Canonical_Combining_Class value. 352 * @param c code point 353 * @return c's combining class 354 * @stable ICU 49 355 */ 356 virtual uint8_t 357 getCombiningClass(UChar32 c) const; 358 359 /** 360 * Tests if the string is normalized. 361 * Internally, in cases where the quickCheck() method would return "maybe" 362 * (which is only possible for the two COMPOSE modes) this method 363 * resolves to "yes" or "no" to provide a definitive result, 364 * at the cost of doing more work in those cases. 365 * @param s input string 366 * @param errorCode Standard ICU error code. Its input value must 367 * pass the U_SUCCESS() test, or else the function returns 368 * immediately. Check for U_FAILURE() on output or use with 369 * function chaining. (See User Guide for details.) 370 * @return true if s is normalized 371 * @stable ICU 4.4 372 */ 373 virtual UBool 374 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 375 /** 376 * Tests if the UTF-8 string is normalized. 377 * Internally, in cases where the quickCheck() method would return "maybe" 378 * (which is only possible for the two COMPOSE modes) this method 379 * resolves to "yes" or "no" to provide a definitive result, 380 * at the cost of doing more work in those cases. 381 * 382 * This works for all normalization modes. 383 * It is optimized for UTF-8 for all built-in modes except for FCD. 384 * The base class implementation converts to UTF-16 and calls isNormalized(). 385 * 386 * @param s UTF-8 input string 387 * @param errorCode Standard ICU error code. Its input value must 388 * pass the U_SUCCESS() test, or else the function returns 389 * immediately. Check for U_FAILURE() on output or use with 390 * function chaining. (See User Guide for details.) 391 * @return true if s is normalized 392 * @stable ICU 60 393 */ 394 virtual UBool 395 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; 396 397 398 /** 399 * Tests if the string is normalized. 400 * For the two COMPOSE modes, the result could be "maybe" in cases that 401 * would take a little more work to resolve definitively. 402 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 403 * combination of quick check + normalization, to avoid 404 * re-checking the "yes" prefix. 405 * @param s input string 406 * @param errorCode Standard ICU error code. Its input value must 407 * pass the U_SUCCESS() test, or else the function returns 408 * immediately. Check for U_FAILURE() on output or use with 409 * function chaining. (See User Guide for details.) 410 * @return UNormalizationCheckResult 411 * @stable ICU 4.4 412 */ 413 virtual UNormalizationCheckResult 414 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 415 416 /** 417 * Returns the end of the normalized substring of the input string. 418 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 419 * the substring <code>UnicodeString(s, 0, end)</code> 420 * will pass the quick check with a "yes" result. 421 * 422 * The returned end index is usually one or more characters before the 423 * "no" or "maybe" character: The end index is at a normalization boundary. 424 * (See the class documentation for more about normalization boundaries.) 425 * 426 * When the goal is a normalized string and most input strings are expected 427 * to be normalized already, then call this method, 428 * and if it returns a prefix shorter than the input string, 429 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 430 * @param s input string 431 * @param errorCode Standard ICU error code. Its input value must 432 * pass the U_SUCCESS() test, or else the function returns 433 * immediately. Check for U_FAILURE() on output or use with 434 * function chaining. (See User Guide for details.) 435 * @return "yes" span end index 436 * @stable ICU 4.4 437 */ 438 virtual int32_t 439 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 440 441 /** 442 * Tests if the character always has a normalization boundary before it, 443 * regardless of context. 444 * If true, then the character does not normalization-interact with 445 * preceding characters. 446 * In other words, a string containing this character can be normalized 447 * by processing portions before this character and starting from this 448 * character independently. 449 * This is used for iterative normalization. See the class documentation for details. 450 * @param c character to test 451 * @return true if c has a normalization boundary before it 452 * @stable ICU 4.4 453 */ 454 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 455 456 /** 457 * Tests if the character always has a normalization boundary after it, 458 * regardless of context. 459 * If true, then the character does not normalization-interact with 460 * following characters. 461 * In other words, a string containing this character can be normalized 462 * by processing portions up to this character and after this 463 * character independently. 464 * This is used for iterative normalization. See the class documentation for details. 465 * Note that this operation may be significantly slower than hasBoundaryBefore(). 466 * @param c character to test 467 * @return true if c has a normalization boundary after it 468 * @stable ICU 4.4 469 */ 470 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 471 472 /** 473 * Tests if the character is normalization-inert. 474 * If true, then the character does not change, nor normalization-interact with 475 * preceding or following characters. 476 * In other words, a string containing this character can be normalized 477 * by processing portions before this character and after this 478 * character independently. 479 * This is used for iterative normalization. See the class documentation for details. 480 * Note that this operation may be significantly slower than hasBoundaryBefore(). 481 * @param c character to test 482 * @return true if c is normalization-inert 483 * @stable ICU 4.4 484 */ 485 virtual UBool isInert(UChar32 c) const = 0; 486 }; 487 488 /** 489 * Normalization filtered by a UnicodeSet. 490 * Normalizes portions of the text contained in the filter set and leaves 491 * portions not contained in the filter set unchanged. 492 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 493 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 494 * This class implements all of (and only) the Normalizer2 API. 495 * An instance of this class is unmodifiable/immutable but is constructed and 496 * must be destructed by the owner. 497 * @stable ICU 4.4 498 */ 499 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 500 public: 501 /** 502 * Constructs a filtered normalizer wrapping any Normalizer2 instance 503 * and a filter set. 504 * Both are aliased and must not be modified or deleted while this object 505 * is used. 506 * The filter set should be frozen; otherwise the performance will suffer greatly. 507 * @param n2 wrapped Normalizer2 instance 508 * @param filterSet UnicodeSet which determines the characters to be normalized 509 * @stable ICU 4.4 510 */ FilteredNormalizer2(const Normalizer2 & n2,const UnicodeSet & filterSet)511 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 512 norm2(n2), set(filterSet) {} 513 514 /** 515 * Destructor. 516 * @stable ICU 4.4 517 */ 518 ~FilteredNormalizer2(); 519 520 /** 521 * Writes the normalized form of the source string to the destination string 522 * (replacing its contents) and returns the destination string. 523 * The source and destination strings must be different objects. 524 * @param src source string 525 * @param dest destination string; its contents is replaced with normalized src 526 * @param errorCode Standard ICU error code. Its input value must 527 * pass the U_SUCCESS() test, or else the function returns 528 * immediately. Check for U_FAILURE() on output or use with 529 * function chaining. (See User Guide for details.) 530 * @return dest 531 * @stable ICU 4.4 532 */ 533 virtual UnicodeString & 534 normalize(const UnicodeString &src, 535 UnicodeString &dest, 536 UErrorCode &errorCode) const U_OVERRIDE; 537 538 /** 539 * Normalizes a UTF-8 string and optionally records how source substrings 540 * relate to changed and unchanged result substrings. 541 * 542 * Implemented completely for most built-in modes except for FCD. 543 * The base class implementation converts to & from UTF-16 and does not support edits. 544 * 545 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 546 * @param src Source UTF-8 string. 547 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 548 * sink.Flush() is called at the end. 549 * @param edits Records edits for index mapping, working with styled text, 550 * and getting only changes (if any). 551 * The Edits contents is undefined if any error occurs. 552 * This function calls edits->reset() first unless 553 * options includes U_EDITS_NO_RESET. edits can be nullptr. 554 * @param errorCode Standard ICU error code. Its input value must 555 * pass the U_SUCCESS() test, or else the function returns 556 * immediately. Check for U_FAILURE() on output or use with 557 * function chaining. (See User Guide for details.) 558 * @stable ICU 60 559 */ 560 virtual void 561 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 562 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE; 563 564 /** 565 * Appends the normalized form of the second string to the first string 566 * (merging them at the boundary) and returns the first string. 567 * The result is normalized if the first string was normalized. 568 * The first and second strings must be different objects. 569 * @param first string, should be normalized 570 * @param second string, will be normalized 571 * @param errorCode Standard ICU error code. Its input value must 572 * pass the U_SUCCESS() test, or else the function returns 573 * immediately. Check for U_FAILURE() on output or use with 574 * function chaining. (See User Guide for details.) 575 * @return first 576 * @stable ICU 4.4 577 */ 578 virtual UnicodeString & 579 normalizeSecondAndAppend(UnicodeString &first, 580 const UnicodeString &second, 581 UErrorCode &errorCode) const U_OVERRIDE; 582 /** 583 * Appends the second string to the first string 584 * (merging them at the boundary) and returns the first string. 585 * The result is normalized if both the strings were normalized. 586 * The first and second strings must be different objects. 587 * @param first string, should be normalized 588 * @param second string, should be normalized 589 * @param errorCode Standard ICU error code. Its input value must 590 * pass the U_SUCCESS() test, or else the function returns 591 * immediately. Check for U_FAILURE() on output or use with 592 * function chaining. (See User Guide for details.) 593 * @return first 594 * @stable ICU 4.4 595 */ 596 virtual UnicodeString & 597 append(UnicodeString &first, 598 const UnicodeString &second, 599 UErrorCode &errorCode) const U_OVERRIDE; 600 601 /** 602 * Gets the decomposition mapping of c. 603 * For details see the base class documentation. 604 * 605 * This function is independent of the mode of the Normalizer2. 606 * @param c code point 607 * @param decomposition String object which will be set to c's 608 * decomposition mapping, if there is one. 609 * @return true if c has a decomposition, otherwise false 610 * @stable ICU 4.6 611 */ 612 virtual UBool 613 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; 614 615 /** 616 * Gets the raw decomposition mapping of c. 617 * For details see the base class documentation. 618 * 619 * This function is independent of the mode of the Normalizer2. 620 * @param c code point 621 * @param decomposition String object which will be set to c's 622 * raw decomposition mapping, if there is one. 623 * @return true if c has a decomposition, otherwise false 624 * @stable ICU 49 625 */ 626 virtual UBool 627 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; 628 629 /** 630 * Performs pairwise composition of a & b and returns the composite if there is one. 631 * For details see the base class documentation. 632 * 633 * This function is independent of the mode of the Normalizer2. 634 * @param a A (normalization starter) code point. 635 * @param b Another code point. 636 * @return The non-negative composite code point if there is one; otherwise a negative value. 637 * @stable ICU 49 638 */ 639 virtual UChar32 640 composePair(UChar32 a, UChar32 b) const U_OVERRIDE; 641 642 /** 643 * Gets the combining class of c. 644 * The default implementation returns 0 645 * but all standard implementations return the Unicode Canonical_Combining_Class value. 646 * @param c code point 647 * @return c's combining class 648 * @stable ICU 49 649 */ 650 virtual uint8_t 651 getCombiningClass(UChar32 c) const U_OVERRIDE; 652 653 /** 654 * Tests if the string is normalized. 655 * For details see the Normalizer2 base class documentation. 656 * @param s input string 657 * @param errorCode Standard ICU error code. Its input value must 658 * pass the U_SUCCESS() test, or else the function returns 659 * immediately. Check for U_FAILURE() on output or use with 660 * function chaining. (See User Guide for details.) 661 * @return true if s is normalized 662 * @stable ICU 4.4 663 */ 664 virtual UBool 665 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 666 /** 667 * Tests if the UTF-8 string is normalized. 668 * Internally, in cases where the quickCheck() method would return "maybe" 669 * (which is only possible for the two COMPOSE modes) this method 670 * resolves to "yes" or "no" to provide a definitive result, 671 * at the cost of doing more work in those cases. 672 * 673 * This works for all normalization modes. 674 * It is optimized for UTF-8 for all built-in modes except for FCD. 675 * The base class implementation converts to UTF-16 and calls isNormalized(). 676 * 677 * @param s UTF-8 input string 678 * @param errorCode Standard ICU error code. Its input value must 679 * pass the U_SUCCESS() test, or else the function returns 680 * immediately. Check for U_FAILURE() on output or use with 681 * function chaining. (See User Guide for details.) 682 * @return true if s is normalized 683 * @stable ICU 60 684 */ 685 virtual UBool 686 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE; 687 /** 688 * Tests if the string is normalized. 689 * For details see the Normalizer2 base class documentation. 690 * @param s input string 691 * @param errorCode Standard ICU error code. Its input value must 692 * pass the U_SUCCESS() test, or else the function returns 693 * immediately. Check for U_FAILURE() on output or use with 694 * function chaining. (See User Guide for details.) 695 * @return UNormalizationCheckResult 696 * @stable ICU 4.4 697 */ 698 virtual UNormalizationCheckResult 699 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 700 /** 701 * Returns the end of the normalized substring of the input string. 702 * For details see the Normalizer2 base class documentation. 703 * @param s input string 704 * @param errorCode Standard ICU error code. Its input value must 705 * pass the U_SUCCESS() test, or else the function returns 706 * immediately. Check for U_FAILURE() on output or use with 707 * function chaining. (See User Guide for details.) 708 * @return "yes" span end index 709 * @stable ICU 4.4 710 */ 711 virtual int32_t 712 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; 713 714 /** 715 * Tests if the character always has a normalization boundary before it, 716 * regardless of context. 717 * For details see the Normalizer2 base class documentation. 718 * @param c character to test 719 * @return true if c has a normalization boundary before it 720 * @stable ICU 4.4 721 */ 722 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE; 723 724 /** 725 * Tests if the character always has a normalization boundary after it, 726 * regardless of context. 727 * For details see the Normalizer2 base class documentation. 728 * @param c character to test 729 * @return true if c has a normalization boundary after it 730 * @stable ICU 4.4 731 */ 732 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE; 733 734 /** 735 * Tests if the character is normalization-inert. 736 * For details see the Normalizer2 base class documentation. 737 * @param c character to test 738 * @return true if c is normalization-inert 739 * @stable ICU 4.4 740 */ 741 virtual UBool isInert(UChar32 c) const U_OVERRIDE; 742 private: 743 UnicodeString & 744 normalize(const UnicodeString &src, 745 UnicodeString &dest, 746 USetSpanCondition spanCondition, 747 UErrorCode &errorCode) const; 748 749 void 750 normalizeUTF8(uint32_t options, const char *src, int32_t length, 751 ByteSink &sink, Edits *edits, 752 USetSpanCondition spanCondition, 753 UErrorCode &errorCode) const; 754 755 UnicodeString & 756 normalizeSecondAndAppend(UnicodeString &first, 757 const UnicodeString &second, 758 UBool doNormalize, 759 UErrorCode &errorCode) const; 760 761 const Normalizer2 &norm2; 762 const UnicodeSet &set; 763 }; 764 765 U_NAMESPACE_END 766 767 #endif // !UCONFIG_NO_NORMALIZATION 768 769 #endif /* U_SHOW_CPLUSPLUS_API */ 770 771 #endif // __NORMALIZER2_H__ 772