1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: normalizer2.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov22 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __NORMALIZER2_H__ 18 #define __NORMALIZER2_H__ 19 20 /** 21 * \file 22 * \brief C++ API: New API for Unicode Normalization. 23 */ 24 25 #include "unicode/utypes.h" 26 27 #if !UCONFIG_NO_NORMALIZATION 28 29 #include "unicode/uniset.h" 30 #include "unicode/unistr.h" 31 #include "unicode/unorm2.h" 32 33 U_NAMESPACE_BEGIN 34 35 /** 36 * Unicode normalization functionality for standard Unicode normalization or 37 * for using custom mapping tables. 38 * All instances of this class are unmodifiable/immutable. 39 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 40 * The Normalizer2 class is not intended for public subclassing. 41 * 42 * The primary functions are to produce a normalized string and to detect whether 43 * a string is already normalized. 44 * The most commonly used normalization forms are those defined in 45 * http://www.unicode.org/unicode/reports/tr15/ 46 * However, this API supports additional normalization forms for specialized purposes. 47 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 48 * and can be used in implementations of UTS #46. 49 * 50 * Not only are the standard compose and decompose modes supplied, 51 * but additional modes are provided as documented in the Mode enum. 52 * 53 * Some of the functions in this class identify normalization boundaries. 54 * At a normalization boundary, the portions of the string 55 * before it and starting from it do not interact and can be handled independently. 56 * 57 * The spanQuickCheckYes() stops at a normalization boundary. 58 * When the goal is a normalized string, then the text before the boundary 59 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 60 * 61 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 62 * a character is guaranteed to be at a normalization boundary, 63 * regardless of context. 64 * This is used for moving from one normalization boundary to the next 65 * or preceding boundary, and for performing iterative normalization. 66 * 67 * Iterative normalization is useful when only a small portion of a 68 * longer string needs to be processed. 69 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 70 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 71 * (to process only the substring for which sort key bytes are computed). 72 * 73 * The set of normalization boundaries returned by these functions may not be 74 * complete: There may be more boundaries that could be returned. 75 * Different functions may return different boundaries. 76 * @stable ICU 4.4 77 */ 78 class U_COMMON_API Normalizer2 : public UObject { 79 public: 80 /** 81 * Returns a Normalizer2 instance which uses the specified data file 82 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 83 * and which composes or decomposes text according to the specified mode. 84 * Returns an unmodifiable singleton instance. Do not delete it. 85 * 86 * Use packageName=NULL for data files that are part of ICU's own data. 87 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 88 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 89 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 90 * 91 * @param packageName NULL for ICU built-in data, otherwise application data package name 92 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file 93 * @param mode normalization mode (compose or decompose etc.) 94 * @param errorCode Standard ICU error code. Its input value must 95 * pass the U_SUCCESS() test, or else the function returns 96 * immediately. Check for U_FAILURE() on output or use with 97 * function chaining. (See User Guide for details.) 98 * @return the requested Normalizer2, if successful 99 * @stable ICU 4.4 100 */ 101 static const Normalizer2 * 102 getInstance(const char *packageName, 103 const char *name, 104 UNormalization2Mode mode, 105 UErrorCode &errorCode); 106 107 /** 108 * Returns the normalized form of the source string. 109 * @param src source string 110 * @param errorCode Standard ICU error code. Its input value must 111 * pass the U_SUCCESS() test, or else the function returns 112 * immediately. Check for U_FAILURE() on output or use with 113 * function chaining. (See User Guide for details.) 114 * @return normalized src 115 * @stable ICU 4.4 116 */ 117 UnicodeString normalize(const UnicodeString & src,UErrorCode & errorCode)118 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 119 UnicodeString result; 120 normalize(src, result, errorCode); 121 return result; 122 } 123 /** 124 * Writes the normalized form of the source string to the destination string 125 * (replacing its contents) and returns the destination string. 126 * The source and destination strings must be different objects. 127 * @param src source string 128 * @param dest destination string; its contents is replaced with normalized src 129 * @param errorCode Standard ICU error code. Its input value must 130 * pass the U_SUCCESS() test, or else the function returns 131 * immediately. Check for U_FAILURE() on output or use with 132 * function chaining. (See User Guide for details.) 133 * @return dest 134 * @stable ICU 4.4 135 */ 136 virtual UnicodeString & 137 normalize(const UnicodeString &src, 138 UnicodeString &dest, 139 UErrorCode &errorCode) const = 0; 140 /** 141 * Appends the normalized form of the second string to the first string 142 * (merging them at the boundary) and returns the first string. 143 * The result is normalized if the first string was normalized. 144 * The first and second strings must be different objects. 145 * @param first string, should be normalized 146 * @param second string, will be normalized 147 * @param errorCode Standard ICU error code. Its input value must 148 * pass the U_SUCCESS() test, or else the function returns 149 * immediately. Check for U_FAILURE() on output or use with 150 * function chaining. (See User Guide for details.) 151 * @return first 152 * @stable ICU 4.4 153 */ 154 virtual UnicodeString & 155 normalizeSecondAndAppend(UnicodeString &first, 156 const UnicodeString &second, 157 UErrorCode &errorCode) const = 0; 158 /** 159 * Appends the second string to the first string 160 * (merging them at the boundary) and returns the first string. 161 * The result is normalized if both the strings were normalized. 162 * The first and second strings must be different objects. 163 * @param first string, should be normalized 164 * @param second string, should be normalized 165 * @param errorCode Standard ICU error code. Its input value must 166 * pass the U_SUCCESS() test, or else the function returns 167 * immediately. Check for U_FAILURE() on output or use with 168 * function chaining. (See User Guide for details.) 169 * @return first 170 * @stable ICU 4.4 171 */ 172 virtual UnicodeString & 173 append(UnicodeString &first, 174 const UnicodeString &second, 175 UErrorCode &errorCode) const = 0; 176 177 /** 178 * Gets the decomposition mapping of c. 179 * Roughly equivalent to normalizing the String form of c 180 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 181 * returns FALSE and does not write a string 182 * if c does not have a decomposition mapping in this instance's data. 183 * This function is independent of the mode of the Normalizer2. 184 * @param c code point 185 * @param decomposition String object which will be set to c's 186 * decomposition mapping, if there is one. 187 * @return TRUE if c has a decomposition, otherwise FALSE 188 * @draft ICU 4.6 189 */ 190 virtual UBool 191 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 192 193 /** 194 * Tests if the string is normalized. 195 * Internally, in cases where the quickCheck() method would return "maybe" 196 * (which is only possible for the two COMPOSE modes) this method 197 * resolves to "yes" or "no" to provide a definitive result, 198 * at the cost of doing more work in those cases. 199 * @param s input string 200 * @param errorCode Standard ICU error code. Its input value must 201 * pass the U_SUCCESS() test, or else the function returns 202 * immediately. Check for U_FAILURE() on output or use with 203 * function chaining. (See User Guide for details.) 204 * @return TRUE if s is normalized 205 * @stable ICU 4.4 206 */ 207 virtual UBool 208 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 209 210 /** 211 * Tests if the string is normalized. 212 * For the two COMPOSE modes, the result could be "maybe" in cases that 213 * would take a little more work to resolve definitively. 214 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 215 * combination of quick check + normalization, to avoid 216 * re-checking the "yes" prefix. 217 * @param s input string 218 * @param errorCode Standard ICU error code. Its input value must 219 * pass the U_SUCCESS() test, or else the function returns 220 * immediately. Check for U_FAILURE() on output or use with 221 * function chaining. (See User Guide for details.) 222 * @return UNormalizationCheckResult 223 * @stable ICU 4.4 224 */ 225 virtual UNormalizationCheckResult 226 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 227 228 /** 229 * Returns the end of the normalized substring of the input string. 230 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 231 * the substring <code>UnicodeString(s, 0, end)</code> 232 * will pass the quick check with a "yes" result. 233 * 234 * The returned end index is usually one or more characters before the 235 * "no" or "maybe" character: The end index is at a normalization boundary. 236 * (See the class documentation for more about normalization boundaries.) 237 * 238 * When the goal is a normalized string and most input strings are expected 239 * to be normalized already, then call this method, 240 * and if it returns a prefix shorter than the input string, 241 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 242 * @param s input string 243 * @param errorCode Standard ICU error code. Its input value must 244 * pass the U_SUCCESS() test, or else the function returns 245 * immediately. Check for U_FAILURE() on output or use with 246 * function chaining. (See User Guide for details.) 247 * @return "yes" span end index 248 * @stable ICU 4.4 249 */ 250 virtual int32_t 251 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 252 253 /** 254 * Tests if the character always has a normalization boundary before it, 255 * regardless of context. 256 * If true, then the character does not normalization-interact with 257 * preceding characters. 258 * In other words, a string containing this character can be normalized 259 * by processing portions before this character and starting from this 260 * character independently. 261 * This is used for iterative normalization. See the class documentation for details. 262 * @param c character to test 263 * @return TRUE if c has a normalization boundary before it 264 * @stable ICU 4.4 265 */ 266 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 267 268 /** 269 * Tests if the character always has a normalization boundary after it, 270 * regardless of context. 271 * If true, then the character does not normalization-interact with 272 * following characters. 273 * In other words, a string containing this character can be normalized 274 * by processing portions up to this character and after this 275 * character independently. 276 * This is used for iterative normalization. See the class documentation for details. 277 * Note that this operation may be significantly slower than hasBoundaryBefore(). 278 * @param c character to test 279 * @return TRUE if c has a normalization boundary after it 280 * @stable ICU 4.4 281 */ 282 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 283 284 /** 285 * Tests if the character is normalization-inert. 286 * If true, then the character does not change, nor normalization-interact with 287 * preceding or following characters. 288 * In other words, a string containing this character can be normalized 289 * by processing portions before this character and after this 290 * character independently. 291 * This is used for iterative normalization. See the class documentation for details. 292 * Note that this operation may be significantly slower than hasBoundaryBefore(). 293 * @param c character to test 294 * @return TRUE if c is normalization-inert 295 * @stable ICU 4.4 296 */ 297 virtual UBool isInert(UChar32 c) const = 0; 298 299 private: 300 // No ICU "poor man's RTTI" for this class nor its subclasses. 301 virtual UClassID getDynamicClassID() const; 302 }; 303 304 /** 305 * Normalization filtered by a UnicodeSet. 306 * Normalizes portions of the text contained in the filter set and leaves 307 * portions not contained in the filter set unchanged. 308 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 309 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 310 * This class implements all of (and only) the Normalizer2 API. 311 * An instance of this class is unmodifiable/immutable but is constructed and 312 * must be destructed by the owner. 313 * @stable ICU 4.4 314 */ 315 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 316 public: 317 /** 318 * Constructs a filtered normalizer wrapping any Normalizer2 instance 319 * and a filter set. 320 * Both are aliased and must not be modified or deleted while this object 321 * is used. 322 * The filter set should be frozen; otherwise the performance will suffer greatly. 323 * @param n2 wrapped Normalizer2 instance 324 * @param filterSet UnicodeSet which determines the characters to be normalized 325 * @stable ICU 4.4 326 */ FilteredNormalizer2(const Normalizer2 & n2,const UnicodeSet & filterSet)327 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 328 norm2(n2), set(filterSet) {} 329 330 /** 331 * Writes the normalized form of the source string to the destination string 332 * (replacing its contents) and returns the destination string. 333 * The source and destination strings must be different objects. 334 * @param src source string 335 * @param dest destination string; its contents is replaced with normalized src 336 * @param errorCode Standard ICU error code. Its input value must 337 * pass the U_SUCCESS() test, or else the function returns 338 * immediately. Check for U_FAILURE() on output or use with 339 * function chaining. (See User Guide for details.) 340 * @return dest 341 * @stable ICU 4.4 342 */ 343 virtual UnicodeString & 344 normalize(const UnicodeString &src, 345 UnicodeString &dest, 346 UErrorCode &errorCode) const; 347 /** 348 * Appends the normalized form of the second string to the first string 349 * (merging them at the boundary) and returns the first string. 350 * The result is normalized if the first string was normalized. 351 * The first and second strings must be different objects. 352 * @param first string, should be normalized 353 * @param second string, will be normalized 354 * @param errorCode Standard ICU error code. Its input value must 355 * pass the U_SUCCESS() test, or else the function returns 356 * immediately. Check for U_FAILURE() on output or use with 357 * function chaining. (See User Guide for details.) 358 * @return first 359 * @stable ICU 4.4 360 */ 361 virtual UnicodeString & 362 normalizeSecondAndAppend(UnicodeString &first, 363 const UnicodeString &second, 364 UErrorCode &errorCode) const; 365 /** 366 * Appends the second string to the first string 367 * (merging them at the boundary) and returns the first string. 368 * The result is normalized if both the strings were normalized. 369 * The first and second strings must be different objects. 370 * @param first string, should be normalized 371 * @param second string, should be normalized 372 * @param errorCode Standard ICU error code. Its input value must 373 * pass the U_SUCCESS() test, or else the function returns 374 * immediately. Check for U_FAILURE() on output or use with 375 * function chaining. (See User Guide for details.) 376 * @return first 377 * @stable ICU 4.4 378 */ 379 virtual UnicodeString & 380 append(UnicodeString &first, 381 const UnicodeString &second, 382 UErrorCode &errorCode) const; 383 384 /** 385 * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c)) 386 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. 387 * This function is independent of the mode of the Normalizer2. 388 * @param c code point 389 * @param decomposition String object which will be set to c's 390 * decomposition mapping, if there is one. 391 * @return TRUE if c has a decomposition, otherwise FALSE 392 * @draft ICU 4.6 393 */ 394 virtual UBool 395 getDecomposition(UChar32 c, UnicodeString &decomposition) const; 396 397 /** 398 * Tests if the string is normalized. 399 * For details see the Normalizer2 base class documentation. 400 * @param s input string 401 * @param errorCode Standard ICU error code. Its input value must 402 * pass the U_SUCCESS() test, or else the function returns 403 * immediately. Check for U_FAILURE() on output or use with 404 * function chaining. (See User Guide for details.) 405 * @return TRUE if s is normalized 406 * @stable ICU 4.4 407 */ 408 virtual UBool 409 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const; 410 /** 411 * Tests if the string is normalized. 412 * For details see the Normalizer2 base class documentation. 413 * @param s input string 414 * @param errorCode Standard ICU error code. Its input value must 415 * pass the U_SUCCESS() test, or else the function returns 416 * immediately. Check for U_FAILURE() on output or use with 417 * function chaining. (See User Guide for details.) 418 * @return UNormalizationCheckResult 419 * @stable ICU 4.4 420 */ 421 virtual UNormalizationCheckResult 422 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const; 423 /** 424 * Returns the end of the normalized substring of the input string. 425 * For details see the Normalizer2 base class documentation. 426 * @param s input string 427 * @param errorCode Standard ICU error code. Its input value must 428 * pass the U_SUCCESS() test, or else the function returns 429 * immediately. Check for U_FAILURE() on output or use with 430 * function chaining. (See User Guide for details.) 431 * @return "yes" span end index 432 * @stable ICU 4.4 433 */ 434 virtual int32_t 435 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const; 436 437 /** 438 * Tests if the character always has a normalization boundary before it, 439 * regardless of context. 440 * For details see the Normalizer2 base class documentation. 441 * @param c character to test 442 * @return TRUE if c has a normalization boundary before it 443 * @stable ICU 4.4 444 */ 445 virtual UBool hasBoundaryBefore(UChar32 c) const; 446 447 /** 448 * Tests if the character always has a normalization boundary after it, 449 * regardless of context. 450 * For details see the Normalizer2 base class documentation. 451 * @param c character to test 452 * @return TRUE if c has a normalization boundary after it 453 * @stable ICU 4.4 454 */ 455 virtual UBool hasBoundaryAfter(UChar32 c) const; 456 457 /** 458 * Tests if the character is normalization-inert. 459 * For details see the Normalizer2 base class documentation. 460 * @param c character to test 461 * @return TRUE if c is normalization-inert 462 * @stable ICU 4.4 463 */ 464 virtual UBool isInert(UChar32 c) const; 465 private: 466 UnicodeString & 467 normalize(const UnicodeString &src, 468 UnicodeString &dest, 469 USetSpanCondition spanCondition, 470 UErrorCode &errorCode) const; 471 472 UnicodeString & 473 normalizeSecondAndAppend(UnicodeString &first, 474 const UnicodeString &second, 475 UBool doNormalize, 476 UErrorCode &errorCode) const; 477 478 const Normalizer2 &norm2; 479 const UnicodeSet &set; 480 }; 481 482 U_NAMESPACE_END 483 484 #endif // !UCONFIG_NO_NORMALIZATION 485 #endif // __NORMALIZER2_H__ 486