1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: unorm2.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009dec15 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __UNORM2_H__ 18 #define __UNORM2_H__ 19 20 /** 21 * \file 22 * \brief C API: New API for Unicode Normalization. 23 * 24 * Unicode normalization functionality for standard Unicode normalization or 25 * for using custom mapping tables. 26 * All instances of UNormalizer2 are unmodifiable/immutable. 27 * Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller. 28 * For more details see the Normalizer2 C++ class. 29 */ 30 31 #include "unicode/utypes.h" 32 #include "unicode/localpointer.h" 33 #include "unicode/uset.h" 34 35 /** 36 * Constants for normalization modes. 37 * For details about standard Unicode normalization forms 38 * and about the algorithms which are also used with custom mapping tables 39 * see http://www.unicode.org/unicode/reports/tr15/ 40 * @stable ICU 4.4 41 */ 42 typedef enum { 43 /** 44 * Decomposition followed by composition. 45 * Same as standard NFC when using an "nfc" instance. 46 * Same as standard NFKC when using an "nfkc" instance. 47 * For details about standard Unicode normalization forms 48 * see http://www.unicode.org/unicode/reports/tr15/ 49 * @stable ICU 4.4 50 */ 51 UNORM2_COMPOSE, 52 /** 53 * Map, and reorder canonically. 54 * Same as standard NFD when using an "nfc" instance. 55 * Same as standard NFKD when using an "nfkc" instance. 56 * For details about standard Unicode normalization forms 57 * see http://www.unicode.org/unicode/reports/tr15/ 58 * @stable ICU 4.4 59 */ 60 UNORM2_DECOMPOSE, 61 /** 62 * "Fast C or D" form. 63 * If a string is in this form, then further decomposition <i>without reordering</i> 64 * would yield the same form as DECOMPOSE. 65 * Text in "Fast C or D" form can be processed efficiently with data tables 66 * that are "canonically closed", that is, that provide equivalent data for 67 * equivalent text, without having to be fully normalized. 68 * Not a standard Unicode normalization form. 69 * Not a unique form: Different FCD strings can be canonically equivalent. 70 * For details see http://www.unicode.org/notes/tn5/#FCD 71 * @stable ICU 4.4 72 */ 73 UNORM2_FCD, 74 /** 75 * Compose only contiguously. 76 * Also known as "FCC" or "Fast C Contiguous". 77 * The result will often but not always be in NFC. 78 * The result will conform to FCD which is useful for processing. 79 * Not a standard Unicode normalization form. 80 * For details see http://www.unicode.org/notes/tn5/#FCC 81 * @stable ICU 4.4 82 */ 83 UNORM2_COMPOSE_CONTIGUOUS 84 } UNormalization2Mode; 85 86 /** 87 * Result values for normalization quick check functions. 88 * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms 89 * @stable ICU 2.0 90 */ 91 typedef enum UNormalizationCheckResult { 92 /** 93 * The input string is not in the normalization form. 94 * @stable ICU 2.0 95 */ 96 UNORM_NO, 97 /** 98 * The input string is in the normalization form. 99 * @stable ICU 2.0 100 */ 101 UNORM_YES, 102 /** 103 * The input string may or may not be in the normalization form. 104 * This value is only returned for composition forms like NFC and FCC, 105 * when a backward-combining character is found for which the surrounding text 106 * would have to be analyzed further. 107 * @stable ICU 2.0 108 */ 109 UNORM_MAYBE 110 } UNormalizationCheckResult; 111 112 /** 113 * Opaque C service object type for the new normalization API. 114 * @stable ICU 4.4 115 */ 116 struct UNormalizer2; 117 typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */ 118 119 #if !UCONFIG_NO_NORMALIZATION 120 121 #ifndef U_HIDE_DRAFT_API 122 /** 123 * Returns a UNormalizer2 instance for Unicode NFC normalization. 124 * Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode). 125 * Returns an unmodifiable singleton instance. Do not delete it. 126 * @param pErrorCode Standard ICU error code. Its input value must 127 * pass the U_SUCCESS() test, or else the function returns 128 * immediately. Check for U_FAILURE() on output or use with 129 * function chaining. (See User Guide for details.) 130 * @return the requested Normalizer2, if successful 131 * @draft ICU 49 132 */ 133 U_DRAFT const UNormalizer2 * U_EXPORT2 134 unorm2_getNFCInstance(UErrorCode *pErrorCode); 135 136 /** 137 * Returns a UNormalizer2 instance for Unicode NFD normalization. 138 * Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode). 139 * Returns an unmodifiable singleton instance. Do not delete it. 140 * @param pErrorCode Standard ICU error code. Its input value must 141 * pass the U_SUCCESS() test, or else the function returns 142 * immediately. Check for U_FAILURE() on output or use with 143 * function chaining. (See User Guide for details.) 144 * @return the requested Normalizer2, if successful 145 * @draft ICU 49 146 */ 147 U_DRAFT const UNormalizer2 * U_EXPORT2 148 unorm2_getNFDInstance(UErrorCode *pErrorCode); 149 150 /** 151 * Returns a UNormalizer2 instance for Unicode NFKC normalization. 152 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode). 153 * Returns an unmodifiable singleton instance. Do not delete it. 154 * @param pErrorCode Standard ICU error code. Its input value must 155 * pass the U_SUCCESS() test, or else the function returns 156 * immediately. Check for U_FAILURE() on output or use with 157 * function chaining. (See User Guide for details.) 158 * @return the requested Normalizer2, if successful 159 * @draft ICU 49 160 */ 161 U_DRAFT const UNormalizer2 * U_EXPORT2 162 unorm2_getNFKCInstance(UErrorCode *pErrorCode); 163 164 /** 165 * Returns a UNormalizer2 instance for Unicode NFKD normalization. 166 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode). 167 * Returns an unmodifiable singleton instance. Do not delete it. 168 * @param pErrorCode Standard ICU error code. Its input value must 169 * pass the U_SUCCESS() test, or else the function returns 170 * immediately. Check for U_FAILURE() on output or use with 171 * function chaining. (See User Guide for details.) 172 * @return the requested Normalizer2, if successful 173 * @draft ICU 49 174 */ 175 U_DRAFT const UNormalizer2 * U_EXPORT2 176 unorm2_getNFKDInstance(UErrorCode *pErrorCode); 177 178 /** 179 * Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization. 180 * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode). 181 * Returns an unmodifiable singleton instance. Do not delete it. 182 * @param pErrorCode Standard ICU error code. Its input value must 183 * pass the U_SUCCESS() test, or else the function returns 184 * immediately. Check for U_FAILURE() on output or use with 185 * function chaining. (See User Guide for details.) 186 * @return the requested Normalizer2, if successful 187 * @draft ICU 49 188 */ 189 U_DRAFT const UNormalizer2 * U_EXPORT2 190 unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode); 191 #endif /* U_HIDE_DRAFT_API */ 192 193 /** 194 * Returns a UNormalizer2 instance which uses the specified data file 195 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 196 * and which composes or decomposes text according to the specified mode. 197 * Returns an unmodifiable singleton instance. Do not delete it. 198 * 199 * Use packageName=NULL for data files that are part of ICU's own data. 200 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 201 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 202 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 203 * 204 * @param packageName NULL for ICU built-in data, otherwise application data package name 205 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file 206 * @param mode normalization mode (compose or decompose etc.) 207 * @param pErrorCode Standard ICU error code. Its input value must 208 * pass the U_SUCCESS() test, or else the function returns 209 * immediately. Check for U_FAILURE() on output or use with 210 * function chaining. (See User Guide for details.) 211 * @return the requested UNormalizer2, if successful 212 * @stable ICU 4.4 213 */ 214 U_STABLE const UNormalizer2 * U_EXPORT2 215 unorm2_getInstance(const char *packageName, 216 const char *name, 217 UNormalization2Mode mode, 218 UErrorCode *pErrorCode); 219 220 /** 221 * Constructs a filtered normalizer wrapping any UNormalizer2 instance 222 * and a filter set. 223 * Both are aliased and must not be modified or deleted while this object 224 * is used. 225 * The filter set should be frozen; otherwise the performance will suffer greatly. 226 * @param norm2 wrapped UNormalizer2 instance 227 * @param filterSet USet which determines the characters to be normalized 228 * @param pErrorCode Standard ICU error code. Its input value must 229 * pass the U_SUCCESS() test, or else the function returns 230 * immediately. Check for U_FAILURE() on output or use with 231 * function chaining. (See User Guide for details.) 232 * @return the requested UNormalizer2, if successful 233 * @stable ICU 4.4 234 */ 235 U_STABLE UNormalizer2 * U_EXPORT2 236 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode); 237 238 /** 239 * Closes a UNormalizer2 instance from unorm2_openFiltered(). 240 * Do not close instances from unorm2_getInstance()! 241 * @param norm2 UNormalizer2 instance to be closed 242 * @stable ICU 4.4 243 */ 244 U_STABLE void U_EXPORT2 245 unorm2_close(UNormalizer2 *norm2); 246 247 #if U_SHOW_CPLUSPLUS_API 248 249 U_NAMESPACE_BEGIN 250 251 /** 252 * \class LocalUNormalizer2Pointer 253 * "Smart pointer" class, closes a UNormalizer2 via unorm2_close(). 254 * For most methods see the LocalPointerBase base class. 255 * 256 * @see LocalPointerBase 257 * @see LocalPointer 258 * @stable ICU 4.4 259 */ 260 U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close); 261 262 U_NAMESPACE_END 263 264 #endif 265 266 /** 267 * Writes the normalized form of the source string to the destination string 268 * (replacing its contents) and returns the length of the destination string. 269 * The source and destination strings must be different buffers. 270 * @param norm2 UNormalizer2 instance 271 * @param src source string 272 * @param length length of the source string, or -1 if NUL-terminated 273 * @param dest destination string; its contents is replaced with normalized src 274 * @param capacity number of UChars that can be written to dest 275 * @param pErrorCode Standard ICU error code. Its input value must 276 * pass the U_SUCCESS() test, or else the function returns 277 * immediately. Check for U_FAILURE() on output or use with 278 * function chaining. (See User Guide for details.) 279 * @return dest 280 * @stable ICU 4.4 281 */ 282 U_STABLE int32_t U_EXPORT2 283 unorm2_normalize(const UNormalizer2 *norm2, 284 const UChar *src, int32_t length, 285 UChar *dest, int32_t capacity, 286 UErrorCode *pErrorCode); 287 /** 288 * Appends the normalized form of the second string to the first string 289 * (merging them at the boundary) and returns the length of the first string. 290 * The result is normalized if the first string was normalized. 291 * The first and second strings must be different buffers. 292 * @param norm2 UNormalizer2 instance 293 * @param first string, should be normalized 294 * @param firstLength length of the first string, or -1 if NUL-terminated 295 * @param firstCapacity number of UChars that can be written to first 296 * @param second string, will be normalized 297 * @param secondLength length of the source string, or -1 if NUL-terminated 298 * @param pErrorCode Standard ICU error code. Its input value must 299 * pass the U_SUCCESS() test, or else the function returns 300 * immediately. Check for U_FAILURE() on output or use with 301 * function chaining. (See User Guide for details.) 302 * @return first 303 * @stable ICU 4.4 304 */ 305 U_STABLE int32_t U_EXPORT2 306 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2, 307 UChar *first, int32_t firstLength, int32_t firstCapacity, 308 const UChar *second, int32_t secondLength, 309 UErrorCode *pErrorCode); 310 /** 311 * Appends the second string to the first string 312 * (merging them at the boundary) and returns the length of the first string. 313 * The result is normalized if both the strings were normalized. 314 * The first and second strings must be different buffers. 315 * @param norm2 UNormalizer2 instance 316 * @param first string, should be normalized 317 * @param firstLength length of the first string, or -1 if NUL-terminated 318 * @param firstCapacity number of UChars that can be written to first 319 * @param second string, should be normalized 320 * @param secondLength length of the source string, or -1 if NUL-terminated 321 * @param pErrorCode Standard ICU error code. Its input value must 322 * pass the U_SUCCESS() test, or else the function returns 323 * immediately. Check for U_FAILURE() on output or use with 324 * function chaining. (See User Guide for details.) 325 * @return first 326 * @stable ICU 4.4 327 */ 328 U_STABLE int32_t U_EXPORT2 329 unorm2_append(const UNormalizer2 *norm2, 330 UChar *first, int32_t firstLength, int32_t firstCapacity, 331 const UChar *second, int32_t secondLength, 332 UErrorCode *pErrorCode); 333 334 /** 335 * Gets the decomposition mapping of c. 336 * Roughly equivalent to normalizing the String form of c 337 * on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function 338 * returns a negative value and does not write a string 339 * if c does not have a decomposition mapping in this instance's data. 340 * This function is independent of the mode of the UNormalizer2. 341 * @param norm2 UNormalizer2 instance 342 * @param c code point 343 * @param decomposition String buffer which will be set to c's 344 * decomposition mapping, if there is one. 345 * @param capacity number of UChars that can be written to decomposition 346 * @param pErrorCode Standard ICU error code. Its input value must 347 * pass the U_SUCCESS() test, or else the function returns 348 * immediately. Check for U_FAILURE() on output or use with 349 * function chaining. (See User Guide for details.) 350 * @return the non-negative length of c's decomposition, if there is one; otherwise a negative value 351 * @stable ICU 4.6 352 */ 353 U_STABLE int32_t U_EXPORT2 354 unorm2_getDecomposition(const UNormalizer2 *norm2, 355 UChar32 c, UChar *decomposition, int32_t capacity, 356 UErrorCode *pErrorCode); 357 358 #ifndef U_HIDE_DRAFT_API 359 /** 360 * Gets the raw decomposition mapping of c. 361 * 362 * This is similar to the unorm2_getDecomposition() function but returns the 363 * raw decomposition mapping as specified in UnicodeData.txt or 364 * (for custom data) in the mapping files processed by the gennorm2 tool. 365 * By contrast, unorm2_getDecomposition() returns the processed, 366 * recursively-decomposed version of this mapping. 367 * 368 * When used on a standard NFKC Normalizer2 instance, 369 * unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 370 * 371 * When used on a standard NFC Normalizer2 instance, 372 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 373 * in this case, the result contains either one or two code points (=1..4 UChars). 374 * 375 * This function is independent of the mode of the UNormalizer2. 376 * @param norm2 UNormalizer2 instance 377 * @param c code point 378 * @param decomposition String buffer which will be set to c's 379 * raw decomposition mapping, if there is one. 380 * @param capacity number of UChars that can be written to decomposition 381 * @param pErrorCode Standard ICU error code. Its input value must 382 * pass the U_SUCCESS() test, or else the function returns 383 * immediately. Check for U_FAILURE() on output or use with 384 * function chaining. (See User Guide for details.) 385 * @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value 386 * @draft ICU 49 387 */ 388 U_DRAFT int32_t U_EXPORT2 389 unorm2_getRawDecomposition(const UNormalizer2 *norm2, 390 UChar32 c, UChar *decomposition, int32_t capacity, 391 UErrorCode *pErrorCode); 392 393 /** 394 * Performs pairwise composition of a & b and returns the composite if there is one. 395 * 396 * Returns a composite code point c only if c has a two-way mapping to a+b. 397 * In standard Unicode normalization, this means that 398 * c has a canonical decomposition to a+b 399 * and c does not have the Full_Composition_Exclusion property. 400 * 401 * This function is independent of the mode of the UNormalizer2. 402 * @param norm2 UNormalizer2 instance 403 * @param a A (normalization starter) code point. 404 * @param b Another code point. 405 * @return The non-negative composite code point if there is one; otherwise a negative value. 406 * @draft ICU 49 407 */ 408 U_DRAFT UChar32 U_EXPORT2 409 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b); 410 411 /** 412 * Gets the combining class of c. 413 * The default implementation returns 0 414 * but all standard implementations return the Unicode Canonical_Combining_Class value. 415 * @param norm2 UNormalizer2 instance 416 * @param c code point 417 * @return c's combining class 418 * @draft ICU 49 419 */ 420 U_DRAFT uint8_t U_EXPORT2 421 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c); 422 #endif /* U_HIDE_DRAFT_API */ 423 424 /** 425 * Tests if the string is normalized. 426 * Internally, in cases where the quickCheck() method would return "maybe" 427 * (which is only possible for the two COMPOSE modes) this method 428 * resolves to "yes" or "no" to provide a definitive result, 429 * at the cost of doing more work in those cases. 430 * @param norm2 UNormalizer2 instance 431 * @param s input string 432 * @param length length of the string, or -1 if NUL-terminated 433 * @param pErrorCode Standard ICU error code. Its input value must 434 * pass the U_SUCCESS() test, or else the function returns 435 * immediately. Check for U_FAILURE() on output or use with 436 * function chaining. (See User Guide for details.) 437 * @return TRUE if s is normalized 438 * @stable ICU 4.4 439 */ 440 U_STABLE UBool U_EXPORT2 441 unorm2_isNormalized(const UNormalizer2 *norm2, 442 const UChar *s, int32_t length, 443 UErrorCode *pErrorCode); 444 445 /** 446 * Tests if the string is normalized. 447 * For the two COMPOSE modes, the result could be "maybe" in cases that 448 * would take a little more work to resolve definitively. 449 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 450 * combination of quick check + normalization, to avoid 451 * re-checking the "yes" prefix. 452 * @param norm2 UNormalizer2 instance 453 * @param s input string 454 * @param length length of the string, or -1 if NUL-terminated 455 * @param pErrorCode Standard ICU error code. Its input value must 456 * pass the U_SUCCESS() test, or else the function returns 457 * immediately. Check for U_FAILURE() on output or use with 458 * function chaining. (See User Guide for details.) 459 * @return UNormalizationCheckResult 460 * @stable ICU 4.4 461 */ 462 U_STABLE UNormalizationCheckResult U_EXPORT2 463 unorm2_quickCheck(const UNormalizer2 *norm2, 464 const UChar *s, int32_t length, 465 UErrorCode *pErrorCode); 466 467 /** 468 * Returns the end of the normalized substring of the input string. 469 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 470 * the substring <code>UnicodeString(s, 0, end)</code> 471 * will pass the quick check with a "yes" result. 472 * 473 * The returned end index is usually one or more characters before the 474 * "no" or "maybe" character: The end index is at a normalization boundary. 475 * (See the class documentation for more about normalization boundaries.) 476 * 477 * When the goal is a normalized string and most input strings are expected 478 * to be normalized already, then call this method, 479 * and if it returns a prefix shorter than the input string, 480 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 481 * @param norm2 UNormalizer2 instance 482 * @param s input string 483 * @param length length of the string, or -1 if NUL-terminated 484 * @param pErrorCode Standard ICU error code. Its input value must 485 * pass the U_SUCCESS() test, or else the function returns 486 * immediately. Check for U_FAILURE() on output or use with 487 * function chaining. (See User Guide for details.) 488 * @return "yes" span end index 489 * @stable ICU 4.4 490 */ 491 U_STABLE int32_t U_EXPORT2 492 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2, 493 const UChar *s, int32_t length, 494 UErrorCode *pErrorCode); 495 496 /** 497 * Tests if the character always has a normalization boundary before it, 498 * regardless of context. 499 * For details see the Normalizer2 base class documentation. 500 * @param norm2 UNormalizer2 instance 501 * @param c character to test 502 * @return TRUE if c has a normalization boundary before it 503 * @stable ICU 4.4 504 */ 505 U_STABLE UBool U_EXPORT2 506 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c); 507 508 /** 509 * Tests if the character always has a normalization boundary after it, 510 * regardless of context. 511 * For details see the Normalizer2 base class documentation. 512 * @param norm2 UNormalizer2 instance 513 * @param c character to test 514 * @return TRUE if c has a normalization boundary after it 515 * @stable ICU 4.4 516 */ 517 U_STABLE UBool U_EXPORT2 518 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c); 519 520 /** 521 * Tests if the character is normalization-inert. 522 * For details see the Normalizer2 base class documentation. 523 * @param norm2 UNormalizer2 instance 524 * @param c character to test 525 * @return TRUE if c is normalization-inert 526 * @stable ICU 4.4 527 */ 528 U_STABLE UBool U_EXPORT2 529 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c); 530 531 #endif /* !UCONFIG_NO_NORMALIZATION */ 532 #endif /* __UNORM2_H__ */ 533