1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2005-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucasemap.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2005may06 14 * created by: Markus W. Scherer 15 * 16 * Case mapping service object and functions using it. 17 */ 18 19 #ifndef __UCASEMAP_H__ 20 #define __UCASEMAP_H__ 21 22 #include "unicode/utypes.h" 23 #include "unicode/ustring.h" 24 #include "unicode/localpointer.h" 25 26 /** 27 * \file 28 * \brief C API: Unicode case mapping functions using a UCaseMap service object. 29 * 30 * The service object takes care of memory allocations, data loading, and setup 31 * for the attributes, as usual. 32 * 33 * Currently, the functionality provided here does not overlap with uchar.h 34 * and ustring.h, except for ucasemap_toTitle(). 35 * 36 * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings. 37 */ 38 39 /** 40 * UCaseMap is an opaque service object for newer ICU case mapping functions. 41 * Older functions did not use a service object. 42 * @stable ICU 3.4 43 */ 44 struct UCaseMap; 45 typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */ 46 47 /** 48 * Open a UCaseMap service object for a locale and a set of options. 49 * The locale ID and options are preprocessed so that functions using the 50 * service object need not process them in each call. 51 * 52 * @param locale ICU locale ID, used for language-dependent 53 * upper-/lower-/title-casing according to the Unicode standard. 54 * Usual semantics: ""=root, NULL=default locale, etc. 55 * @param options Options bit set, used for case folding and string comparisons. 56 * Same flags as for u_foldCase(), u_strFoldCase(), 57 * u_strCaseCompare(), etc. 58 * Use 0 or U_FOLD_CASE_DEFAULT for default behavior. 59 * @param pErrorCode Must be a valid pointer to an error code value, 60 * which must not indicate a failure before the function call. 61 * @return Pointer to a UCaseMap service object, if successful. 62 * 63 * @see U_FOLD_CASE_DEFAULT 64 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I 65 * @see U_TITLECASE_NO_LOWERCASE 66 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT 67 * @stable ICU 3.4 68 */ 69 U_STABLE UCaseMap * U_EXPORT2 70 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode); 71 72 /** 73 * Close a UCaseMap service object. 74 * @param csm Object to be closed. 75 * @stable ICU 3.4 76 */ 77 U_STABLE void U_EXPORT2 78 ucasemap_close(UCaseMap *csm); 79 80 #if U_SHOW_CPLUSPLUS_API 81 82 U_NAMESPACE_BEGIN 83 84 /** 85 * \class LocalUCaseMapPointer 86 * "Smart pointer" class, closes a UCaseMap via ucasemap_close(). 87 * For most methods see the LocalPointerBase base class. 88 * 89 * @see LocalPointerBase 90 * @see LocalPointer 91 * @stable ICU 4.4 92 */ 93 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close); 94 95 U_NAMESPACE_END 96 97 #endif 98 99 /** 100 * Get the locale ID that is used for language-dependent case mappings. 101 * @param csm UCaseMap service object. 102 * @return locale ID 103 * @stable ICU 3.4 104 */ 105 U_STABLE const char * U_EXPORT2 106 ucasemap_getLocale(const UCaseMap *csm); 107 108 /** 109 * Get the options bit set that is used for case folding and string comparisons. 110 * @param csm UCaseMap service object. 111 * @return options bit set 112 * @stable ICU 3.4 113 */ 114 U_STABLE uint32_t U_EXPORT2 115 ucasemap_getOptions(const UCaseMap *csm); 116 117 /** 118 * Set the locale ID that is used for language-dependent case mappings. 119 * 120 * @param csm UCaseMap service object. 121 * @param locale Locale ID, see ucasemap_open(). 122 * @param pErrorCode Must be a valid pointer to an error code value, 123 * which must not indicate a failure before the function call. 124 * 125 * @see ucasemap_open 126 * @stable ICU 3.4 127 */ 128 U_STABLE void U_EXPORT2 129 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode); 130 131 /** 132 * Set the options bit set that is used for case folding and string comparisons. 133 * 134 * @param csm UCaseMap service object. 135 * @param options Options bit set, see ucasemap_open(). 136 * @param pErrorCode Must be a valid pointer to an error code value, 137 * which must not indicate a failure before the function call. 138 * 139 * @see ucasemap_open 140 * @stable ICU 3.4 141 */ 142 U_STABLE void U_EXPORT2 143 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode); 144 145 /** 146 * Do not lowercase non-initial parts of words when titlecasing. 147 * Option bit for titlecasing APIs that take an options bit set. 148 * 149 * By default, titlecasing will titlecase the first cased character 150 * of a word and lowercase all other characters. 151 * With this option, the other characters will not be modified. 152 * 153 * @see ucasemap_setOptions 154 * @see ucasemap_toTitle 155 * @see ucasemap_utf8ToTitle 156 * @see UnicodeString::toTitle 157 * @stable ICU 3.8 158 */ 159 #define U_TITLECASE_NO_LOWERCASE 0x100 160 161 /** 162 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes; 163 * titlecase exactly the characters at breaks from the iterator. 164 * Option bit for titlecasing APIs that take an options bit set. 165 * 166 * By default, titlecasing will take each break iterator index, 167 * adjust it by looking for the next cased character, and titlecase that one. 168 * Other characters are lowercased. 169 * 170 * This follows Unicode 4 & 5 section 3.13 Default Case Operations: 171 * 172 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex 173 * #29, "Text Boundaries." Between each pair of word boundaries, find the first 174 * cased character F. If F exists, map F to default_title(F); then map each 175 * subsequent character C to default_lower(C). 176 * 177 * @see ucasemap_setOptions 178 * @see ucasemap_toTitle 179 * @see ucasemap_utf8ToTitle 180 * @see UnicodeString::toTitle 181 * @see U_TITLECASE_NO_LOWERCASE 182 * @stable ICU 3.8 183 */ 184 #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 185 186 #if !UCONFIG_NO_BREAK_ITERATION 187 188 /** 189 * Get the break iterator that is used for titlecasing. 190 * Do not modify the returned break iterator. 191 * @param csm UCaseMap service object. 192 * @return titlecasing break iterator 193 * @stable ICU 3.8 194 */ 195 U_STABLE const UBreakIterator * U_EXPORT2 196 ucasemap_getBreakIterator(const UCaseMap *csm); 197 198 /** 199 * Set the break iterator that is used for titlecasing. 200 * The UCaseMap service object releases a previously set break iterator 201 * and "adopts" this new one, taking ownership of it. 202 * It will be released in a subsequent call to ucasemap_setBreakIterator() 203 * or ucasemap_close(). 204 * 205 * Break iterator operations are not thread-safe. Therefore, titlecasing 206 * functions use non-const UCaseMap objects. It is not possible to titlecase 207 * strings concurrently using the same UCaseMap. 208 * 209 * @param csm UCaseMap service object. 210 * @param iterToAdopt Break iterator to be adopted for titlecasing. 211 * @param pErrorCode Must be a valid pointer to an error code value, 212 * which must not indicate a failure before the function call. 213 * 214 * @see ucasemap_toTitle 215 * @see ucasemap_utf8ToTitle 216 * @stable ICU 3.8 217 */ 218 U_STABLE void U_EXPORT2 219 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode); 220 221 /** 222 * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(), 223 * except that it takes ucasemap_setOptions() into account and has performance 224 * advantages from being able to use a UCaseMap object for multiple case mapping 225 * operations, saving setup time. 226 * 227 * Casing is locale-dependent and context-sensitive. 228 * Titlecasing uses a break iterator to find the first characters of words 229 * that are to be titlecased. It titlecases those characters and lowercases 230 * all others. (This can be modified with ucasemap_setOptions().) 231 * 232 * Note: This function takes a non-const UCaseMap pointer because it will 233 * open a default break iterator if no break iterator was set yet, 234 * and effectively call ucasemap_setBreakIterator(); 235 * also because the break iterator is stateful and will be modified during 236 * the iteration. 237 * 238 * The titlecase break iterator can be provided to customize for arbitrary 239 * styles, using rules and dictionaries beyond the standard iterators. 240 * The standard titlecase iterator for the root locale implements the 241 * algorithm of Unicode TR 21. 242 * 243 * This function uses only the setUText(), first(), next() and close() methods of the 244 * provided break iterator. 245 * 246 * The result may be longer or shorter than the original. 247 * The source string and the destination buffer must not overlap. 248 * 249 * @param csm UCaseMap service object. This pointer is non-const! 250 * See the note above for details. 251 * @param dest A buffer for the result string. The result will be NUL-terminated if 252 * the buffer is large enough. 253 * The contents is undefined in case of failure. 254 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 255 * dest may be NULL and the function will only return the length of the result 256 * without writing any of the result string. 257 * @param src The original string. 258 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 259 * @param pErrorCode Must be a valid pointer to an error code value, 260 * which must not indicate a failure before the function call. 261 * @return The length of the result string, if successful - or in case of a buffer overflow, 262 * in which case it will be greater than destCapacity. 263 * 264 * @see u_strToTitle 265 * @stable ICU 3.8 266 */ 267 U_STABLE int32_t U_EXPORT2 268 ucasemap_toTitle(UCaseMap *csm, 269 UChar *dest, int32_t destCapacity, 270 const UChar *src, int32_t srcLength, 271 UErrorCode *pErrorCode); 272 273 #endif 274 275 /** 276 * Lowercase the characters in a UTF-8 string. 277 * Casing is locale-dependent and context-sensitive. 278 * The result may be longer or shorter than the original. 279 * The source string and the destination buffer must not overlap. 280 * 281 * @param csm UCaseMap service object. 282 * @param dest A buffer for the result string. The result will be NUL-terminated if 283 * the buffer is large enough. 284 * The contents is undefined in case of failure. 285 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 286 * dest may be NULL and the function will only return the length of the result 287 * without writing any of the result string. 288 * @param src The original string. 289 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 290 * @param pErrorCode Must be a valid pointer to an error code value, 291 * which must not indicate a failure before the function call. 292 * @return The length of the result string, if successful - or in case of a buffer overflow, 293 * in which case it will be greater than destCapacity. 294 * 295 * @see u_strToLower 296 * @stable ICU 3.4 297 */ 298 U_STABLE int32_t U_EXPORT2 299 ucasemap_utf8ToLower(const UCaseMap *csm, 300 char *dest, int32_t destCapacity, 301 const char *src, int32_t srcLength, 302 UErrorCode *pErrorCode); 303 304 /** 305 * Uppercase the characters in a UTF-8 string. 306 * Casing is locale-dependent and context-sensitive. 307 * The result may be longer or shorter than the original. 308 * The source string and the destination buffer must not overlap. 309 * 310 * @param csm UCaseMap service object. 311 * @param dest A buffer for the result string. The result will be NUL-terminated if 312 * the buffer is large enough. 313 * The contents is undefined in case of failure. 314 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 315 * dest may be NULL and the function will only return the length of the result 316 * without writing any of the result string. 317 * @param src The original string. 318 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 319 * @param pErrorCode Must be a valid pointer to an error code value, 320 * which must not indicate a failure before the function call. 321 * @return The length of the result string, if successful - or in case of a buffer overflow, 322 * in which case it will be greater than destCapacity. 323 * 324 * @see u_strToUpper 325 * @stable ICU 3.4 326 */ 327 U_STABLE int32_t U_EXPORT2 328 ucasemap_utf8ToUpper(const UCaseMap *csm, 329 char *dest, int32_t destCapacity, 330 const char *src, int32_t srcLength, 331 UErrorCode *pErrorCode); 332 333 #if !UCONFIG_NO_BREAK_ITERATION 334 335 /** 336 * Titlecase a UTF-8 string. 337 * Casing is locale-dependent and context-sensitive. 338 * Titlecasing uses a break iterator to find the first characters of words 339 * that are to be titlecased. It titlecases those characters and lowercases 340 * all others. (This can be modified with ucasemap_setOptions().) 341 * 342 * Note: This function takes a non-const UCaseMap pointer because it will 343 * open a default break iterator if no break iterator was set yet, 344 * and effectively call ucasemap_setBreakIterator(); 345 * also because the break iterator is stateful and will be modified during 346 * the iteration. 347 * 348 * The titlecase break iterator can be provided to customize for arbitrary 349 * styles, using rules and dictionaries beyond the standard iterators. 350 * The standard titlecase iterator for the root locale implements the 351 * algorithm of Unicode TR 21. 352 * 353 * This function uses only the setUText(), first(), next() and close() methods of the 354 * provided break iterator. 355 * 356 * The result may be longer or shorter than the original. 357 * The source string and the destination buffer must not overlap. 358 * 359 * @param csm UCaseMap service object. This pointer is non-const! 360 * See the note above for details. 361 * @param dest A buffer for the result string. The result will be NUL-terminated if 362 * the buffer is large enough. 363 * The contents is undefined in case of failure. 364 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 365 * dest may be NULL and the function will only return the length of the result 366 * without writing any of the result string. 367 * @param src The original string. 368 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 369 * @param pErrorCode Must be a valid pointer to an error code value, 370 * which must not indicate a failure before the function call. 371 * @return The length of the result string, if successful - or in case of a buffer overflow, 372 * in which case it will be greater than destCapacity. 373 * 374 * @see u_strToTitle 375 * @see U_TITLECASE_NO_LOWERCASE 376 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT 377 * @stable ICU 3.8 378 */ 379 U_STABLE int32_t U_EXPORT2 380 ucasemap_utf8ToTitle(UCaseMap *csm, 381 char *dest, int32_t destCapacity, 382 const char *src, int32_t srcLength, 383 UErrorCode *pErrorCode); 384 385 #endif 386 387 /** 388 * Case-fold the characters in a UTF-8 string. 389 * Case-folding is locale-independent and not context-sensitive, 390 * but there is an option for whether to include or exclude mappings for dotted I 391 * and dotless i that are marked with 'I' in CaseFolding.txt. 392 * The result may be longer or shorter than the original. 393 * The source string and the destination buffer must not overlap. 394 * 395 * @param csm UCaseMap service object. 396 * @param dest A buffer for the result string. The result will be NUL-terminated if 397 * the buffer is large enough. 398 * The contents is undefined in case of failure. 399 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 400 * dest may be NULL and the function will only return the length of the result 401 * without writing any of the result string. 402 * @param src The original string. 403 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 404 * @param pErrorCode Must be a valid pointer to an error code value, 405 * which must not indicate a failure before the function call. 406 * @return The length of the result string, if successful - or in case of a buffer overflow, 407 * in which case it will be greater than destCapacity. 408 * 409 * @see u_strFoldCase 410 * @see ucasemap_setOptions 411 * @see U_FOLD_CASE_DEFAULT 412 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I 413 * @stable ICU 3.8 414 */ 415 U_STABLE int32_t U_EXPORT2 416 ucasemap_utf8FoldCase(const UCaseMap *csm, 417 char *dest, int32_t destCapacity, 418 const char *src, int32_t srcLength, 419 UErrorCode *pErrorCode); 420 421 #endif 422