1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // casemap.h 5 // created: 2017jan12 Markus W. Scherer 6 7 #ifndef __CASEMAP_H__ 8 #define __CASEMAP_H__ 9 10 #include "unicode/utypes.h" 11 12 #if U_SHOW_CPLUSPLUS_API 13 14 #include "unicode/stringpiece.h" 15 #include "unicode/uobject.h" 16 17 /** 18 * \file 19 * \brief C++ API: Low-level C++ case mapping functions. 20 */ 21 22 U_NAMESPACE_BEGIN 23 24 class BreakIterator; 25 class ByteSink; 26 class Edits; 27 28 /** 29 * Low-level C++ case mapping functions. 30 * 31 * @stable ICU 59 32 */ 33 class U_COMMON_API CaseMap U_FINAL : public UMemory { 34 public: 35 /** 36 * Lowercases a UTF-16 string and optionally records edits. 37 * Casing is locale-dependent and context-sensitive. 38 * The result may be longer or shorter than the original. 39 * The source string and the destination buffer must not overlap. 40 * 41 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 42 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 43 * @param src The original string. 44 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 45 * @param dest A buffer for the result string. The result will be NUL-terminated if 46 * the buffer is large enough. 47 * The contents is undefined in case of failure. 48 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 49 * dest may be NULL and the function will only return the length of the result 50 * without writing any of the result string. 51 * @param edits Records edits for index mapping, working with styled text, 52 * and getting only changes (if any). 53 * The Edits contents is undefined if any error occurs. 54 * This function calls edits->reset() first unless 55 * options includes U_EDITS_NO_RESET. edits can be NULL. 56 * @param errorCode Reference to an in/out error code value 57 * which must not indicate a failure before the function call. 58 * @return The length of the result string, if successful. 59 * When the result would be longer than destCapacity, 60 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 61 * 62 * @see u_strToLower 63 * @stable ICU 59 64 */ 65 static int32_t toLower( 66 const char *locale, uint32_t options, 67 const char16_t *src, int32_t srcLength, 68 char16_t *dest, int32_t destCapacity, Edits *edits, 69 UErrorCode &errorCode); 70 71 /** 72 * Uppercases a UTF-16 string and optionally records edits. 73 * Casing is locale-dependent and context-sensitive. 74 * The result may be longer or shorter than the original. 75 * The source string and the destination buffer must not overlap. 76 * 77 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 78 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 79 * @param src The original string. 80 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 81 * @param dest A buffer for the result string. The result will be NUL-terminated if 82 * the buffer is large enough. 83 * The contents is undefined in case of failure. 84 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 85 * dest may be NULL and the function will only return the length of the result 86 * without writing any of the result string. 87 * @param edits Records edits for index mapping, working with styled text, 88 * and getting only changes (if any). 89 * The Edits contents is undefined if any error occurs. 90 * This function calls edits->reset() first unless 91 * options includes U_EDITS_NO_RESET. edits can be NULL. 92 * @param errorCode Reference to an in/out error code value 93 * which must not indicate a failure before the function call. 94 * @return The length of the result string, if successful. 95 * When the result would be longer than destCapacity, 96 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 97 * 98 * @see u_strToUpper 99 * @stable ICU 59 100 */ 101 static int32_t toUpper( 102 const char *locale, uint32_t options, 103 const char16_t *src, int32_t srcLength, 104 char16_t *dest, int32_t destCapacity, Edits *edits, 105 UErrorCode &errorCode); 106 107 #if !UCONFIG_NO_BREAK_ITERATION 108 109 /** 110 * Titlecases a UTF-16 string and optionally records edits. 111 * Casing is locale-dependent and context-sensitive. 112 * The result may be longer or shorter than the original. 113 * The source string and the destination buffer must not overlap. 114 * 115 * Titlecasing uses a break iterator to find the first characters of words 116 * that are to be titlecased. It titlecases those characters and lowercases 117 * all others. (This can be modified with options bits.) 118 * 119 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 120 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 121 * U_TITLECASE_NO_LOWERCASE, 122 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 123 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 124 * @param iter A break iterator to find the first characters of words that are to be titlecased. 125 * It is set to the source string (setText()) 126 * and used one or more times for iteration (first() and next()). 127 * If NULL, then a word break iterator for the locale is used 128 * (or something equivalent). 129 * @param src The original string. 130 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 131 * @param dest A buffer for the result string. The result will be NUL-terminated if 132 * the buffer is large enough. 133 * The contents is undefined in case of failure. 134 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 135 * dest may be NULL and the function will only return the length of the result 136 * without writing any of the result string. 137 * @param edits Records edits for index mapping, working with styled text, 138 * and getting only changes (if any). 139 * The Edits contents is undefined if any error occurs. 140 * This function calls edits->reset() first unless 141 * options includes U_EDITS_NO_RESET. edits can be NULL. 142 * @param errorCode Reference to an in/out error code value 143 * which must not indicate a failure before the function call. 144 * @return The length of the result string, if successful. 145 * When the result would be longer than destCapacity, 146 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 147 * 148 * @see u_strToTitle 149 * @see ucasemap_toTitle 150 * @stable ICU 59 151 */ 152 static int32_t toTitle( 153 const char *locale, uint32_t options, BreakIterator *iter, 154 const char16_t *src, int32_t srcLength, 155 char16_t *dest, int32_t destCapacity, Edits *edits, 156 UErrorCode &errorCode); 157 158 #endif // UCONFIG_NO_BREAK_ITERATION 159 160 /** 161 * Case-folds a UTF-16 string and optionally records edits. 162 * 163 * Case folding is locale-independent and not context-sensitive, 164 * but there is an option for whether to include or exclude mappings for dotted I 165 * and dotless i that are marked with 'T' in CaseFolding.txt. 166 * 167 * The result may be longer or shorter than the original. 168 * The source string and the destination buffer must not overlap. 169 * 170 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 171 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. 172 * @param src The original string. 173 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 174 * @param dest A buffer for the result string. The result will be NUL-terminated if 175 * the buffer is large enough. 176 * The contents is undefined in case of failure. 177 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 178 * dest may be NULL and the function will only return the length of the result 179 * without writing any of the result string. 180 * @param edits Records edits for index mapping, working with styled text, 181 * and getting only changes (if any). 182 * The Edits contents is undefined if any error occurs. 183 * This function calls edits->reset() first unless 184 * options includes U_EDITS_NO_RESET. edits can be NULL. 185 * @param errorCode Reference to an in/out error code value 186 * which must not indicate a failure before the function call. 187 * @return The length of the result string, if successful. 188 * When the result would be longer than destCapacity, 189 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 190 * 191 * @see u_strFoldCase 192 * @stable ICU 59 193 */ 194 static int32_t fold( 195 uint32_t options, 196 const char16_t *src, int32_t srcLength, 197 char16_t *dest, int32_t destCapacity, Edits *edits, 198 UErrorCode &errorCode); 199 200 /** 201 * Lowercases a UTF-8 string and optionally records edits. 202 * Casing is locale-dependent and context-sensitive. 203 * The result may be longer or shorter than the original. 204 * 205 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 206 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 207 * @param src The original string. 208 * @param sink A ByteSink to which the result string is written. 209 * sink.Flush() is called at the end. 210 * @param edits Records edits for index mapping, working with styled text, 211 * and getting only changes (if any). 212 * The Edits contents is undefined if any error occurs. 213 * This function calls edits->reset() first unless 214 * options includes U_EDITS_NO_RESET. edits can be NULL. 215 * @param errorCode Reference to an in/out error code value 216 * which must not indicate a failure before the function call. 217 * 218 * @see ucasemap_utf8ToLower 219 * @stable ICU 60 220 */ 221 static void utf8ToLower( 222 const char *locale, uint32_t options, 223 StringPiece src, ByteSink &sink, Edits *edits, 224 UErrorCode &errorCode); 225 226 /** 227 * Uppercases a UTF-8 string and optionally records edits. 228 * Casing is locale-dependent and context-sensitive. 229 * The result may be longer or shorter than the original. 230 * 231 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 232 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 233 * @param src The original string. 234 * @param sink A ByteSink to which the result string is written. 235 * sink.Flush() is called at the end. 236 * @param edits Records edits for index mapping, working with styled text, 237 * and getting only changes (if any). 238 * The Edits contents is undefined if any error occurs. 239 * This function calls edits->reset() first unless 240 * options includes U_EDITS_NO_RESET. edits can be NULL. 241 * @param errorCode Reference to an in/out error code value 242 * which must not indicate a failure before the function call. 243 * 244 * @see ucasemap_utf8ToUpper 245 * @stable ICU 60 246 */ 247 static void utf8ToUpper( 248 const char *locale, uint32_t options, 249 StringPiece src, ByteSink &sink, Edits *edits, 250 UErrorCode &errorCode); 251 252 #if !UCONFIG_NO_BREAK_ITERATION 253 254 /** 255 * Titlecases a UTF-8 string and optionally records edits. 256 * Casing is locale-dependent and context-sensitive. 257 * The result may be longer or shorter than the original. 258 * 259 * Titlecasing uses a break iterator to find the first characters of words 260 * that are to be titlecased. It titlecases those characters and lowercases 261 * all others. (This can be modified with options bits.) 262 * 263 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 264 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 265 * U_TITLECASE_NO_LOWERCASE, 266 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 267 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 268 * @param iter A break iterator to find the first characters of words that are to be titlecased. 269 * It is set to the source string (setUText()) 270 * and used one or more times for iteration (first() and next()). 271 * If NULL, then a word break iterator for the locale is used 272 * (or something equivalent). 273 * @param src The original string. 274 * @param sink A ByteSink to which the result string is written. 275 * sink.Flush() is called at the end. 276 * @param edits Records edits for index mapping, working with styled text, 277 * and getting only changes (if any). 278 * The Edits contents is undefined if any error occurs. 279 * This function calls edits->reset() first unless 280 * options includes U_EDITS_NO_RESET. edits can be NULL. 281 * @param errorCode Reference to an in/out error code value 282 * which must not indicate a failure before the function call. 283 * 284 * @see ucasemap_utf8ToTitle 285 * @stable ICU 60 286 */ 287 static void utf8ToTitle( 288 const char *locale, uint32_t options, BreakIterator *iter, 289 StringPiece src, ByteSink &sink, Edits *edits, 290 UErrorCode &errorCode); 291 292 #endif // UCONFIG_NO_BREAK_ITERATION 293 294 /** 295 * Case-folds a UTF-8 string and optionally records edits. 296 * 297 * Case folding is locale-independent and not context-sensitive, 298 * but there is an option for whether to include or exclude mappings for dotted I 299 * and dotless i that are marked with 'T' in CaseFolding.txt. 300 * 301 * The result may be longer or shorter than the original. 302 * 303 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 304 * @param src The original string. 305 * @param sink A ByteSink to which the result string is written. 306 * sink.Flush() is called at the end. 307 * @param edits Records edits for index mapping, working with styled text, 308 * and getting only changes (if any). 309 * The Edits contents is undefined if any error occurs. 310 * This function calls edits->reset() first unless 311 * options includes U_EDITS_NO_RESET. edits can be NULL. 312 * @param errorCode Reference to an in/out error code value 313 * which must not indicate a failure before the function call. 314 * 315 * @see ucasemap_utf8FoldCase 316 * @stable ICU 60 317 */ 318 static void utf8Fold( 319 uint32_t options, 320 StringPiece src, ByteSink &sink, Edits *edits, 321 UErrorCode &errorCode); 322 323 /** 324 * Lowercases a UTF-8 string and optionally records edits. 325 * Casing is locale-dependent and context-sensitive. 326 * The result may be longer or shorter than the original. 327 * The source string and the destination buffer must not overlap. 328 * 329 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 330 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 331 * @param src The original string. 332 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 333 * @param dest A buffer for the result string. The result will be NUL-terminated if 334 * the buffer is large enough. 335 * The contents is undefined in case of failure. 336 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 337 * dest may be NULL and the function will only return the length of the result 338 * without writing any of the result string. 339 * @param edits Records edits for index mapping, working with styled text, 340 * and getting only changes (if any). 341 * The Edits contents is undefined if any error occurs. 342 * This function calls edits->reset() first unless 343 * options includes U_EDITS_NO_RESET. edits can be NULL. 344 * @param errorCode Reference to an in/out error code value 345 * which must not indicate a failure before the function call. 346 * @return The length of the result string, if successful. 347 * When the result would be longer than destCapacity, 348 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 349 * 350 * @see ucasemap_utf8ToLower 351 * @stable ICU 59 352 */ 353 static int32_t utf8ToLower( 354 const char *locale, uint32_t options, 355 const char *src, int32_t srcLength, 356 char *dest, int32_t destCapacity, Edits *edits, 357 UErrorCode &errorCode); 358 359 /** 360 * Uppercases a UTF-8 string and optionally records edits. 361 * Casing is locale-dependent and context-sensitive. 362 * The result may be longer or shorter than the original. 363 * The source string and the destination buffer must not overlap. 364 * 365 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 366 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 367 * @param src The original string. 368 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 369 * @param dest A buffer for the result string. The result will be NUL-terminated if 370 * the buffer is large enough. 371 * The contents is undefined in case of failure. 372 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 373 * dest may be NULL and the function will only return the length of the result 374 * without writing any of the result string. 375 * @param edits Records edits for index mapping, working with styled text, 376 * and getting only changes (if any). 377 * The Edits contents is undefined if any error occurs. 378 * This function calls edits->reset() first unless 379 * options includes U_EDITS_NO_RESET. edits can be NULL. 380 * @param errorCode Reference to an in/out error code value 381 * which must not indicate a failure before the function call. 382 * @return The length of the result string, if successful. 383 * When the result would be longer than destCapacity, 384 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 385 * 386 * @see ucasemap_utf8ToUpper 387 * @stable ICU 59 388 */ 389 static int32_t utf8ToUpper( 390 const char *locale, uint32_t options, 391 const char *src, int32_t srcLength, 392 char *dest, int32_t destCapacity, Edits *edits, 393 UErrorCode &errorCode); 394 395 #if !UCONFIG_NO_BREAK_ITERATION 396 397 /** 398 * Titlecases a UTF-8 string and optionally records edits. 399 * Casing is locale-dependent and context-sensitive. 400 * The result may be longer or shorter than the original. 401 * The source string and the destination buffer must not overlap. 402 * 403 * Titlecasing uses a break iterator to find the first characters of words 404 * that are to be titlecased. It titlecases those characters and lowercases 405 * all others. (This can be modified with options bits.) 406 * 407 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 408 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 409 * U_TITLECASE_NO_LOWERCASE, 410 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 411 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 412 * @param iter A break iterator to find the first characters of words that are to be titlecased. 413 * It is set to the source string (setUText()) 414 * and used one or more times for iteration (first() and next()). 415 * If NULL, then a word break iterator for the locale is used 416 * (or something equivalent). 417 * @param src The original string. 418 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 419 * @param dest A buffer for the result string. The result will be NUL-terminated if 420 * the buffer is large enough. 421 * The contents is undefined in case of failure. 422 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 423 * dest may be NULL and the function will only return the length of the result 424 * without writing any of the result string. 425 * @param edits Records edits for index mapping, working with styled text, 426 * and getting only changes (if any). 427 * The Edits contents is undefined if any error occurs. 428 * This function calls edits->reset() first unless 429 * options includes U_EDITS_NO_RESET. edits can be NULL. 430 * @param errorCode Reference to an in/out error code value 431 * which must not indicate a failure before the function call. 432 * @return The length of the result string, if successful. 433 * When the result would be longer than destCapacity, 434 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 435 * 436 * @see ucasemap_utf8ToTitle 437 * @stable ICU 59 438 */ 439 static int32_t utf8ToTitle( 440 const char *locale, uint32_t options, BreakIterator *iter, 441 const char *src, int32_t srcLength, 442 char *dest, int32_t destCapacity, Edits *edits, 443 UErrorCode &errorCode); 444 445 #endif // UCONFIG_NO_BREAK_ITERATION 446 447 /** 448 * Case-folds a UTF-8 string and optionally records edits. 449 * 450 * Case folding is locale-independent and not context-sensitive, 451 * but there is an option for whether to include or exclude mappings for dotted I 452 * and dotless i that are marked with 'T' in CaseFolding.txt. 453 * 454 * The result may be longer or shorter than the original. 455 * The source string and the destination buffer must not overlap. 456 * 457 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 458 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. 459 * @param src The original string. 460 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 461 * @param dest A buffer for the result string. The result will be NUL-terminated if 462 * the buffer is large enough. 463 * The contents is undefined in case of failure. 464 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 465 * dest may be NULL and the function will only return the length of the result 466 * without writing any of the result string. 467 * @param edits Records edits for index mapping, working with styled text, 468 * and getting only changes (if any). 469 * The Edits contents is undefined if any error occurs. 470 * This function calls edits->reset() first unless 471 * options includes U_EDITS_NO_RESET. edits can be NULL. 472 * @param errorCode Reference to an in/out error code value 473 * which must not indicate a failure before the function call. 474 * @return The length of the result string, if successful. 475 * When the result would be longer than destCapacity, 476 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 477 * 478 * @see ucasemap_utf8FoldCase 479 * @stable ICU 59 480 */ 481 static int32_t utf8Fold( 482 uint32_t options, 483 const char *src, int32_t srcLength, 484 char *dest, int32_t destCapacity, Edits *edits, 485 UErrorCode &errorCode); 486 487 private: 488 CaseMap() = delete; 489 CaseMap(const CaseMap &other) = delete; 490 CaseMap &operator=(const CaseMap &other) = delete; 491 }; 492 493 U_NAMESPACE_END 494 495 #endif /* U_SHOW_CPLUSPLUS_API */ 496 497 #endif // __CASEMAP_H__ 498