1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // casemap.h 5 // created: 2017jan12 Markus W. Scherer 6 7 #ifndef __CASEMAP_H__ 8 #define __CASEMAP_H__ 9 10 #include "unicode/utypes.h" 11 #include "unicode/stringpiece.h" 12 #include "unicode/uobject.h" 13 14 /** 15 * \file 16 * \brief C++ API: Low-level C++ case mapping functions. 17 */ 18 19 U_NAMESPACE_BEGIN 20 21 class BreakIterator; 22 class ByteSink; 23 class Edits; 24 25 /** 26 * Low-level C++ case mapping functions. 27 * 28 * @stable ICU 59 29 */ 30 class U_COMMON_API CaseMap U_FINAL : public UMemory { 31 public: 32 /** 33 * Lowercases a UTF-16 string and optionally records edits. 34 * Casing is locale-dependent and context-sensitive. 35 * The result may be longer or shorter than the original. 36 * The source string and the destination buffer must not overlap. 37 * 38 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 39 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 40 * @param src The original string. 41 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 42 * @param dest A buffer for the result string. The result will be NUL-terminated if 43 * the buffer is large enough. 44 * The contents is undefined in case of failure. 45 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 46 * dest may be NULL and the function will only return the length of the result 47 * without writing any of the result string. 48 * @param edits Records edits for index mapping, working with styled text, 49 * and getting only changes (if any). 50 * The Edits contents is undefined if any error occurs. 51 * This function calls edits->reset() first unless 52 * options includes U_EDITS_NO_RESET. edits can be NULL. 53 * @param errorCode Reference to an in/out error code value 54 * which must not indicate a failure before the function call. 55 * @return The length of the result string, if successful. 56 * When the result would be longer than destCapacity, 57 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 58 * 59 * @see u_strToLower 60 * @stable ICU 59 61 */ 62 static int32_t toLower( 63 const char *locale, uint32_t options, 64 const char16_t *src, int32_t srcLength, 65 char16_t *dest, int32_t destCapacity, Edits *edits, 66 UErrorCode &errorCode); 67 68 /** 69 * Uppercases a UTF-16 string and optionally records edits. 70 * Casing is locale-dependent and context-sensitive. 71 * The result may be longer or shorter than the original. 72 * The source string and the destination buffer must not overlap. 73 * 74 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 75 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 76 * @param src The original string. 77 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 78 * @param dest A buffer for the result string. The result will be NUL-terminated if 79 * the buffer is large enough. 80 * The contents is undefined in case of failure. 81 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 82 * dest may be NULL and the function will only return the length of the result 83 * without writing any of the result string. 84 * @param edits Records edits for index mapping, working with styled text, 85 * and getting only changes (if any). 86 * The Edits contents is undefined if any error occurs. 87 * This function calls edits->reset() first unless 88 * options includes U_EDITS_NO_RESET. edits can be NULL. 89 * @param errorCode Reference to an in/out error code value 90 * which must not indicate a failure before the function call. 91 * @return The length of the result string, if successful. 92 * When the result would be longer than destCapacity, 93 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 94 * 95 * @see u_strToUpper 96 * @stable ICU 59 97 */ 98 static int32_t toUpper( 99 const char *locale, uint32_t options, 100 const char16_t *src, int32_t srcLength, 101 char16_t *dest, int32_t destCapacity, Edits *edits, 102 UErrorCode &errorCode); 103 104 #if !UCONFIG_NO_BREAK_ITERATION 105 106 /** 107 * Titlecases a UTF-16 string and optionally records edits. 108 * Casing is locale-dependent and context-sensitive. 109 * The result may be longer or shorter than the original. 110 * The source string and the destination buffer must not overlap. 111 * 112 * Titlecasing uses a break iterator to find the first characters of words 113 * that are to be titlecased. It titlecases those characters and lowercases 114 * all others. (This can be modified with options bits.) 115 * 116 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 117 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 118 * U_TITLECASE_NO_LOWERCASE, 119 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 120 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 121 * @param iter A break iterator to find the first characters of words that are to be titlecased. 122 * It is set to the source string (setText()) 123 * and used one or more times for iteration (first() and next()). 124 * If NULL, then a word break iterator for the locale is used 125 * (or something equivalent). 126 * @param src The original string. 127 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 128 * @param dest A buffer for the result string. The result will be NUL-terminated if 129 * the buffer is large enough. 130 * The contents is undefined in case of failure. 131 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 132 * dest may be NULL and the function will only return the length of the result 133 * without writing any of the result string. 134 * @param edits Records edits for index mapping, working with styled text, 135 * and getting only changes (if any). 136 * The Edits contents is undefined if any error occurs. 137 * This function calls edits->reset() first unless 138 * options includes U_EDITS_NO_RESET. edits can be NULL. 139 * @param errorCode Reference to an in/out error code value 140 * which must not indicate a failure before the function call. 141 * @return The length of the result string, if successful. 142 * When the result would be longer than destCapacity, 143 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 144 * 145 * @see u_strToTitle 146 * @see ucasemap_toTitle 147 * @stable ICU 59 148 */ 149 static int32_t toTitle( 150 const char *locale, uint32_t options, BreakIterator *iter, 151 const char16_t *src, int32_t srcLength, 152 char16_t *dest, int32_t destCapacity, Edits *edits, 153 UErrorCode &errorCode); 154 155 #endif // UCONFIG_NO_BREAK_ITERATION 156 157 /** 158 * Case-folds a UTF-16 string and optionally records edits. 159 * 160 * Case folding is locale-independent and not context-sensitive, 161 * but there is an option for whether to include or exclude mappings for dotted I 162 * and dotless i that are marked with 'T' in CaseFolding.txt. 163 * 164 * The result may be longer or shorter than the original. 165 * The source string and the destination buffer must not overlap. 166 * 167 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 168 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. 169 * @param src The original string. 170 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 171 * @param dest A buffer for the result string. The result will be NUL-terminated if 172 * the buffer is large enough. 173 * The contents is undefined in case of failure. 174 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 175 * dest may be NULL and the function will only return the length of the result 176 * without writing any of the result string. 177 * @param edits Records edits for index mapping, working with styled text, 178 * and getting only changes (if any). 179 * The Edits contents is undefined if any error occurs. 180 * This function calls edits->reset() first unless 181 * options includes U_EDITS_NO_RESET. edits can be NULL. 182 * @param errorCode Reference to an in/out error code value 183 * which must not indicate a failure before the function call. 184 * @return The length of the result string, if successful. 185 * When the result would be longer than destCapacity, 186 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 187 * 188 * @see u_strFoldCase 189 * @stable ICU 59 190 */ 191 static int32_t fold( 192 uint32_t options, 193 const char16_t *src, int32_t srcLength, 194 char16_t *dest, int32_t destCapacity, Edits *edits, 195 UErrorCode &errorCode); 196 197 /** 198 * Lowercases a UTF-8 string and optionally records edits. 199 * Casing is locale-dependent and context-sensitive. 200 * The result may be longer or shorter than the original. 201 * 202 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 203 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 204 * @param src The original string. 205 * @param sink A ByteSink to which the result string is written. 206 * sink.Flush() is called at the end. 207 * @param edits Records edits for index mapping, working with styled text, 208 * and getting only changes (if any). 209 * The Edits contents is undefined if any error occurs. 210 * This function calls edits->reset() first unless 211 * options includes U_EDITS_NO_RESET. edits can be NULL. 212 * @param errorCode Reference to an in/out error code value 213 * which must not indicate a failure before the function call. 214 * 215 * @see ucasemap_utf8ToLower 216 * @stable ICU 60 217 */ 218 static void utf8ToLower( 219 const char *locale, uint32_t options, 220 StringPiece src, ByteSink &sink, Edits *edits, 221 UErrorCode &errorCode); 222 223 /** 224 * Uppercases a UTF-8 string and optionally records edits. 225 * Casing is locale-dependent and context-sensitive. 226 * The result may be longer or shorter than the original. 227 * 228 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 229 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 230 * @param src The original string. 231 * @param sink A ByteSink to which the result string is written. 232 * sink.Flush() is called at the end. 233 * @param edits Records edits for index mapping, working with styled text, 234 * and getting only changes (if any). 235 * The Edits contents is undefined if any error occurs. 236 * This function calls edits->reset() first unless 237 * options includes U_EDITS_NO_RESET. edits can be NULL. 238 * @param errorCode Reference to an in/out error code value 239 * which must not indicate a failure before the function call. 240 * 241 * @see ucasemap_utf8ToUpper 242 * @stable ICU 60 243 */ 244 static void utf8ToUpper( 245 const char *locale, uint32_t options, 246 StringPiece src, ByteSink &sink, Edits *edits, 247 UErrorCode &errorCode); 248 249 #if !UCONFIG_NO_BREAK_ITERATION 250 251 /** 252 * Titlecases a UTF-8 string and optionally records edits. 253 * Casing is locale-dependent and context-sensitive. 254 * The result may be longer or shorter than the original. 255 * 256 * Titlecasing uses a break iterator to find the first characters of words 257 * that are to be titlecased. It titlecases those characters and lowercases 258 * all others. (This can be modified with options bits.) 259 * 260 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 261 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 262 * U_TITLECASE_NO_LOWERCASE, 263 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 264 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 265 * @param iter A break iterator to find the first characters of words that are to be titlecased. 266 * It is set to the source string (setUText()) 267 * and used one or more times for iteration (first() and next()). 268 * If NULL, then a word break iterator for the locale is used 269 * (or something equivalent). 270 * @param src The original string. 271 * @param sink A ByteSink to which the result string is written. 272 * sink.Flush() is called at the end. 273 * @param edits Records edits for index mapping, working with styled text, 274 * and getting only changes (if any). 275 * The Edits contents is undefined if any error occurs. 276 * This function calls edits->reset() first unless 277 * options includes U_EDITS_NO_RESET. edits can be NULL. 278 * @param errorCode Reference to an in/out error code value 279 * which must not indicate a failure before the function call. 280 * 281 * @see ucasemap_utf8ToTitle 282 * @stable ICU 60 283 */ 284 static void utf8ToTitle( 285 const char *locale, uint32_t options, BreakIterator *iter, 286 StringPiece src, ByteSink &sink, Edits *edits, 287 UErrorCode &errorCode); 288 289 #endif // UCONFIG_NO_BREAK_ITERATION 290 291 /** 292 * Case-folds a UTF-8 string and optionally records edits. 293 * 294 * Case folding is locale-independent and not context-sensitive, 295 * but there is an option for whether to include or exclude mappings for dotted I 296 * and dotless i that are marked with 'T' in CaseFolding.txt. 297 * 298 * The result may be longer or shorter than the original. 299 * 300 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 301 * @param src The original string. 302 * @param sink A ByteSink to which the result string is written. 303 * sink.Flush() is called at the end. 304 * @param edits Records edits for index mapping, working with styled text, 305 * and getting only changes (if any). 306 * The Edits contents is undefined if any error occurs. 307 * This function calls edits->reset() first unless 308 * options includes U_EDITS_NO_RESET. edits can be NULL. 309 * @param errorCode Reference to an in/out error code value 310 * which must not indicate a failure before the function call. 311 * 312 * @see ucasemap_utf8FoldCase 313 * @stable ICU 60 314 */ 315 static void utf8Fold( 316 uint32_t options, 317 StringPiece src, ByteSink &sink, Edits *edits, 318 UErrorCode &errorCode); 319 320 /** 321 * Lowercases a UTF-8 string and optionally records edits. 322 * Casing is locale-dependent and context-sensitive. 323 * The result may be longer or shorter than the original. 324 * The source string and the destination buffer must not overlap. 325 * 326 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 327 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 328 * @param src The original string. 329 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 330 * @param dest A buffer for the result string. The result will be NUL-terminated if 331 * the buffer is large enough. 332 * The contents is undefined in case of failure. 333 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 334 * dest may be NULL and the function will only return the length of the result 335 * without writing any of the result string. 336 * @param edits Records edits for index mapping, working with styled text, 337 * and getting only changes (if any). 338 * The Edits contents is undefined if any error occurs. 339 * This function calls edits->reset() first unless 340 * options includes U_EDITS_NO_RESET. edits can be NULL. 341 * @param errorCode Reference to an in/out error code value 342 * which must not indicate a failure before the function call. 343 * @return The length of the result string, if successful. 344 * When the result would be longer than destCapacity, 345 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 346 * 347 * @see ucasemap_utf8ToLower 348 * @stable ICU 59 349 */ 350 static int32_t utf8ToLower( 351 const char *locale, uint32_t options, 352 const char *src, int32_t srcLength, 353 char *dest, int32_t destCapacity, Edits *edits, 354 UErrorCode &errorCode); 355 356 /** 357 * Uppercases a UTF-8 string and optionally records edits. 358 * Casing is locale-dependent and context-sensitive. 359 * The result may be longer or shorter than the original. 360 * The source string and the destination buffer must not overlap. 361 * 362 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 363 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 364 * @param src The original string. 365 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 366 * @param dest A buffer for the result string. The result will be NUL-terminated if 367 * the buffer is large enough. 368 * The contents is undefined in case of failure. 369 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 370 * dest may be NULL and the function will only return the length of the result 371 * without writing any of the result string. 372 * @param edits Records edits for index mapping, working with styled text, 373 * and getting only changes (if any). 374 * The Edits contents is undefined if any error occurs. 375 * This function calls edits->reset() first unless 376 * options includes U_EDITS_NO_RESET. edits can be NULL. 377 * @param errorCode Reference to an in/out error code value 378 * which must not indicate a failure before the function call. 379 * @return The length of the result string, if successful. 380 * When the result would be longer than destCapacity, 381 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 382 * 383 * @see ucasemap_utf8ToUpper 384 * @stable ICU 59 385 */ 386 static int32_t utf8ToUpper( 387 const char *locale, uint32_t options, 388 const char *src, int32_t srcLength, 389 char *dest, int32_t destCapacity, Edits *edits, 390 UErrorCode &errorCode); 391 392 #if !UCONFIG_NO_BREAK_ITERATION 393 394 /** 395 * Titlecases a UTF-8 string and optionally records edits. 396 * Casing is locale-dependent and context-sensitive. 397 * The result may be longer or shorter than the original. 398 * The source string and the destination buffer must not overlap. 399 * 400 * Titlecasing uses a break iterator to find the first characters of words 401 * that are to be titlecased. It titlecases those characters and lowercases 402 * all others. (This can be modified with options bits.) 403 * 404 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 405 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 406 * U_TITLECASE_NO_LOWERCASE, 407 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 408 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 409 * @param iter A break iterator to find the first characters of words that are to be titlecased. 410 * It is set to the source string (setUText()) 411 * and used one or more times for iteration (first() and next()). 412 * If NULL, then a word break iterator for the locale is used 413 * (or something equivalent). 414 * @param src The original string. 415 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 416 * @param dest A buffer for the result string. The result will be NUL-terminated if 417 * the buffer is large enough. 418 * The contents is undefined in case of failure. 419 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 420 * dest may be NULL and the function will only return the length of the result 421 * without writing any of the result string. 422 * @param edits Records edits for index mapping, working with styled text, 423 * and getting only changes (if any). 424 * The Edits contents is undefined if any error occurs. 425 * This function calls edits->reset() first unless 426 * options includes U_EDITS_NO_RESET. edits can be NULL. 427 * @param errorCode Reference to an in/out error code value 428 * which must not indicate a failure before the function call. 429 * @return The length of the result string, if successful. 430 * When the result would be longer than destCapacity, 431 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 432 * 433 * @see ucasemap_utf8ToTitle 434 * @stable ICU 59 435 */ 436 static int32_t utf8ToTitle( 437 const char *locale, uint32_t options, BreakIterator *iter, 438 const char *src, int32_t srcLength, 439 char *dest, int32_t destCapacity, Edits *edits, 440 UErrorCode &errorCode); 441 442 #endif // UCONFIG_NO_BREAK_ITERATION 443 444 /** 445 * Case-folds a UTF-8 string and optionally records edits. 446 * 447 * Case folding is locale-independent and not context-sensitive, 448 * but there is an option for whether to include or exclude mappings for dotted I 449 * and dotless i that are marked with 'T' in CaseFolding.txt. 450 * 451 * The result may be longer or shorter than the original. 452 * The source string and the destination buffer must not overlap. 453 * 454 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 455 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. 456 * @param src The original string. 457 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 458 * @param dest A buffer for the result string. The result will be NUL-terminated if 459 * the buffer is large enough. 460 * The contents is undefined in case of failure. 461 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 462 * dest may be NULL and the function will only return the length of the result 463 * without writing any of the result string. 464 * @param edits Records edits for index mapping, working with styled text, 465 * and getting only changes (if any). 466 * The Edits contents is undefined if any error occurs. 467 * This function calls edits->reset() first unless 468 * options includes U_EDITS_NO_RESET. edits can be NULL. 469 * @param errorCode Reference to an in/out error code value 470 * which must not indicate a failure before the function call. 471 * @return The length of the result string, if successful. 472 * When the result would be longer than destCapacity, 473 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 474 * 475 * @see ucasemap_utf8FoldCase 476 * @stable ICU 59 477 */ 478 static int32_t utf8Fold( 479 uint32_t options, 480 const char *src, int32_t srcLength, 481 char *dest, int32_t destCapacity, Edits *edits, 482 UErrorCode &errorCode); 483 484 private: 485 CaseMap() = delete; 486 CaseMap(const CaseMap &other) = delete; 487 CaseMap &operator=(const CaseMap &other) = delete; 488 }; 489 490 U_NAMESPACE_END 491 492 #endif // __CASEMAP_H__ 493