1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // casemap.h 5 // created: 2017jan12 Markus W. Scherer 6 7 #ifndef __CASEMAP_H__ 8 #define __CASEMAP_H__ 9 10 #include "unicode/utypes.h" 11 #include "unicode/stringpiece.h" 12 #include "unicode/uobject.h" 13 14 /** 15 * \file 16 * \brief C++ API: Low-level C++ case mapping functions. 17 */ 18 19 U_NAMESPACE_BEGIN 20 21 #ifndef U_HIDE_DRAFT_API 22 23 class BreakIterator; 24 class ByteSink; 25 class Edits; 26 27 /** 28 * Low-level C++ case mapping functions. 29 * 30 * @draft ICU 59 31 */ 32 class U_COMMON_API CaseMap U_FINAL : public UMemory { 33 public: 34 /** 35 * Lowercases a UTF-16 string and optionally records edits. 36 * Casing is locale-dependent and context-sensitive. 37 * The result may be longer or shorter than the original. 38 * The source string and the destination buffer must not overlap. 39 * 40 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 41 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 42 * @param src The original string. 43 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 44 * @param dest A buffer for the result string. The result will be NUL-terminated if 45 * the buffer is large enough. 46 * The contents is undefined in case of failure. 47 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 48 * dest may be NULL and the function will only return the length of the result 49 * without writing any of the result string. 50 * @param edits Records edits for index mapping, working with styled text, 51 * and getting only changes (if any). 52 * The Edits contents is undefined if any error occurs. 53 * This function calls edits->reset() first unless 54 * options includes U_EDITS_NO_RESET. edits can be NULL. 55 * @param errorCode Reference to an in/out error code value 56 * which must not indicate a failure before the function call. 57 * @return The length of the result string, if successful. 58 * When the result would be longer than destCapacity, 59 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 60 * 61 * @see u_strToLower 62 * @draft ICU 59 63 */ 64 static int32_t toLower( 65 const char *locale, uint32_t options, 66 const char16_t *src, int32_t srcLength, 67 char16_t *dest, int32_t destCapacity, Edits *edits, 68 UErrorCode &errorCode); 69 70 /** 71 * Uppercases a UTF-16 string and optionally records edits. 72 * Casing is locale-dependent and context-sensitive. 73 * The result may be longer or shorter than the original. 74 * The source string and the destination buffer must not overlap. 75 * 76 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 77 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 78 * @param src The original string. 79 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 80 * @param dest A buffer for the result string. The result will be NUL-terminated if 81 * the buffer is large enough. 82 * The contents is undefined in case of failure. 83 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 84 * dest may be NULL and the function will only return the length of the result 85 * without writing any of the result string. 86 * @param edits Records edits for index mapping, working with styled text, 87 * and getting only changes (if any). 88 * The Edits contents is undefined if any error occurs. 89 * This function calls edits->reset() first unless 90 * options includes U_EDITS_NO_RESET. edits can be NULL. 91 * @param errorCode Reference to an in/out error code value 92 * which must not indicate a failure before the function call. 93 * @return The length of the result string, if successful. 94 * When the result would be longer than destCapacity, 95 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 96 * 97 * @see u_strToUpper 98 * @draft ICU 59 99 */ 100 static int32_t toUpper( 101 const char *locale, uint32_t options, 102 const char16_t *src, int32_t srcLength, 103 char16_t *dest, int32_t destCapacity, Edits *edits, 104 UErrorCode &errorCode); 105 106 #if !UCONFIG_NO_BREAK_ITERATION 107 108 /** 109 * Titlecases a UTF-16 string and optionally records edits. 110 * Casing is locale-dependent and context-sensitive. 111 * The result may be longer or shorter than the original. 112 * The source string and the destination buffer must not overlap. 113 * 114 * Titlecasing uses a break iterator to find the first characters of words 115 * that are to be titlecased. It titlecases those characters and lowercases 116 * all others. (This can be modified with options bits.) 117 * 118 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 119 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 120 * U_TITLECASE_NO_LOWERCASE, 121 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 122 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 123 * @param iter A break iterator to find the first characters of words that are to be titlecased. 124 * It is set to the source string (setText()) 125 * and used one or more times for iteration (first() and next()). 126 * If NULL, then a word break iterator for the locale is used 127 * (or something equivalent). 128 * @param src The original string. 129 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 130 * @param dest A buffer for the result string. The result will be NUL-terminated if 131 * the buffer is large enough. 132 * The contents is undefined in case of failure. 133 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 134 * dest may be NULL and the function will only return the length of the result 135 * without writing any of the result string. 136 * @param edits Records edits for index mapping, working with styled text, 137 * and getting only changes (if any). 138 * The Edits contents is undefined if any error occurs. 139 * This function calls edits->reset() first unless 140 * options includes U_EDITS_NO_RESET. edits can be NULL. 141 * @param errorCode Reference to an in/out error code value 142 * which must not indicate a failure before the function call. 143 * @return The length of the result string, if successful. 144 * When the result would be longer than destCapacity, 145 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 146 * 147 * @see u_strToTitle 148 * @see ucasemap_toTitle 149 * @draft ICU 59 150 */ 151 static int32_t toTitle( 152 const char *locale, uint32_t options, BreakIterator *iter, 153 const char16_t *src, int32_t srcLength, 154 char16_t *dest, int32_t destCapacity, Edits *edits, 155 UErrorCode &errorCode); 156 157 #endif // UCONFIG_NO_BREAK_ITERATION 158 159 /** 160 * Case-folds a UTF-16 string and optionally records edits. 161 * 162 * Case folding is locale-independent and not context-sensitive, 163 * but there is an option for whether to include or exclude mappings for dotted I 164 * and dotless i that are marked with 'T' in CaseFolding.txt. 165 * 166 * The result may be longer or shorter than the original. 167 * The source string and the destination buffer must not overlap. 168 * 169 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 170 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. 171 * @param src The original string. 172 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 173 * @param dest A buffer for the result string. The result will be NUL-terminated if 174 * the buffer is large enough. 175 * The contents is undefined in case of failure. 176 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then 177 * dest may be NULL and the function will only return the length of the result 178 * without writing any of the result string. 179 * @param edits Records edits for index mapping, working with styled text, 180 * and getting only changes (if any). 181 * The Edits contents is undefined if any error occurs. 182 * This function calls edits->reset() first unless 183 * options includes U_EDITS_NO_RESET. edits can be NULL. 184 * @param errorCode Reference to an in/out error code value 185 * which must not indicate a failure before the function call. 186 * @return The length of the result string, if successful. 187 * When the result would be longer than destCapacity, 188 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 189 * 190 * @see u_strFoldCase 191 * @draft ICU 59 192 */ 193 static int32_t fold( 194 uint32_t options, 195 const char16_t *src, int32_t srcLength, 196 char16_t *dest, int32_t destCapacity, Edits *edits, 197 UErrorCode &errorCode); 198 199 /** 200 * Lowercases a UTF-8 string and optionally records edits. 201 * Casing is locale-dependent and context-sensitive. 202 * The result may be longer or shorter than the original. 203 * 204 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 205 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 206 * @param src The original string. 207 * @param sink A ByteSink to which the result string is written. 208 * sink.Flush() is called at the end. 209 * @param edits Records edits for index mapping, working with styled text, 210 * and getting only changes (if any). 211 * The Edits contents is undefined if any error occurs. 212 * This function calls edits->reset() first unless 213 * options includes U_EDITS_NO_RESET. edits can be NULL. 214 * @param errorCode Reference to an in/out error code value 215 * which must not indicate a failure before the function call. 216 * 217 * @see ucasemap_utf8ToLower 218 * @draft ICU 60 219 */ 220 static void utf8ToLower( 221 const char *locale, uint32_t options, 222 StringPiece src, ByteSink &sink, Edits *edits, 223 UErrorCode &errorCode); 224 225 /** 226 * Uppercases a UTF-8 string and optionally records edits. 227 * Casing is locale-dependent and context-sensitive. 228 * The result may be longer or shorter than the original. 229 * 230 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 231 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 232 * @param src The original string. 233 * @param sink A ByteSink to which the result string is written. 234 * sink.Flush() is called at the end. 235 * @param edits Records edits for index mapping, working with styled text, 236 * and getting only changes (if any). 237 * The Edits contents is undefined if any error occurs. 238 * This function calls edits->reset() first unless 239 * options includes U_EDITS_NO_RESET. edits can be NULL. 240 * @param errorCode Reference to an in/out error code value 241 * which must not indicate a failure before the function call. 242 * 243 * @see ucasemap_utf8ToUpper 244 * @draft ICU 60 245 */ 246 static void utf8ToUpper( 247 const char *locale, uint32_t options, 248 StringPiece src, ByteSink &sink, Edits *edits, 249 UErrorCode &errorCode); 250 251 #if !UCONFIG_NO_BREAK_ITERATION 252 253 /** 254 * Titlecases a UTF-8 string and optionally records edits. 255 * Casing is locale-dependent and context-sensitive. 256 * The result may be longer or shorter than the original. 257 * 258 * Titlecasing uses a break iterator to find the first characters of words 259 * that are to be titlecased. It titlecases those characters and lowercases 260 * all others. (This can be modified with options bits.) 261 * 262 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 263 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 264 * U_TITLECASE_NO_LOWERCASE, 265 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 266 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 267 * @param iter A break iterator to find the first characters of words that are to be titlecased. 268 * It is set to the source string (setUText()) 269 * and used one or more times for iteration (first() and next()). 270 * If NULL, then a word break iterator for the locale is used 271 * (or something equivalent). 272 * @param src The original string. 273 * @param sink A ByteSink to which the result string is written. 274 * sink.Flush() is called at the end. 275 * @param edits Records edits for index mapping, working with styled text, 276 * and getting only changes (if any). 277 * The Edits contents is undefined if any error occurs. 278 * This function calls edits->reset() first unless 279 * options includes U_EDITS_NO_RESET. edits can be NULL. 280 * @param errorCode Reference to an in/out error code value 281 * which must not indicate a failure before the function call. 282 * 283 * @see ucasemap_utf8ToTitle 284 * @draft ICU 60 285 */ 286 static void utf8ToTitle( 287 const char *locale, uint32_t options, BreakIterator *iter, 288 StringPiece src, ByteSink &sink, Edits *edits, 289 UErrorCode &errorCode); 290 291 #endif // UCONFIG_NO_BREAK_ITERATION 292 293 /** 294 * Case-folds a UTF-8 string and optionally records edits. 295 * 296 * Case folding is locale-independent and not context-sensitive, 297 * but there is an option for whether to include or exclude mappings for dotted I 298 * and dotless i that are marked with 'T' in CaseFolding.txt. 299 * 300 * The result may be longer or shorter than the original. 301 * 302 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 303 * @param src The original string. 304 * @param sink A ByteSink to which the result string is written. 305 * sink.Flush() is called at the end. 306 * @param edits Records edits for index mapping, working with styled text, 307 * and getting only changes (if any). 308 * The Edits contents is undefined if any error occurs. 309 * This function calls edits->reset() first unless 310 * options includes U_EDITS_NO_RESET. edits can be NULL. 311 * @param errorCode Reference to an in/out error code value 312 * which must not indicate a failure before the function call. 313 * 314 * @see ucasemap_utf8FoldCase 315 * @draft ICU 60 316 */ 317 static void utf8Fold( 318 uint32_t options, 319 StringPiece src, ByteSink &sink, Edits *edits, 320 UErrorCode &errorCode); 321 322 /** 323 * Lowercases a UTF-8 string and optionally records edits. 324 * Casing is locale-dependent and context-sensitive. 325 * The result may be longer or shorter than the original. 326 * The source string and the destination buffer must not overlap. 327 * 328 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 329 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 330 * @param src The original string. 331 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 332 * @param dest A buffer for the result string. The result will be NUL-terminated if 333 * the buffer is large enough. 334 * The contents is undefined in case of failure. 335 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 336 * dest may be NULL and the function will only return the length of the result 337 * without writing any of the result string. 338 * @param edits Records edits for index mapping, working with styled text, 339 * and getting only changes (if any). 340 * The Edits contents is undefined if any error occurs. 341 * This function calls edits->reset() first unless 342 * options includes U_EDITS_NO_RESET. edits can be NULL. 343 * @param errorCode Reference to an in/out error code value 344 * which must not indicate a failure before the function call. 345 * @return The length of the result string, if successful. 346 * When the result would be longer than destCapacity, 347 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 348 * 349 * @see ucasemap_utf8ToLower 350 * @draft ICU 59 351 */ 352 static int32_t utf8ToLower( 353 const char *locale, uint32_t options, 354 const char *src, int32_t srcLength, 355 char *dest, int32_t destCapacity, Edits *edits, 356 UErrorCode &errorCode); 357 358 /** 359 * Uppercases a UTF-8 string and optionally records edits. 360 * Casing is locale-dependent and context-sensitive. 361 * The result may be longer or shorter than the original. 362 * The source string and the destination buffer must not overlap. 363 * 364 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 365 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 366 * @param src The original string. 367 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 368 * @param dest A buffer for the result string. The result will be NUL-terminated if 369 * the buffer is large enough. 370 * The contents is undefined in case of failure. 371 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 372 * dest may be NULL and the function will only return the length of the result 373 * without writing any of the result string. 374 * @param edits Records edits for index mapping, working with styled text, 375 * and getting only changes (if any). 376 * The Edits contents is undefined if any error occurs. 377 * This function calls edits->reset() first unless 378 * options includes U_EDITS_NO_RESET. edits can be NULL. 379 * @param errorCode Reference to an in/out error code value 380 * which must not indicate a failure before the function call. 381 * @return The length of the result string, if successful. 382 * When the result would be longer than destCapacity, 383 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 384 * 385 * @see ucasemap_utf8ToUpper 386 * @draft ICU 59 387 */ 388 static int32_t utf8ToUpper( 389 const char *locale, uint32_t options, 390 const char *src, int32_t srcLength, 391 char *dest, int32_t destCapacity, Edits *edits, 392 UErrorCode &errorCode); 393 394 #if !UCONFIG_NO_BREAK_ITERATION 395 396 /** 397 * Titlecases a UTF-8 string and optionally records edits. 398 * Casing is locale-dependent and context-sensitive. 399 * The result may be longer or shorter than the original. 400 * The source string and the destination buffer must not overlap. 401 * 402 * Titlecasing uses a break iterator to find the first characters of words 403 * that are to be titlecased. It titlecases those characters and lowercases 404 * all others. (This can be modified with options bits.) 405 * 406 * @param locale The locale ID. ("" = root locale, NULL = default locale.) 407 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 408 * U_TITLECASE_NO_LOWERCASE, 409 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, 410 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. 411 * @param iter A break iterator to find the first characters of words that are to be titlecased. 412 * It is set to the source string (setUText()) 413 * and used one or more times for iteration (first() and next()). 414 * If NULL, then a word break iterator for the locale is used 415 * (or something equivalent). 416 * @param src The original string. 417 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 418 * @param dest A buffer for the result string. The result will be NUL-terminated if 419 * the buffer is large enough. 420 * The contents is undefined in case of failure. 421 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 422 * dest may be NULL and the function will only return the length of the result 423 * without writing any of the result string. 424 * @param edits Records edits for index mapping, working with styled text, 425 * and getting only changes (if any). 426 * The Edits contents is undefined if any error occurs. 427 * This function calls edits->reset() first unless 428 * options includes U_EDITS_NO_RESET. edits can be NULL. 429 * @param errorCode Reference to an in/out error code value 430 * which must not indicate a failure before the function call. 431 * @return The length of the result string, if successful. 432 * When the result would be longer than destCapacity, 433 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 434 * 435 * @see ucasemap_utf8ToTitle 436 * @draft ICU 59 437 */ 438 static int32_t utf8ToTitle( 439 const char *locale, uint32_t options, BreakIterator *iter, 440 const char *src, int32_t srcLength, 441 char *dest, int32_t destCapacity, Edits *edits, 442 UErrorCode &errorCode); 443 444 #endif // UCONFIG_NO_BREAK_ITERATION 445 446 /** 447 * Case-folds a UTF-8 string and optionally records edits. 448 * 449 * Case folding is locale-independent and not context-sensitive, 450 * but there is an option for whether to include or exclude mappings for dotted I 451 * and dotless i that are marked with 'T' in CaseFolding.txt. 452 * 453 * The result may be longer or shorter than the original. 454 * The source string and the destination buffer must not overlap. 455 * 456 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, 457 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. 458 * @param src The original string. 459 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 460 * @param dest A buffer for the result string. The result will be NUL-terminated if 461 * the buffer is large enough. 462 * The contents is undefined in case of failure. 463 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 464 * dest may be NULL and the function will only return the length of the result 465 * without writing any of the result string. 466 * @param edits Records edits for index mapping, working with styled text, 467 * and getting only changes (if any). 468 * The Edits contents is undefined if any error occurs. 469 * This function calls edits->reset() first unless 470 * options includes U_EDITS_NO_RESET. edits can be NULL. 471 * @param errorCode Reference to an in/out error code value 472 * which must not indicate a failure before the function call. 473 * @return The length of the result string, if successful. 474 * When the result would be longer than destCapacity, 475 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. 476 * 477 * @see ucasemap_utf8FoldCase 478 * @draft ICU 59 479 */ 480 static int32_t utf8Fold( 481 uint32_t options, 482 const char *src, int32_t srcLength, 483 char *dest, int32_t destCapacity, Edits *edits, 484 UErrorCode &errorCode); 485 486 private: 487 CaseMap() = delete; 488 CaseMap(const CaseMap &other) = delete; 489 CaseMap &operator=(const CaseMap &other) = delete; 490 }; 491 492 #endif // U_HIDE_DRAFT_API 493 494 U_NAMESPACE_END 495 496 #endif // __CASEMAP_H__ 497