1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1996-2015, International Business Machines Corporation and others. 6 * All Rights Reserved. 7 ****************************************************************************** 8 */ 9 10 #ifndef UBRK_H 11 #define UBRK_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uloc.h" 15 #include "unicode/utext.h" 16 17 #if U_SHOW_CPLUSPLUS_API 18 #include "unicode/localpointer.h" 19 #endif // U_SHOW_CPLUSPLUS_API 20 21 /** 22 * A text-break iterator. 23 * For usage in C programs. 24 */ 25 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR 26 # define UBRK_TYPEDEF_UBREAK_ITERATOR 27 /** 28 * Opaque type representing an ICU Break iterator object. 29 * @stable ICU 2.0 30 */ 31 typedef struct UBreakIterator UBreakIterator; 32 #endif 33 34 #include "unicode/parseerr.h" 35 36 #if !UCONFIG_NO_BREAK_ITERATION 37 /** 38 * \file 39 * \brief C API: BreakIterator 40 * 41 * <h2> BreakIterator C API </h2> 42 * 43 * The BreakIterator C API defines methods for finding the location 44 * of boundaries in text. Pointer to a UBreakIterator maintain a 45 * current position and scan over text returning the index of characters 46 * where boundaries occur. 47 * <p> 48 * Line boundary analysis determines where a text string can be broken 49 * when line-wrapping. The mechanism correctly handles punctuation and 50 * hyphenated words. 51 * <p> 52 * Note: The locale keyword "lb" can be used to modify line break 53 * behavior according to the CSS level 3 line-break options, see 54 * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example: 55 * "ja@lb=strict", "zh@lb=loose". 56 * <p> 57 * Sentence boundary analysis allows selection with correct 58 * interpretation of periods within numbers and abbreviations, and 59 * trailing punctuation marks such as quotation marks and parentheses. 60 * <p> 61 * Note: The locale keyword "ss" can be used to enable use of 62 * segmentation suppression data (preventing breaks in English after 63 * abbreviations such as "Mr." or "Est.", for example), as follows: 64 * "en@ss=standard". 65 * <p> 66 * Word boundary analysis is used by search and replace functions, as 67 * well as within text editing applications that allow the user to 68 * select words with a double click. Word selection provides correct 69 * interpretation of punctuation marks within and following 70 * words. Characters that are not part of a word, such as symbols or 71 * punctuation marks, have word-breaks on both sides. 72 * <p> 73 * Character boundary analysis identifies the boundaries of 74 * "Extended Grapheme Clusters", which are groupings of codepoints 75 * that should be treated as character-like units for many text operations. 76 * Please see Unicode Standard Annex #29, Unicode Text Segmentation, 77 * http://www.unicode.org/reports/tr29/ for additional information 78 * on grapheme clusters and guidelines on their use. 79 * <p> 80 * Title boundary analysis locates all positions, 81 * typically starts of words, that should be set to Title Case 82 * when title casing the text. 83 * <p> 84 * The text boundary positions are found according to the rules 85 * described in Unicode Standard Annex #29, Text Boundaries, and 86 * Unicode Standard Annex #14, Line Breaking Properties. These 87 * are available at http://www.unicode.org/reports/tr14/ and 88 * http://www.unicode.org/reports/tr29/. 89 * <p> 90 * In addition to the plain C API defined in this header file, an 91 * object oriented C++ API with equivalent functionality is defined in the 92 * file brkiter.h. 93 * <p> 94 * Code snippets illustrating the use of the Break Iterator APIs 95 * are available in the ICU User Guide, 96 * https://unicode-org.github.io/icu/userguide/boundaryanalysis/ 97 * and in the sample program icu/source/samples/break/break.cpp 98 */ 99 100 /** The possible types of text boundaries. @stable ICU 2.0 */ 101 typedef enum UBreakIteratorType { 102 /** Character breaks @stable ICU 2.0 */ 103 UBRK_CHARACTER = 0, 104 /** Word breaks @stable ICU 2.0 */ 105 UBRK_WORD = 1, 106 /** Line breaks @stable ICU 2.0 */ 107 UBRK_LINE = 2, 108 /** Sentence breaks @stable ICU 2.0 */ 109 UBRK_SENTENCE = 3, 110 } UBreakIteratorType; 111 112 /** Value indicating all text boundaries have been returned. 113 * @stable ICU 2.0 114 */ 115 #define UBRK_DONE ((int32_t) -1) 116 117 118 /** 119 * Enum constants for the word break tags returned by 120 * getRuleStatus(). A range of values is defined for each category of 121 * word, to allow for further subdivisions of a category in future releases. 122 * Applications should check for tag values falling within the range, rather 123 * than for single individual values. 124 * 125 * The numeric values of all of these constants are stable (will not change). 126 * 127 * @stable ICU 2.2 128 */ 129 typedef enum UWordBreak { 130 /** Tag value for "words" that do not fit into any of other categories. 131 * Includes spaces and most punctuation. */ 132 UBRK_WORD_NONE = 0, 133 /** Upper bound for tags for uncategorized words. */ 134 UBRK_WORD_NONE_LIMIT = 100, 135 /** Tag value for words that appear to be numbers, lower limit. */ 136 UBRK_WORD_NUMBER = 100, 137 /** Tag value for words that appear to be numbers, upper limit. */ 138 UBRK_WORD_NUMBER_LIMIT = 200, 139 /** Tag value for words that contain letters, excluding 140 * hiragana, katakana or ideographic characters, lower limit. */ 141 UBRK_WORD_LETTER = 200, 142 /** Tag value for words containing letters, upper limit */ 143 UBRK_WORD_LETTER_LIMIT = 300, 144 /** Tag value for words containing kana characters, lower limit */ 145 UBRK_WORD_KANA = 300, 146 /** Tag value for words containing kana characters, upper limit */ 147 UBRK_WORD_KANA_LIMIT = 400, 148 /** Tag value for words containing ideographic characters, lower limit */ 149 UBRK_WORD_IDEO = 400, 150 /** Tag value for words containing ideographic characters, upper limit */ 151 UBRK_WORD_IDEO_LIMIT = 500 152 } UWordBreak; 153 154 /** 155 * Enum constants for the line break tags returned by getRuleStatus(). 156 * A range of values is defined for each category of 157 * word, to allow for further subdivisions of a category in future releases. 158 * Applications should check for tag values falling within the range, rather 159 * than for single individual values. 160 * 161 * The numeric values of all of these constants are stable (will not change). 162 * 163 * @stable ICU 2.8 164 */ 165 typedef enum ULineBreakTag { 166 /** Tag value for soft line breaks, positions at which a line break 167 * is acceptable but not required */ 168 UBRK_LINE_SOFT = 0, 169 /** Upper bound for soft line breaks. */ 170 UBRK_LINE_SOFT_LIMIT = 100, 171 /** Tag value for a hard, or mandatory line break */ 172 UBRK_LINE_HARD = 100, 173 /** Upper bound for hard line breaks. */ 174 UBRK_LINE_HARD_LIMIT = 200 175 } ULineBreakTag; 176 177 178 179 /** 180 * Enum constants for the sentence break tags returned by getRuleStatus(). 181 * A range of values is defined for each category of 182 * sentence, to allow for further subdivisions of a category in future releases. 183 * Applications should check for tag values falling within the range, rather 184 * than for single individual values. 185 * 186 * The numeric values of all of these constants are stable (will not change). 187 * 188 * @stable ICU 2.8 189 */ 190 typedef enum USentenceBreakTag { 191 /** Tag value for for sentences ending with a sentence terminator 192 * ('.', '?', '!', etc.) character, possibly followed by a 193 * hard separator (CR, LF, PS, etc.) 194 */ 195 UBRK_SENTENCE_TERM = 0, 196 /** Upper bound for tags for sentences ended by sentence terminators. */ 197 UBRK_SENTENCE_TERM_LIMIT = 100, 198 /** Tag value for for sentences that do not contain an ending 199 * sentence terminator ('.', '?', '!', etc.) character, but 200 * are ended only by a hard separator (CR, LF, PS, etc.) or end of input. 201 */ 202 UBRK_SENTENCE_SEP = 100, 203 /** Upper bound for tags for sentences ended by a separator. */ 204 UBRK_SENTENCE_SEP_LIMIT = 200 205 /** Tag value for a hard, or mandatory line break */ 206 } USentenceBreakTag; 207 208 209 /** 210 * Open a new UBreakIterator for locating text boundaries for a specified locale. 211 * A UBreakIterator may be used for detecting character, line, word, 212 * and sentence breaks in text. 213 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD, 214 * UBRK_LINE, UBRK_SENTENCE 215 * @param locale The locale specifying the text-breaking conventions. Note that 216 * locale keys such as "lb" and "ss" may be used to modify text break behavior, 217 * see general discussion of BreakIterator C API. 218 * @param text The text to be iterated over. May be null, in which case ubrk_setText() is 219 * used to specify the text to be iterated. 220 * @param textLength The number of characters in text, or -1 if null-terminated. 221 * @param status A UErrorCode to receive any errors. 222 * @return A UBreakIterator for the specified locale. 223 * @see ubrk_openRules 224 * @stable ICU 2.0 225 */ 226 U_CAPI UBreakIterator* U_EXPORT2 227 ubrk_open(UBreakIteratorType type, 228 const char *locale, 229 const UChar *text, 230 int32_t textLength, 231 UErrorCode *status); 232 233 /** 234 * Open a new UBreakIterator for locating text boundaries using specified breaking rules. 235 * The rule syntax is ... (TBD) 236 * @param rules A set of rules specifying the text breaking conventions. 237 * @param rulesLength The number of characters in rules, or -1 if null-terminated. 238 * @param text The text to be iterated over. May be null, in which case ubrk_setText() is 239 * used to specify the text to be iterated. 240 * @param textLength The number of characters in text, or -1 if null-terminated. 241 * @param parseErr Receives position and context information for any syntax errors 242 * detected while parsing the rules. 243 * @param status A UErrorCode to receive any errors. 244 * @return A UBreakIterator for the specified rules. 245 * @see ubrk_open 246 * @stable ICU 2.2 247 */ 248 U_CAPI UBreakIterator* U_EXPORT2 249 ubrk_openRules(const UChar *rules, 250 int32_t rulesLength, 251 const UChar *text, 252 int32_t textLength, 253 UParseError *parseErr, 254 UErrorCode *status); 255 256 /** 257 * Open a new UBreakIterator for locating text boundaries using precompiled binary rules. 258 * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules. 259 * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not 260 * compatible across different major versions of ICU, nor across platforms of different 261 * endianness or different base character set family (ASCII vs EBCDIC). 262 * @param binaryRules A set of compiled binary rules specifying the text breaking 263 * conventions. Ownership of the storage containing the compiled 264 * rules remains with the caller of this function. The compiled 265 * rules must not be modified or deleted during the life of the 266 * break iterator. 267 * @param rulesLength The length of binaryRules in bytes; must be >= 0. 268 * @param text The text to be iterated over. May be null, in which case 269 * ubrk_setText() is used to specify the text to be iterated. 270 * @param textLength The number of characters in text, or -1 if null-terminated. 271 * @param status Pointer to UErrorCode to receive any errors. 272 * @return UBreakIterator for the specified rules. 273 * @see ubrk_getBinaryRules 274 * @stable ICU 59 275 */ 276 U_CAPI UBreakIterator* U_EXPORT2 277 ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, 278 const UChar * text, int32_t textLength, 279 UErrorCode * status); 280 281 #ifndef U_HIDE_DEPRECATED_API 282 283 #endif /* U_HIDE_DEPRECATED_API */ 284 285 /** 286 * Thread safe cloning operation. 287 * @param bi iterator to be cloned 288 * @param status to indicate whether the operation went on smoothly or there were errors 289 * @return pointer to the new clone 290 * @stable ICU 69 291 */ 292 U_CAPI UBreakIterator * U_EXPORT2 293 ubrk_clone(const UBreakIterator *bi, 294 UErrorCode *status); 295 296 /** 297 * Close a UBreakIterator. 298 * Once closed, a UBreakIterator may no longer be used. 299 * @param bi The break iterator to close. 300 * @stable ICU 2.0 301 */ 302 U_CAPI void U_EXPORT2 303 ubrk_close(UBreakIterator *bi); 304 305 #if U_SHOW_CPLUSPLUS_API 306 307 U_NAMESPACE_BEGIN 308 309 /** 310 * \class LocalUBreakIteratorPointer 311 * "Smart pointer" class, closes a UBreakIterator via ubrk_close(). 312 * For most methods see the LocalPointerBase base class. 313 * 314 * @see LocalPointerBase 315 * @see LocalPointer 316 * @stable ICU 4.4 317 */ 318 U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close); 319 320 U_NAMESPACE_END 321 322 #endif 323 324 /** 325 * Sets an existing iterator to point to a new piece of text. 326 * The break iterator retains a pointer to the supplied text. 327 * The caller must not modify or delete the text while the BreakIterator 328 * retains the reference. 329 * 330 * @param bi The iterator to use 331 * @param text The text to be set 332 * @param textLength The length of the text 333 * @param status The error code 334 * @stable ICU 2.0 335 */ 336 U_CAPI void U_EXPORT2 337 ubrk_setText(UBreakIterator* bi, 338 const UChar* text, 339 int32_t textLength, 340 UErrorCode* status); 341 342 /** 343 * Sets an existing iterator to point to a new piece of text. 344 * 345 * All index positions returned by break iterator functions are 346 * native indices from the UText. For example, when breaking UTF-8 347 * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc. 348 * will be UTF-8 string indices, not UTF-16 positions. 349 * 350 * @param bi The iterator to use 351 * @param text The text to be set. 352 * This function makes a shallow clone of the supplied UText. This means 353 * that the caller is free to immediately close or otherwise reuse the 354 * UText that was passed as a parameter, but that the underlying text itself 355 * must not be altered while being referenced by the break iterator. 356 * @param status The error code 357 * @stable ICU 3.4 358 */ 359 U_CAPI void U_EXPORT2 360 ubrk_setUText(UBreakIterator* bi, 361 UText* text, 362 UErrorCode* status); 363 364 /** 365 * Determine the most recently-returned text boundary. 366 * 367 * @param bi The break iterator to use. 368 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous, 369 * \ref ubrk_first, or \ref ubrk_last. 370 * @stable ICU 2.0 371 */ 372 U_CAPI int32_t U_EXPORT2 373 ubrk_current(const UBreakIterator *bi); 374 375 /** 376 * Advance the iterator to the boundary following the current boundary. 377 * 378 * @param bi The break iterator to use. 379 * @return The character index of the next text boundary, or UBRK_DONE 380 * if all text boundaries have been returned. 381 * @see ubrk_previous 382 * @stable ICU 2.0 383 */ 384 U_CAPI int32_t U_EXPORT2 385 ubrk_next(UBreakIterator *bi); 386 387 /** 388 * Set the iterator position to the boundary preceding the current boundary. 389 * 390 * @param bi The break iterator to use. 391 * @return The character index of the preceding text boundary, or UBRK_DONE 392 * if all text boundaries have been returned. 393 * @see ubrk_next 394 * @stable ICU 2.0 395 */ 396 U_CAPI int32_t U_EXPORT2 397 ubrk_previous(UBreakIterator *bi); 398 399 /** 400 * Set the iterator position to zero, the start of the text being scanned. 401 * @param bi The break iterator to use. 402 * @return The new iterator position (zero). 403 * @see ubrk_last 404 * @stable ICU 2.0 405 */ 406 U_CAPI int32_t U_EXPORT2 407 ubrk_first(UBreakIterator *bi); 408 409 /** 410 * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned. 411 * This is not the same as the last character. 412 * @param bi The break iterator to use. 413 * @return The character offset immediately <EM>beyond</EM> the last character in the 414 * text being scanned. 415 * @see ubrk_first 416 * @stable ICU 2.0 417 */ 418 U_CAPI int32_t U_EXPORT2 419 ubrk_last(UBreakIterator *bi); 420 421 /** 422 * Set the iterator position to the first boundary preceding the specified offset. 423 * The new position is always smaller than offset, or UBRK_DONE. 424 * @param bi The break iterator to use. 425 * @param offset The offset to begin scanning. 426 * @return The text boundary preceding offset, or UBRK_DONE. 427 * @see ubrk_following 428 * @stable ICU 2.0 429 */ 430 U_CAPI int32_t U_EXPORT2 431 ubrk_preceding(UBreakIterator *bi, 432 int32_t offset); 433 434 /** 435 * Advance the iterator to the first boundary following the specified offset. 436 * The value returned is always greater than offset, or UBRK_DONE. 437 * @param bi The break iterator to use. 438 * @param offset The offset to begin scanning. 439 * @return The text boundary following offset, or UBRK_DONE. 440 * @see ubrk_preceding 441 * @stable ICU 2.0 442 */ 443 U_CAPI int32_t U_EXPORT2 444 ubrk_following(UBreakIterator *bi, 445 int32_t offset); 446 447 /** 448 * Get a locale for which text breaking information is available. 449 * A UBreakIterator in a locale returned by this function will perform the correct 450 * text breaking for the locale. 451 * @param index The index of the desired locale. 452 * @return A locale for which number text breaking information is available, or 0 if none. 453 * @see ubrk_countAvailable 454 * @stable ICU 2.0 455 */ 456 U_CAPI const char* U_EXPORT2 457 ubrk_getAvailable(int32_t index); 458 459 /** 460 * Determine how many locales have text breaking information available. 461 * This function is most useful as determining the loop ending condition for 462 * calls to \ref ubrk_getAvailable. 463 * @return The number of locales for which text breaking information is available. 464 * @see ubrk_getAvailable 465 * @stable ICU 2.0 466 */ 467 U_CAPI int32_t U_EXPORT2 468 ubrk_countAvailable(void); 469 470 471 /** 472 * Returns true if the specified position is a boundary position. As a side 473 * effect, leaves the iterator pointing to the first boundary position at 474 * or after "offset". 475 * @param bi The break iterator to use. 476 * @param offset the offset to check. 477 * @return True if "offset" is a boundary position. 478 * @stable ICU 2.0 479 */ 480 U_CAPI UBool U_EXPORT2 481 ubrk_isBoundary(UBreakIterator *bi, int32_t offset); 482 483 /** 484 * Return the status from the break rule that determined the most recently 485 * returned break position. The values appear in the rule source 486 * within brackets, {123}, for example. For rules that do not specify a 487 * status, a default value of 0 is returned. 488 * <p> 489 * For word break iterators, the possible values are defined in enum UWordBreak. 490 * @stable ICU 2.2 491 */ 492 U_CAPI int32_t U_EXPORT2 493 ubrk_getRuleStatus(UBreakIterator *bi); 494 495 /** 496 * Get the statuses from the break rules that determined the most recently 497 * returned break position. The values appear in the rule source 498 * within brackets, {123}, for example. The default status value for rules 499 * that do not explicitly provide one is zero. 500 * <p> 501 * For word break iterators, the possible values are defined in enum UWordBreak. 502 * @param bi The break iterator to use 503 * @param fillInVec an array to be filled in with the status values. 504 * @param capacity the length of the supplied vector. A length of zero causes 505 * the function to return the number of status values, in the 506 * normal way, without attempting to store any values. 507 * @param status receives error codes. 508 * @return The number of rule status values from rules that determined 509 * the most recent boundary returned by the break iterator. 510 * @stable ICU 3.0 511 */ 512 U_CAPI int32_t U_EXPORT2 513 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status); 514 515 /** 516 * Return the locale of the break iterator. You can choose between the valid and 517 * the actual locale. 518 * @param bi break iterator 519 * @param type locale type (valid or actual) 520 * @param status error code 521 * @return locale string 522 * @stable ICU 2.8 523 */ 524 U_CAPI const char* U_EXPORT2 525 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status); 526 527 /** 528 * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator. 529 * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator 530 * more quickly than using ubrk_openRules. The compiled rules are not compatible across 531 * different major versions of ICU, nor across platforms of different endianness or 532 * different base character set family (ASCII vs EBCDIC). Supports preflighting (with 533 * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to 534 * the binaryRules buffer. However, whether preflighting or not, if the actual length 535 * is greater than INT32_MAX, then the function returns 0 and sets *status to 536 * U_INDEX_OUTOFBOUNDS_ERROR. 537 538 * @param bi The break iterator to use. 539 * @param binaryRules Buffer to receive the compiled binary rules; set to NULL for 540 * preflighting. 541 * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for 542 * preflighting. Must be >= 0. 543 * @param status Pointer to UErrorCode to receive any errors, such as 544 * U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or 545 * U_ILLEGAL_ARGUMENT_ERROR. 546 * @return The actual byte length of the binary rules, if <= INT32_MAX; 547 * otherwise 0. If not preflighting and this is larger than 548 * rulesCapacity, *status will be set to an error. 549 * @see ubrk_openBinaryRules 550 * @stable ICU 59 551 */ 552 U_CAPI int32_t U_EXPORT2 553 ubrk_getBinaryRules(UBreakIterator *bi, 554 uint8_t * binaryRules, int32_t rulesCapacity, 555 UErrorCode * status); 556 557 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 558 559 #endif 560