1 /* 2 * Copyright (C) 1996-2004, International Business Machines Corporation and others. All Rights Reserved. 3 ***************************************************************************************** 4 */ 5 6 #ifndef UBRK_H 7 #define UBRK_H 8 9 #include "unicode/utypes.h" 10 #include "unicode/uloc.h" 11 12 /** 13 * A text-break iterator. 14 * For usage in C programs. 15 */ 16 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR 17 # define UBRK_TYPEDEF_UBREAK_ITERATOR 18 /** 19 * Opaque type representing an ICU Break iterator object. 20 * @stable ICU 2.0 21 */ 22 typedef void UBreakIterator; 23 #endif 24 25 #if !UCONFIG_NO_BREAK_ITERATION 26 27 #include "unicode/parseerr.h" 28 29 /** 30 * \file 31 * \brief C API: BreakIterator 32 * 33 * <h2> BreakIterator C API </h2> 34 * 35 * The BreakIterator C API defines methods for finding the location 36 * of boundaries in text. Pointer to a UBreakIterator maintain a 37 * current position and scan over text returning the index of characters 38 * where boundaries occur. 39 * <P> 40 * Line boundary analysis determines where a text string can be broken 41 * when line-wrapping. The mechanism correctly handles punctuation and 42 * hyphenated words. 43 * <P> 44 * Sentence boundary analysis allows selection with correct 45 * interpretation of periods within numbers and abbreviations, and 46 * trailing punctuation marks such as quotation marks and parentheses. 47 * <P> 48 * Word boundary analysis is used by search and replace functions, as 49 * well as within text editing applications that allow the user to 50 * select words with a double click. Word selection provides correct 51 * interpretation of punctuation marks within and following 52 * words. Characters that are not part of a word, such as symbols or 53 * punctuation marks, have word-breaks on both sides. 54 * <P> 55 * Character boundary analysis allows users to interact with 56 * characters as they expect to, for example, when moving the cursor 57 * through a text string. Character boundary analysis provides correct 58 * navigation of through character strings, regardless of how the 59 * character is stored. For example, an accented character might be 60 * stored as a base character and a diacritical mark. What users 61 * consider to be a character can differ between languages. 62 * <P> 63 * Title boundary analysis locates all positions, 64 * typically starts of words, that should be set to Title Case 65 * when title casing the text. 66 * <P> 67 * 68 * This is the interface for all text boundaries. 69 * <P> 70 * Examples: 71 * <P> 72 * Helper function to output text 73 * <pre> 74 * \code 75 * void printTextRange(UChar* str, int32_t start, int32_t end ) { 76 * UChar* result; 77 * UChar* temp; 78 * const char* res; 79 * temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1)); 80 * result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1)); 81 * u_strcpy(temp, &str[start]); 82 * u_strncpy(result, temp, end-start); 83 * res=(char*)malloc(sizeof(char) * (u_strlen(result)+1)); 84 * u_austrcpy(res, result); 85 * printf("%s\n", res); 86 * } 87 * \endcode 88 * </pre> 89 * Print each element in order: 90 * <pre> 91 * \code 92 * void printEachForward( UBreakIterator* boundary, UChar* str) { 93 * int32_t end; 94 * int32_t start = ubrk_first(boundary); 95 * for (end = ubrk_next(boundary)); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) { 96 * printTextRange(str, start, end ); 97 * } 98 * } 99 * \endcode 100 * </pre> 101 * Print each element in reverse order: 102 * <pre> 103 * \code 104 * void printEachBackward( UBreakIterator* boundary, UChar* str) { 105 * int32_t start; 106 * int32_t end = ubrk_last(boundary); 107 * for (start = ubrk_previous(boundary); start != UBRK_DONE; end = start, start =ubrk_previous(boundary)) { 108 * printTextRange( str, start, end ); 109 * } 110 * } 111 * \endcode 112 * </pre> 113 * Print first element 114 * <pre> 115 * \code 116 * void printFirst(UBreakIterator* boundary, UChar* str) { 117 * int32_t end; 118 * int32_t start = ubrk_first(boundary); 119 * end = ubrk_next(boundary); 120 * printTextRange( str, start, end ); 121 * } 122 * \endcode 123 * </pre> 124 * Print last element 125 * <pre> 126 * \code 127 * void printLast(UBreakIterator* boundary, UChar* str) { 128 * int32_t start; 129 * int32_t end = ubrk_last(boundary); 130 * start = ubrk_previous(boundary); 131 * printTextRange(str, start, end ); 132 * } 133 * \endcode 134 * </pre> 135 * Print the element at a specified position 136 * <pre> 137 * \code 138 * void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) { 139 * int32_t start; 140 * int32_t end = ubrk_following(boundary, pos); 141 * start = ubrk_previous(boundary); 142 * printTextRange(str, start, end ); 143 * } 144 * \endcode 145 * </pre> 146 * Creating and using text boundaries 147 * <pre> 148 * \code 149 * void BreakIterator_Example( void ) { 150 * UBreakIterator* boundary; 151 * UChar *stringToExamine; 152 * stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) ); 153 * u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff."); 154 * printf("Examining: "Aaa bbb ccc. Ddd eee fff."); 155 * 156 * //print each sentence in forward and reverse order 157 * boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status); 158 * printf("----- forward: -----------\n"); 159 * printEachForward(boundary, stringToExamine); 160 * printf("----- backward: ----------\n"); 161 * printEachBackward(boundary, stringToExamine); 162 * ubrk_close(boundary); 163 * 164 * //print each word in order 165 * boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status); 166 * printf("----- forward: -----------\n"); 167 * printEachForward(boundary, stringToExamine); 168 * printf("----- backward: ----------\n"); 169 * printEachBackward(boundary, stringToExamine); 170 * //print first element 171 * printf("----- first: -------------\n"); 172 * printFirst(boundary, stringToExamine); 173 * //print last element 174 * printf("----- last: --------------\n"); 175 * printLast(boundary, stringToExamine); 176 * //print word at charpos 10 177 * printf("----- at pos 10: ---------\n"); 178 * printAt(boundary, 10 , stringToExamine); 179 * 180 * ubrk_close(boundary); 181 * } 182 * \endcode 183 * </pre> 184 */ 185 186 /** The possible types of text boundaries. @stable ICU 2.0 */ 187 typedef enum UBreakIteratorType { 188 /** Character breaks @stable ICU 2.0 */ 189 UBRK_CHARACTER, 190 /** Word breaks @stable ICU 2.0 */ 191 UBRK_WORD, 192 /** Line breaks @stable ICU 2.0 */ 193 UBRK_LINE, 194 /** Sentence breaks @stable ICU 2.0 */ 195 UBRK_SENTENCE, 196 197 #ifndef U_HIDE_DEPRECATED_API 198 /** 199 * Title Case breaks 200 * The iterator created using this type locates title boundaries as described for 201 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, 202 * please use Word Boundary iterator. 203 * 204 * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later. 205 */ 206 UBRK_TITLE 207 #endif /* U_HIDE_DEPRECATED_API */ 208 209 } UBreakIteratorType; 210 211 /** Value indicating all text boundaries have been returned. 212 * @stable ICU 2.0 213 */ 214 #define UBRK_DONE ((int32_t) -1) 215 216 217 /** 218 * Enum constants for the word break tags returned by 219 * getRuleStatus(). A range of values is defined for each category of 220 * word, to allow for further subdivisions of a category in future releases. 221 * Applications should check for tag values falling within the range, rather 222 * than for single individual values. 223 * @stable ICU 2.2 224 */ 225 typedef enum UWordBreak { 226 /** Tag value for "words" that do not fit into any of other categories. 227 * Includes spaces and most punctuation. */ 228 UBRK_WORD_NONE = 0, 229 /** Upper bound for tags for uncategorized words. */ 230 UBRK_WORD_NONE_LIMIT = 100, 231 /** Tag value for words that appear to be numbers, lower limit. */ 232 UBRK_WORD_NUMBER = 100, 233 /** Tag value for words that appear to be numbers, upper limit. */ 234 UBRK_WORD_NUMBER_LIMIT = 200, 235 /** Tag value for words that contain letters, excluding 236 * hiragana, katakana or ideographic characters, lower limit. */ 237 UBRK_WORD_LETTER = 200, 238 /** Tag value for words containing letters, upper limit */ 239 UBRK_WORD_LETTER_LIMIT = 300, 240 /** Tag value for words containing kana characters, lower limit */ 241 UBRK_WORD_KANA = 300, 242 /** Tag value for words containing kana characters, upper limit */ 243 UBRK_WORD_KANA_LIMIT = 400, 244 /** Tag value for words containing ideographic characters, lower limit */ 245 UBRK_WORD_IDEO = 400, 246 /** Tag value for words containing ideographic characters, upper limit */ 247 UBRK_WORD_IDEO_LIMIT = 500 248 } UWordBreak; 249 250 /** 251 * Enum constants for the line break tags returned by getRuleStatus(). 252 * A range of values is defined for each category of 253 * word, to allow for further subdivisions of a category in future releases. 254 * Applications should check for tag values falling within the range, rather 255 * than for single individual values. 256 * @draft ICU 2.8 257 */ 258 typedef enum ULineBreakTag { 259 /** Tag value for soft line breaks, positions at which a line break 260 * is acceptable but not required */ 261 UBRK_LINE_SOFT = 0, 262 /** Upper bound for soft line breaks. */ 263 UBRK_LINE_SOFT_LIMIT = 100, 264 /** Tag value for a hard, or mandatory line break */ 265 UBRK_LINE_HARD = 100, 266 /** Upper bound for hard line breaks. */ 267 UBRK_LINE_HARD_LIMIT = 200 268 } ULineBreakTag; 269 270 271 272 /** 273 * Enum constants for the sentence break tags returned by getRuleStatus(). 274 * A range of values is defined for each category of 275 * sentence, to allow for further subdivisions of a category in future releases. 276 * Applications should check for tag values falling within the range, rather 277 * than for single individual values. 278 * @draft ICU 2.8 279 */ 280 typedef enum USentenceBreakTag { 281 /** Tag value for for sentences ending with a sentence terminator 282 * ('.', '?', '!', etc.) character, possibly followed by a 283 * hard separator (CR, LF, PS, etc.) 284 */ 285 UBRK_SENTENCE_TERM = 0, 286 /** Upper bound for tags for sentences ended by sentence terminators. */ 287 UBRK_SENTENCE_TERM_LIMIT = 100, 288 /** Tag value for for sentences that do not contain an ending 289 * sentence terminator ('.', '?', '!', etc.) character, but 290 * are ended only by a hard separator (CR, LF, PS, etc.) or end of input. 291 */ 292 UBRK_SENTENCE_SEP = 100, 293 /** Upper bound for tags for sentences ended by a separator. */ 294 UBRK_SENTENCE_SEP_LIMIT = 200 295 /** Tag value for a hard, or mandatory line break */ 296 } USentenceBreakTag; 297 298 299 /** 300 * Open a new UBreakIterator for locating text boundaries for a specified locale. 301 * A UBreakIterator may be used for detecting character, line, word, 302 * and sentence breaks in text. 303 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD, 304 * UBRK_LINE, UBRK_SENTENCE 305 * @param locale The locale specifying the text-breaking conventions. 306 * @param text The text to be iterated over. 307 * @param textLength The number of characters in text, or -1 if null-terminated. 308 * @param status A UErrorCode to receive any errors. 309 * @return A UBreakIterator for the specified locale. 310 * @see ubrk_openRules 311 * @stable ICU 2.0 312 */ 313 U_STABLE UBreakIterator* U_EXPORT2 314 ubrk_open(UBreakIteratorType type, 315 const char *locale, 316 const UChar *text, 317 int32_t textLength, 318 UErrorCode *status); 319 320 /** 321 * Open a new UBreakIterator for locating text boundaries using specified breaking rules. 322 * The rule syntax is ... (TBD) 323 * @param rules A set of rules specifying the text breaking conventions. 324 * @param rulesLength The number of characters in rules, or -1 if null-terminated. 325 * @param text The text to be iterated over. May be null, in which case ubrk_setText() is 326 * used to specify the text to be iterated. 327 * @param textLength The number of characters in text, or -1 if null-terminated. 328 * @param parseErr Receives position and context information for any syntax errors 329 * detected while parsing the rules. 330 * @param status A UErrorCode to receive any errors. 331 * @return A UBreakIterator for the specified rules. 332 * @see ubrk_open 333 * @stable ICU 2.2 334 */ 335 U_STABLE UBreakIterator* U_EXPORT2 336 ubrk_openRules(const UChar *rules, 337 int32_t rulesLength, 338 const UChar *text, 339 int32_t textLength, 340 UParseError *parseErr, 341 UErrorCode *status); 342 343 /** 344 * Thread safe cloning operation 345 * @param bi iterator to be cloned 346 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. 347 * If buffer is not large enough, new memory will be allocated. 348 * Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations. 349 * @param pBufferSize pointer to size of allocated space. 350 * If *pBufferSize == 0, a sufficient size for use in cloning will 351 * be returned ('pre-flighting') 352 * If *pBufferSize is not enough for a stack-based safe clone, 353 * new memory will be allocated. 354 * @param status to indicate whether the operation went on smoothly or there were errors 355 * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. 356 * @return pointer to the new clone 357 * @stable ICU 2.0 358 */ 359 U_STABLE UBreakIterator * U_EXPORT2 360 ubrk_safeClone( 361 const UBreakIterator *bi, 362 void *stackBuffer, 363 int32_t *pBufferSize, 364 UErrorCode *status); 365 366 /** 367 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_saveClone(). 368 * @stable ICU 2.0 369 */ 370 #define U_BRK_SAFECLONE_BUFFERSIZE 512 371 372 /** 373 * Close a UBreakIterator. 374 * Once closed, a UBreakIterator may no longer be used. 375 * @param bi The break iterator to close. 376 * @stable ICU 2.0 377 */ 378 U_STABLE void U_EXPORT2 379 ubrk_close(UBreakIterator *bi); 380 381 /** 382 * Sets an existing iterator to point to a new piece of text 383 * @param bi The iterator to use 384 * @param text The text to be set 385 * @param textLength The length of the text 386 * @param status The error code 387 * @stable ICU 2.0 388 */ 389 U_STABLE void U_EXPORT2 390 ubrk_setText(UBreakIterator* bi, 391 const UChar* text, 392 int32_t textLength, 393 UErrorCode* status); 394 395 /** 396 * Determine the most recently-returned text boundary. 397 * 398 * @param bi The break iterator to use. 399 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous, 400 * \ref ubrk_first, or \ref ubrk_last. 401 * @stable ICU 2.0 402 */ 403 U_STABLE int32_t U_EXPORT2 404 ubrk_current(const UBreakIterator *bi); 405 406 /** 407 * Determine the text boundary following the current text boundary. 408 * 409 * @param bi The break iterator to use. 410 * @return The character index of the next text boundary, or UBRK_DONE 411 * if all text boundaries have been returned. 412 * @see ubrk_previous 413 * @stable ICU 2.0 414 */ 415 U_STABLE int32_t U_EXPORT2 416 ubrk_next(UBreakIterator *bi); 417 418 /** 419 * Determine the text boundary preceding the current text boundary. 420 * 421 * @param bi The break iterator to use. 422 * @return The character index of the preceding text boundary, or UBRK_DONE 423 * if all text boundaries have been returned. 424 * @see ubrk_next 425 * @stable ICU 2.0 426 */ 427 U_STABLE int32_t U_EXPORT2 428 ubrk_previous(UBreakIterator *bi); 429 430 /** 431 * Determine the index of the first character in the text being scanned. 432 * This is not always the same as index 0 of the text. 433 * @param bi The break iterator to use. 434 * @return The character index of the first character in the text being scanned. 435 * @see ubrk_last 436 * @stable ICU 2.0 437 */ 438 U_STABLE int32_t U_EXPORT2 439 ubrk_first(UBreakIterator *bi); 440 441 /** 442 * Determine the index immediately <EM>beyond</EM> the last character in the text being 443 * scanned. 444 * This is not the same as the last character. 445 * @param bi The break iterator to use. 446 * @return The character offset immediately <EM>beyond</EM> the last character in the 447 * text being scanned. 448 * @see ubrk_first 449 * @stable ICU 2.0 450 */ 451 U_STABLE int32_t U_EXPORT2 452 ubrk_last(UBreakIterator *bi); 453 454 /** 455 * Determine the text boundary preceding the specified offset. 456 * The value returned is always smaller than offset, or UBRK_DONE. 457 * @param bi The break iterator to use. 458 * @param offset The offset to begin scanning. 459 * @return The text boundary preceding offset, or UBRK_DONE. 460 * @see ubrk_following 461 * @stable ICU 2.0 462 */ 463 U_STABLE int32_t U_EXPORT2 464 ubrk_preceding(UBreakIterator *bi, 465 int32_t offset); 466 467 /** 468 * Determine the text boundary following the specified offset. 469 * The value returned is always greater than offset, or UBRK_DONE. 470 * @param bi The break iterator to use. 471 * @param offset The offset to begin scanning. 472 * @return The text boundary following offset, or UBRK_DONE. 473 * @see ubrk_preceding 474 * @stable ICU 2.0 475 */ 476 U_STABLE int32_t U_EXPORT2 477 ubrk_following(UBreakIterator *bi, 478 int32_t offset); 479 480 /** 481 * Get a locale for which text breaking information is available. 482 * A UBreakIterator in a locale returned by this function will perform the correct 483 * text breaking for the locale. 484 * @param index The index of the desired locale. 485 * @return A locale for which number text breaking information is available, or 0 if none. 486 * @see ubrk_countAvailable 487 * @stable ICU 2.0 488 */ 489 U_STABLE const char* U_EXPORT2 490 ubrk_getAvailable(int32_t index); 491 492 /** 493 * Determine how many locales have text breaking information available. 494 * This function is most useful as determining the loop ending condition for 495 * calls to \ref ubrk_getAvailable. 496 * @return The number of locales for which text breaking information is available. 497 * @see ubrk_getAvailable 498 * @stable ICU 2.0 499 */ 500 U_STABLE int32_t U_EXPORT2 501 ubrk_countAvailable(void); 502 503 504 /** 505 * Returns true if the specfied position is a boundary position. As a side 506 * effect, leaves the iterator pointing to the first boundary position at 507 * or after "offset". 508 * @param bi The break iterator to use. 509 * @param offset the offset to check. 510 * @return True if "offset" is a boundary position. 511 * @stable ICU 2.0 512 */ 513 U_STABLE UBool U_EXPORT2 514 ubrk_isBoundary(UBreakIterator *bi, int32_t offset); 515 516 /** 517 * Return the status from the break rule that determined the most recently 518 * returned break position. The values appear in the rule source 519 * within brackets, {123}, for example. For rules that do not specify a 520 * status, a default value of 0 is returned. 521 * <p> 522 * For word break iterators, the possible values are defined in enum UWordBreak. 523 * @stable ICU 2.2 524 */ 525 U_STABLE int32_t U_EXPORT2 526 ubrk_getRuleStatus(UBreakIterator *bi); 527 528 /** 529 * Get the statuses from the break rules that determined the most recently 530 * returned break position. The values appear in the rule source 531 * within brackets, {123}, for example. The default status value for rules 532 * that do not explicitly provide one is zero. 533 * <p> 534 * For word break iterators, the possible values are defined in enum UWordBreak. 535 * @param bi The break iterator to use 536 * @param fillInVec an array to be filled in with the status values. 537 * @param capacity the length of the supplied vector. A length of zero causes 538 * the function to return the number of status values, in the 539 * normal way, without attemtping to store any values. 540 * @param status receives error codes. 541 * @return The number of rule status values from rules that determined 542 * the most recent boundary returned by the break iterator. 543 * @draft ICU 3.0 544 */ 545 U_DRAFT int32_t U_EXPORT2 546 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status); 547 548 /** 549 * Return the locale of the break iterator. You can choose between the valid and 550 * the actual locale. 551 * @param bi break iterator 552 * @param type locale type (valid or actual) 553 * @param status error code 554 * @return locale string 555 * @draft ICU 2.8 likely to change in ICU 3.0, based on feedback 556 */ 557 U_DRAFT const char* U_EXPORT2 558 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status); 559 560 561 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 562 563 #endif 564