• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 * Copyright (C) 1996-2004, International Business Machines Corporation and others. All Rights Reserved.
3 *****************************************************************************************
4 */
5 
6 #ifndef UBRK_H
7 #define UBRK_H
8 
9 #include "unicode/utypes.h"
10 #include "unicode/uloc.h"
11 
/**
 * A text-break iterator.
 *  For usage in C programs.
 */
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
    /* Defining the guard macro records that the typedef below has been
     * emitted; if the macro is already defined when this header is read,
     * the typedef is skipped. */
#   define UBRK_TYPEDEF_UBREAK_ITERATOR
    /**
     *  Opaque type representing an ICU Break iterator object.
     *  @stable ICU 2.0
     */
    typedef void UBreakIterator;
#endif
24 
25 #if !UCONFIG_NO_BREAK_ITERATION
26 
27 #include "unicode/parseerr.h"
28 
29 /**
30  * \file
31  * \brief C API: BreakIterator
32  *
33  * <h2> BreakIterator C API </h2>
34  *
 * The BreakIterator C API defines methods for finding the location
 * of boundaries in text. A UBreakIterator maintains a current
 * position and scans over text, returning the index of characters
 * where boundaries occur.
39  * <P>
40  * Line boundary analysis determines where a text string can be broken
41  * when line-wrapping. The mechanism correctly handles punctuation and
42  * hyphenated words.
43  * <P>
44  * Sentence boundary analysis allows selection with correct
45  * interpretation of periods within numbers and abbreviations, and
46  * trailing punctuation marks such as quotation marks and parentheses.
47  * <P>
48  * Word boundary analysis is used by search and replace functions, as
49  * well as within text editing applications that allow the user to
50  * select words with a double click. Word selection provides correct
51  * interpretation of punctuation marks within and following
52  * words. Characters that are not part of a word, such as symbols or
53  * punctuation marks, have word-breaks on both sides.
54  * <P>
 * Character boundary analysis allows users to interact with
 * characters as they expect to, for example, when moving the cursor
 * through a text string. Character boundary analysis provides correct
 * navigation through character strings, regardless of how the
 * character is stored.  For example, an accented character might be
 * stored as a base character and a diacritical mark. What users
 * consider to be a character can differ between languages.
62  * <P>
63  * Title boundary analysis locates all positions,
64  * typically starts of words, that should be set to Title Case
65  * when title casing the text.
66  * <P>
67  *
68  * This is the interface for all text boundaries.
69  * <P>
70  * Examples:
71  * <P>
72  * Helper function to output text
73  * <pre>
74  * \code
 *    void printTextRange(UChar* str, int32_t start, int32_t end ) {
 *         UChar* result;
 *         UChar* temp;
 *         char* res;
 *         temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1));
 *         result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1));
 *         u_strcpy(temp, &str[start]);
 *         u_strncpy(result, temp, end-start);
 *         result[end-start]=0;
 *         res=(char*)malloc(sizeof(char) * (u_strlen(result)+1));
 *         u_austrcpy(res, result);
 *         printf("%s\n", res);
 *         free(res);
 *         free(result);
 *         free(temp);
 *    }
87  * \endcode
88  * </pre>
89  * Print each element in order:
90  * <pre>
91  * \code
 *    void printEachForward( UBreakIterator* boundary, UChar* str) {
 *       int32_t end;
 *       int32_t start = ubrk_first(boundary);
 *       for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {
 *             printTextRange(str, start, end );
 *         }
 *    }
99  * \endcode
100  * </pre>
101  * Print each element in reverse order:
102  * <pre>
103  * \code
104  *    void printEachBackward( UBreakIterator* boundary, UChar* str) {
105  *       int32_t start;
106  *       int32_t end = ubrk_last(boundary);
107  *       for (start = ubrk_previous(boundary); start != UBRK_DONE;  end = start, start =ubrk_previous(boundary)) {
108  *             printTextRange( str, start, end );
109  *         }
110  *    }
111  * \endcode
112  * </pre>
113  * Print first element
114  * <pre>
115  * \code
116  *    void printFirst(UBreakIterator* boundary, UChar* str) {
117  *        int32_t end;
118  *        int32_t start = ubrk_first(boundary);
119  *        end = ubrk_next(boundary);
120  *        printTextRange( str, start, end );
121  *    }
122  * \endcode
123  * </pre>
124  * Print last element
125  * <pre>
126  * \code
127  *    void printLast(UBreakIterator* boundary, UChar* str) {
128  *        int32_t start;
129  *        int32_t end = ubrk_last(boundary);
130  *        start = ubrk_previous(boundary);
131  *        printTextRange(str, start, end );
132  *    }
133  * \endcode
134  * </pre>
135  * Print the element at a specified position
136  * <pre>
137  * \code
138  *    void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) {
139  *        int32_t start;
140  *        int32_t end = ubrk_following(boundary, pos);
141  *        start = ubrk_previous(boundary);
142  *        printTextRange(str, start, end );
143  *    }
144  * \endcode
145  * </pre>
146  * Creating and using text boundaries
147  * <pre>
148  * \code
 *       void BreakIterator_Example( void ) {
 *           UBreakIterator* boundary;
 *           UChar *stringToExamine;
 *           UErrorCode status = U_ZERO_ERROR;
 *           stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) );
 *           u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff.");
 *           printf("Examining: Aaa bbb ccc. Ddd eee fff.\n");
155  *
156  *           //print each sentence in forward and reverse order
157  *           boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
158  *           printf("----- forward: -----------\n");
159  *           printEachForward(boundary, stringToExamine);
160  *           printf("----- backward: ----------\n");
161  *           printEachBackward(boundary, stringToExamine);
162  *           ubrk_close(boundary);
163  *
164  *           //print each word in order
165  *           boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
166  *           printf("----- forward: -----------\n");
167  *           printEachForward(boundary, stringToExamine);
168  *           printf("----- backward: ----------\n");
169  *           printEachBackward(boundary, stringToExamine);
170  *           //print first element
171  *           printf("----- first: -------------\n");
172  *           printFirst(boundary, stringToExamine);
173  *           //print last element
174  *           printf("----- last: --------------\n");
175  *           printLast(boundary, stringToExamine);
176  *           //print word at charpos 10
177  *           printf("----- at pos 10: ---------\n");
178  *           printAt(boundary, 10 , stringToExamine);
179  *
 *           ubrk_close(boundary);
 *           free(stringToExamine);
 *       }
182  * \endcode
183  * </pre>
184  */
185 
/**
 * The possible types of text boundaries.
 *
 * Values are spelled out explicitly so that the numeric value of each
 * constant does not depend on its position in the list.
 * @stable ICU 2.0
 */
typedef enum UBreakIteratorType {
  /** Character breaks  @stable ICU 2.0 */
  UBRK_CHARACTER = 0,
  /** Word breaks @stable ICU 2.0 */
  UBRK_WORD = 1,
  /** Line breaks @stable ICU 2.0 */
  UBRK_LINE = 2,
  /** Sentence breaks @stable ICU 2.0 */
  UBRK_SENTENCE = 3

#ifndef U_HIDE_DEPRECATED_API
  /* The separating comma lives inside the conditional so that the
   * enumerator list has no trailing comma (invalid in C89 / C++98)
   * when the deprecated API is hidden. */
  ,
  /**
   * Title Case breaks
   * The iterator created using this type locates title boundaries as described for
   * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
   * please use Word Boundary iterator.
   *
   * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
   */
  UBRK_TITLE = 4
#endif /* U_HIDE_DEPRECATED_API */

} UBreakIteratorType;
210 
/**
 * Value indicating all text boundaries have been returned.
 * @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t) -1)
215 
216 
/**
 *  Enum constants for the word break tags returned by
 *  ubrk_getRuleStatus().  A range of values is defined for each category of
 *  word, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *
 *  Each *_LIMIT constant has the same value as the lower bound of the next
 *  category, so a tag belongs to a category when
 *  lower bound <= tag < limit.
 *  @stable ICU 2.2
 */
typedef enum UWordBreak {
    /** Tag value for "words" that do not fit into any of other categories.
     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE           = 0,
    /** Upper bound for tags for uncategorized words. */
    UBRK_WORD_NONE_LIMIT     = 100,
    /** Tag value for words that appear to be numbers, lower limit.    */
    UBRK_WORD_NUMBER         = 100,
    /** Tag value for words that appear to be numbers, upper limit.    */
    UBRK_WORD_NUMBER_LIMIT   = 200,
    /** Tag value for words that contain letters, excluding
     *  hiragana, katakana or ideographic characters, lower limit.    */
    UBRK_WORD_LETTER         = 200,
    /** Tag value for words containing letters, upper limit  */
    UBRK_WORD_LETTER_LIMIT   = 300,
    /** Tag value for words containing kana characters, lower limit */
    UBRK_WORD_KANA           = 300,
    /** Tag value for words containing kana characters, upper limit */
    UBRK_WORD_KANA_LIMIT     = 400,
    /** Tag value for words containing ideographic characters, lower limit */
    UBRK_WORD_IDEO           = 400,
    /** Tag value for words containing ideographic characters, upper limit */
    UBRK_WORD_IDEO_LIMIT     = 500
} UWordBreak;
249 
/**
 *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  line break, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  @draft ICU 2.8
 */
typedef enum ULineBreakTag {
    /** Tag value for soft line breaks, positions at which a line break
      *  is acceptable but not required                */
    UBRK_LINE_SOFT            = 0,
    /** Upper bound for soft line breaks.              */
    UBRK_LINE_SOFT_LIMIT      = 100,
    /** Tag value for a hard, or mandatory line break  */
    UBRK_LINE_HARD            = 100,
    /** Upper bound for hard line breaks.              */
    UBRK_LINE_HARD_LIMIT      = 200
} ULineBreakTag;
269 
270 
271 
/**
 *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  sentence, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  @draft ICU 2.8
 */
typedef enum USentenceBreakTag {
    /** Tag value for sentences ending with a sentence terminator
      * ('.', '?', '!', etc.) character, possibly followed by a
      * hard separator (CR, LF, PS, etc.)
      */
    UBRK_SENTENCE_TERM       = 0,
    /** Upper bound for tags for sentences ended by sentence terminators.    */
    UBRK_SENTENCE_TERM_LIMIT = 100,
    /** Tag value for sentences that do not contain an ending
      * sentence terminator ('.', '?', '!', etc.) character, but
      * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
      */
    UBRK_SENTENCE_SEP        = 100,
    /** Upper bound for tags for sentences ended by a separator.              */
    UBRK_SENTENCE_SEP_LIMIT  = 200
} USentenceBreakTag;
297 
298 
299 /**
300  * Open a new UBreakIterator for locating text boundaries for a specified locale.
301  * A UBreakIterator may be used for detecting character, line, word,
302  * and sentence breaks in text.
303  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
304  * UBRK_LINE, UBRK_SENTENCE
305  * @param locale The locale specifying the text-breaking conventions.
306  * @param text The text to be iterated over.
307  * @param textLength The number of characters in text, or -1 if null-terminated.
308  * @param status A UErrorCode to receive any errors.
309  * @return A UBreakIterator for the specified locale.
310  * @see ubrk_openRules
311  * @stable ICU 2.0
312  */
313 U_STABLE UBreakIterator* U_EXPORT2
314 ubrk_open(UBreakIteratorType type,
315       const char *locale,
316       const UChar *text,
317       int32_t textLength,
318       UErrorCode *status);
319 
320 /**
321  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
322  * The rule syntax is ... (TBD)
323  * @param rules A set of rules specifying the text breaking conventions.
324  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
325  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
326  *        used to specify the text to be iterated.
327  * @param textLength The number of characters in text, or -1 if null-terminated.
328  * @param parseErr   Receives position and context information for any syntax errors
329  *                   detected while parsing the rules.
330  * @param status A UErrorCode to receive any errors.
331  * @return A UBreakIterator for the specified rules.
332  * @see ubrk_open
333  * @stable ICU 2.2
334  */
335 U_STABLE UBreakIterator* U_EXPORT2
336 ubrk_openRules(const UChar     *rules,
337                int32_t         rulesLength,
338                const UChar     *text,
339                int32_t          textLength,
340                UParseError     *parseErr,
341                UErrorCode      *status);
342 
343 /**
344  * Thread safe cloning operation
345  * @param bi iterator to be cloned
346  * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
347  *  If buffer is not large enough, new memory will be allocated.
348  *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
349  * @param pBufferSize pointer to size of allocated space.
350  *  If *pBufferSize == 0, a sufficient size for use in cloning will
351  *  be returned ('pre-flighting')
352  *  If *pBufferSize is not enough for a stack-based safe clone,
353  *  new memory will be allocated.
354  * @param status to indicate whether the operation went on smoothly or there were errors
355  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
356  * @return pointer to the new clone
357  * @stable ICU 2.0
358  */
359 U_STABLE UBreakIterator * U_EXPORT2
360 ubrk_safeClone(
361           const UBreakIterator *bi,
362           void *stackBuffer,
363           int32_t *pBufferSize,
364           UErrorCode *status);
365 
/**
 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
 * @stable ICU 2.0
 */
#define U_BRK_SAFECLONE_BUFFERSIZE 512
371 
372 /**
373 * Close a UBreakIterator.
374 * Once closed, a UBreakIterator may no longer be used.
375 * @param bi The break iterator to close.
376  * @stable ICU 2.0
377 */
378 U_STABLE void U_EXPORT2
379 ubrk_close(UBreakIterator *bi);
380 
381 /**
382  * Sets an existing iterator to point to a new piece of text
383  * @param bi The iterator to use
384  * @param text The text to be set
385  * @param textLength The length of the text
386  * @param status The error code
387  * @stable ICU 2.0
388  */
389 U_STABLE void U_EXPORT2
390 ubrk_setText(UBreakIterator* bi,
391              const UChar*    text,
392              int32_t         textLength,
393              UErrorCode*     status);
394 
395 /**
396  * Determine the most recently-returned text boundary.
397  *
398  * @param bi The break iterator to use.
399  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
400  * \ref ubrk_first, or \ref ubrk_last.
401  * @stable ICU 2.0
402  */
403 U_STABLE int32_t U_EXPORT2
404 ubrk_current(const UBreakIterator *bi);
405 
406 /**
407  * Determine the text boundary following the current text boundary.
408  *
409  * @param bi The break iterator to use.
410  * @return The character index of the next text boundary, or UBRK_DONE
411  * if all text boundaries have been returned.
412  * @see ubrk_previous
413  * @stable ICU 2.0
414  */
415 U_STABLE int32_t U_EXPORT2
416 ubrk_next(UBreakIterator *bi);
417 
418 /**
419  * Determine the text boundary preceding the current text boundary.
420  *
421  * @param bi The break iterator to use.
422  * @return The character index of the preceding text boundary, or UBRK_DONE
423  * if all text boundaries have been returned.
424  * @see ubrk_next
425  * @stable ICU 2.0
426  */
427 U_STABLE int32_t U_EXPORT2
428 ubrk_previous(UBreakIterator *bi);
429 
430 /**
431  * Determine the index of the first character in the text being scanned.
432  * This is not always the same as index 0 of the text.
433  * @param bi The break iterator to use.
434  * @return The character index of the first character in the text being scanned.
435  * @see ubrk_last
436  * @stable ICU 2.0
437  */
438 U_STABLE int32_t U_EXPORT2
439 ubrk_first(UBreakIterator *bi);
440 
441 /**
442  * Determine the index immediately <EM>beyond</EM> the last character in the text being
443  * scanned.
444  * This is not the same as the last character.
445  * @param bi The break iterator to use.
446  * @return The character offset immediately <EM>beyond</EM> the last character in the
447  * text being scanned.
448  * @see ubrk_first
449  * @stable ICU 2.0
450  */
451 U_STABLE int32_t U_EXPORT2
452 ubrk_last(UBreakIterator *bi);
453 
454 /**
455  * Determine the text boundary preceding the specified offset.
456  * The value returned is always smaller than offset, or UBRK_DONE.
457  * @param bi The break iterator to use.
458  * @param offset The offset to begin scanning.
459  * @return The text boundary preceding offset, or UBRK_DONE.
460  * @see ubrk_following
461  * @stable ICU 2.0
462  */
463 U_STABLE int32_t U_EXPORT2
464 ubrk_preceding(UBreakIterator *bi,
465            int32_t offset);
466 
467 /**
468  * Determine the text boundary following the specified offset.
469  * The value returned is always greater than offset, or UBRK_DONE.
470  * @param bi The break iterator to use.
471  * @param offset The offset to begin scanning.
472  * @return The text boundary following offset, or UBRK_DONE.
473  * @see ubrk_preceding
474  * @stable ICU 2.0
475  */
476 U_STABLE int32_t U_EXPORT2
477 ubrk_following(UBreakIterator *bi,
478            int32_t offset);
479 
480 /**
481 * Get a locale for which text breaking information is available.
482 * A UBreakIterator in a locale returned by this function will perform the correct
483 * text breaking for the locale.
484 * @param index The index of the desired locale.
485 * @return A locale for which number text breaking information is available, or 0 if none.
486 * @see ubrk_countAvailable
487 * @stable ICU 2.0
488 */
489 U_STABLE const char* U_EXPORT2
490 ubrk_getAvailable(int32_t index);
491 
492 /**
493 * Determine how many locales have text breaking information available.
494 * This function is most useful as determining the loop ending condition for
495 * calls to \ref ubrk_getAvailable.
496 * @return The number of locales for which text breaking information is available.
497 * @see ubrk_getAvailable
498 * @stable ICU 2.0
499 */
500 U_STABLE int32_t U_EXPORT2
501 ubrk_countAvailable(void);
502 
503 
504 /**
505 * Returns true if the specfied position is a boundary position.  As a side
506 * effect, leaves the iterator pointing to the first boundary position at
507 * or after "offset".
508 * @param bi The break iterator to use.
509 * @param offset the offset to check.
510 * @return True if "offset" is a boundary position.
511 * @stable ICU 2.0
512 */
513 U_STABLE  UBool U_EXPORT2
514 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
515 
516 /**
517  * Return the status from the break rule that determined the most recently
518  * returned break position.  The values appear in the rule source
519  * within brackets, {123}, for example.  For rules that do not specify a
520  * status, a default value of 0 is returned.
521  * <p>
522  * For word break iterators, the possible values are defined in enum UWordBreak.
523  * @stable ICU 2.2
524  */
525 U_STABLE  int32_t U_EXPORT2
526 ubrk_getRuleStatus(UBreakIterator *bi);
527 
528 /**
529  * Get the statuses from the break rules that determined the most recently
530  * returned break position.  The values appear in the rule source
531  * within brackets, {123}, for example.  The default status value for rules
532  * that do not explicitly provide one is zero.
533  * <p>
534  * For word break iterators, the possible values are defined in enum UWordBreak.
535  * @param bi        The break iterator to use
536  * @param fillInVec an array to be filled in with the status values.
537  * @param capacity  the length of the supplied vector.  A length of zero causes
538  *                  the function to return the number of status values, in the
539  *                  normal way, without attemtping to store any values.
540  * @param status    receives error codes.
541  * @return          The number of rule status values from rules that determined
542  *                  the most recent boundary returned by the break iterator.
543  * @draft ICU 3.0
544  */
545 U_DRAFT  int32_t U_EXPORT2
546 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
547 
548 /**
549  * Return the locale of the break iterator. You can choose between the valid and
550  * the actual locale.
551  * @param bi break iterator
552  * @param type locale type (valid or actual)
553  * @param status error code
554  * @return locale string
555  * @draft ICU 2.8 likely to change in ICU 3.0, based on feedback
556  */
557 U_DRAFT const char* U_EXPORT2
558 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
559 
560 
561 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
562 
563 #endif
564