• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines Corporation and others.
6 * All Rights Reserved.
7 ******************************************************************************
8 */
9 
10 #ifndef UBRK_H
11 #define UBRK_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uloc.h"
15 #include "unicode/utext.h"
16 
17 #if U_SHOW_CPLUSPLUS_API
18 #include "unicode/localpointer.h"
19 #endif   // U_SHOW_CPLUSPLUS_API
20 
/**
 * A text-break iterator.
 * For usage in C programs.
 */
#if !defined(UBRK_TYPEDEF_UBREAK_ITERATOR)
#define UBRK_TYPEDEF_UBREAK_ITERATOR
/**
 * Opaque type representing an ICU Break iterator object.
 * The typedef is wrapped in the UBRK_TYPEDEF_UBREAK_ITERATOR guard so that
 * it is emitted only when that macro has not been defined yet.
 * @stable ICU 2.0
 */
typedef struct UBreakIterator UBreakIterator;
#endif /* UBRK_TYPEDEF_UBREAK_ITERATOR */
33 
34 #include "unicode/parseerr.h"
35 
36 #if !UCONFIG_NO_BREAK_ITERATION
37 /**
38  * \file
39  * \brief C API: BreakIterator
40  *
41  * <h2> BreakIterator C API </h2>
42  *
 * The BreakIterator C API defines methods for finding the location
 * of boundaries in text. A pointer to a UBreakIterator maintains a
 * current position and scans over text, returning the index of characters
 * where boundaries occur.
47  * <p>
48  * Line boundary analysis determines where a text string can be broken
49  * when line-wrapping. The mechanism correctly handles punctuation and
50  * hyphenated words.
51  * <p>
52  * Note: The locale keyword "lb" can be used to modify line break
53  * behavior according to the CSS level 3 line-break options, see
54  * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
55  * "ja@lb=strict", "zh@lb=loose".
56  * <p>
57  * Sentence boundary analysis allows selection with correct
58  * interpretation of periods within numbers and abbreviations, and
59  * trailing punctuation marks such as quotation marks and parentheses.
60  * <p>
61  * Note: The locale keyword "ss" can be used to enable use of
62  * segmentation suppression data (preventing breaks in English after
63  * abbreviations such as "Mr." or "Est.", for example), as follows:
64  * "en@ss=standard".
65  * <p>
66  * Word boundary analysis is used by search and replace functions, as
67  * well as within text editing applications that allow the user to
68  * select words with a double click. Word selection provides correct
69  * interpretation of punctuation marks within and following
70  * words. Characters that are not part of a word, such as symbols or
71  * punctuation marks, have word-breaks on both sides.
72  * <p>
73  * Character boundary analysis identifies the boundaries of
74  * "Extended Grapheme Clusters", which are groupings of codepoints
75  * that should be treated as character-like units for many text operations.
76  * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
77  * http://www.unicode.org/reports/tr29/ for additional information
78  * on grapheme clusters and guidelines on their use.
79  * <p>
80  * Title boundary analysis locates all positions,
81  * typically starts of words, that should be set to Title Case
82  * when title casing the text.
83  * <p>
 * The text boundary positions are found according to the rules
 * described in Unicode Standard Annex #29, Unicode Text Segmentation, and
 * Unicode Standard Annex #14, Unicode Line Breaking Algorithm.  These
 * are available at http://www.unicode.org/reports/tr29/ and
 * http://www.unicode.org/reports/tr14/.
89  * <p>
90  * In addition to the plain C API defined in this header file, an
91  * object oriented C++ API with equivalent functionality is defined in the
92  * file brkiter.h.
93  * <p>
94  * Code snippets illustrating the use of the Break Iterator APIs
95  * are available in the ICU User Guide,
96  * https://unicode-org.github.io/icu/userguide/boundaryanalysis/
97  * and in the sample program icu/source/samples/break/break.cpp
98  */
99 
/**
 * The possible types of text boundaries.
 * Used as the `type` argument of ubrk_open() to select which kind of
 * boundary analysis the iterator performs.
 * @stable ICU 2.0
 */
typedef enum UBreakIteratorType {
  /** Character breaks  @stable ICU 2.0 */
  UBRK_CHARACTER = 0,
  /** Word breaks @stable ICU 2.0 */
  UBRK_WORD = 1,
  /** Line breaks @stable ICU 2.0 */
  UBRK_LINE = 2,
  /** Sentence breaks @stable ICU 2.0 */
  UBRK_SENTENCE = 3  /* last enumerator: no trailing comma, which C89 and pre-C++11 compilers reject in pedantic mode */
} UBreakIteratorType;
111 
/**
 * Value indicating that all text boundaries have been returned.
 * Expands to -1 converted to int32_t.
 * @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t)(-1))
116 
117 
/**
 * Word break tags, as returned by getRuleStatus().
 *
 * Each category of word owns a range of values rather than a single value,
 * so that a category can be subdivided further in future releases.
 * Applications should therefore test whether a tag falls within a
 * category's range instead of comparing against one individual value.
 * Note that each *_LIMIT constant has the same value as the lower limit of
 * the following category.
 *
 * The numeric values of all of these constants are stable (will not change).
 *
 * @stable ICU 2.2
 */
typedef enum UWordBreak {
    /** "Words" that fit into none of the other categories, lower limit.
     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE = 0,
    /** Uncategorized words, upper bound. */
    UBRK_WORD_NONE_LIMIT = 100,

    /** Words that appear to be numbers, lower limit. */
    UBRK_WORD_NUMBER = 100,
    /** Words that appear to be numbers, upper limit. */
    UBRK_WORD_NUMBER_LIMIT = 200,

    /** Words that contain letters, excluding hiragana, katakana or
     *  ideographic characters, lower limit. */
    UBRK_WORD_LETTER = 200,
    /** Words containing letters, upper limit. */
    UBRK_WORD_LETTER_LIMIT = 300,

    /** Words containing kana characters, lower limit. */
    UBRK_WORD_KANA = 300,
    /** Words containing kana characters, upper limit. */
    UBRK_WORD_KANA_LIMIT = 400,

    /** Words containing ideographic characters, lower limit. */
    UBRK_WORD_IDEO = 400,
    /** Words containing ideographic characters, upper limit. */
    UBRK_WORD_IDEO_LIMIT = 500
} UWordBreak;
153 
/**
 * Line break tags, as returned by getRuleStatus().
 *
 * Each category of line break owns a range of values rather than a single
 * value, so that a category can be subdivided further in future releases.
 * Applications should therefore test whether a tag falls within a
 * category's range instead of comparing against one individual value.
 *
 * The numeric values of all of these constants are stable (will not change).
 *
 * @stable ICU 2.8
 */
typedef enum ULineBreakTag {
    /** Soft line breaks: positions at which a line break is acceptable
     *  but not required. */
    UBRK_LINE_SOFT = 0,
    /** Upper bound for soft line breaks. */
    UBRK_LINE_SOFT_LIMIT = 100,

    /** Hard, or mandatory, line breaks. */
    UBRK_LINE_HARD = 100,
    /** Upper bound for hard line breaks. */
    UBRK_LINE_HARD_LIMIT = 200
} ULineBreakTag;
176 
177 
178 
/**
 * Sentence break tags, as returned by getRuleStatus().
 *
 * Each category of sentence owns a range of values rather than a single
 * value, so that a category can be subdivided further in future releases.
 * Applications should therefore test whether a tag falls within a
 * category's range instead of comparing against one individual value.
 *
 * The numeric values of all of these constants are stable (will not change).
 *
 * @stable ICU 2.8
 */
typedef enum USentenceBreakTag {
    /** Sentences ending with a sentence terminator ('.', '?', '!', etc.)
     *  character, possibly followed by a hard separator (CR, LF, PS, etc.).
     */
    UBRK_SENTENCE_TERM = 0,
    /** Upper bound for tags for sentences ended by sentence terminators. */
    UBRK_SENTENCE_TERM_LIMIT = 100,

    /** Sentences that do not contain an ending sentence terminator
     *  ('.', '?', '!', etc.) character, but are ended only by a hard
     *  separator (CR, LF, PS, etc.) or end of input.
     */
    UBRK_SENTENCE_SEP = 100,
    /** Upper bound for tags for sentences ended by a separator. */
    UBRK_SENTENCE_SEP_LIMIT = 200
} USentenceBreakTag;
207 
208 
209 /**
210  * Open a new UBreakIterator for locating text boundaries for a specified locale.
211  * A UBreakIterator may be used for detecting character, line, word,
212  * and sentence breaks in text.
213  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
214  * UBRK_LINE, UBRK_SENTENCE
215  * @param locale The locale specifying the text-breaking conventions. Note that
216  * locale keys such as "lb" and "ss" may be used to modify text break behavior,
217  * see general discussion of BreakIterator C API.
218  * @param text The text to be iterated over. May be null, in which case ubrk_setText() is
219  *        used to specify the text to be iterated.
220  * @param textLength The number of characters in text, or -1 if null-terminated.
221  * @param status A UErrorCode to receive any errors.
222  * @return A UBreakIterator for the specified locale.
223  * @see ubrk_openRules
224  * @stable ICU 2.0
225  */
226 U_CAPI UBreakIterator* U_EXPORT2
227 ubrk_open(UBreakIteratorType type,
228       const char *locale,
229       const UChar *text,
230       int32_t textLength,
231       UErrorCode *status);
232 
233 /**
234  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
235  * The rule syntax is ... (TBD)
236  * @param rules A set of rules specifying the text breaking conventions.
237  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
238  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
239  *        used to specify the text to be iterated.
240  * @param textLength The number of characters in text, or -1 if null-terminated.
241  * @param parseErr   Receives position and context information for any syntax errors
242  *                   detected while parsing the rules.
243  * @param status A UErrorCode to receive any errors.
244  * @return A UBreakIterator for the specified rules.
245  * @see ubrk_open
246  * @stable ICU 2.2
247  */
248 U_CAPI UBreakIterator* U_EXPORT2
249 ubrk_openRules(const UChar     *rules,
250                int32_t         rulesLength,
251                const UChar     *text,
252                int32_t          textLength,
253                UParseError     *parseErr,
254                UErrorCode      *status);
255 
256 /**
257  * Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
258  * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
259  * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
260  * compatible across different major versions of ICU, nor across platforms of different
261  * endianness or different base character set family (ASCII vs EBCDIC).
262  * @param binaryRules A set of compiled binary rules specifying the text breaking
263  *                    conventions. Ownership of the storage containing the compiled
264  *                    rules remains with the caller of this function. The compiled
265  *                    rules must not be modified or deleted during the life of the
266  *                    break iterator.
267  * @param rulesLength The length of binaryRules in bytes; must be >= 0.
268  * @param text        The text to be iterated over.  May be null, in which case
269  *                    ubrk_setText() is used to specify the text to be iterated.
270  * @param textLength  The number of characters in text, or -1 if null-terminated.
271  * @param status      Pointer to UErrorCode to receive any errors.
272  * @return            UBreakIterator for the specified rules.
273  * @see ubrk_getBinaryRules
274  * @stable ICU 59
275  */
276 U_CAPI UBreakIterator* U_EXPORT2
277 ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
278                      const UChar *  text, int32_t textLength,
279                      UErrorCode *   status);
280 
#ifndef U_HIDE_DEPRECATED_API

/* NOTE(review): this guard encloses no declarations in this copy of the
 * header. Presumably deprecated API was removed from here -- confirm, or
 * drop the empty guard. */

#endif /* U_HIDE_DEPRECATED_API */
284 
285 /**
286  * Thread safe cloning operation.
287  * @param bi iterator to be cloned
288  * @param status to indicate whether the operation went on smoothly or there were errors
289  * @return pointer to the new clone
290  * @stable ICU 69
291  */
292 U_CAPI UBreakIterator * U_EXPORT2
293 ubrk_clone(const UBreakIterator *bi,
294            UErrorCode *status);
295 
296 /**
297 * Close a UBreakIterator.
298 * Once closed, a UBreakIterator may no longer be used.
299 * @param bi The break iterator to close.
300  * @stable ICU 2.0
301 */
302 U_CAPI void U_EXPORT2
303 ubrk_close(UBreakIterator *bi);
304 
/* C++-only section: compiled only when U_SHOW_CPLUSPLUS_API is non-zero. */
#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUBreakIteratorPointer
 * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
 * For most methods see the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);

U_NAMESPACE_END

#endif   // U_SHOW_CPLUSPLUS_API
323 
324 /**
325  * Sets an existing iterator to point to a new piece of text.
326  * The break iterator retains a pointer to the supplied text.
327  * The caller must not modify or delete the text while the BreakIterator
328  * retains the reference.
329  *
330  * @param bi The iterator to use
331  * @param text The text to be set
332  * @param textLength The length of the text
333  * @param status The error code
334  * @stable ICU 2.0
335  */
336 U_CAPI void U_EXPORT2
337 ubrk_setText(UBreakIterator* bi,
338              const UChar*    text,
339              int32_t         textLength,
340              UErrorCode*     status);
341 
342 /**
343  * Sets an existing iterator to point to a new piece of text.
344  *
345  * All index positions returned by break iterator functions are
346  * native indices from the UText. For example, when breaking UTF-8
347  * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
348  * will be UTF-8 string indices, not UTF-16 positions.
349  *
350  * @param bi The iterator to use
351  * @param text The text to be set.
352  *             This function makes a shallow clone of the supplied UText.  This means
353  *             that the caller is free to immediately close or otherwise reuse the
354  *             UText that was passed as a parameter, but that the underlying text itself
355  *             must not be altered while being referenced by the break iterator.
356  * @param status The error code
357  * @stable ICU 3.4
358  */
359 U_CAPI void U_EXPORT2
360 ubrk_setUText(UBreakIterator* bi,
361              UText*          text,
362              UErrorCode*     status);
363 
364 /**
365  * Determine the most recently-returned text boundary.
366  *
367  * @param bi The break iterator to use.
368  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
369  * \ref ubrk_first, or \ref ubrk_last.
370  * @stable ICU 2.0
371  */
372 U_CAPI int32_t U_EXPORT2
373 ubrk_current(const UBreakIterator *bi);
374 
375 /**
376  * Advance the iterator to the boundary following the current boundary.
377  *
378  * @param bi The break iterator to use.
379  * @return The character index of the next text boundary, or UBRK_DONE
380  * if all text boundaries have been returned.
381  * @see ubrk_previous
382  * @stable ICU 2.0
383  */
384 U_CAPI int32_t U_EXPORT2
385 ubrk_next(UBreakIterator *bi);
386 
387 /**
388  * Set the iterator position to the boundary preceding the current boundary.
389  *
390  * @param bi The break iterator to use.
391  * @return The character index of the preceding text boundary, or UBRK_DONE
392  * if all text boundaries have been returned.
393  * @see ubrk_next
394  * @stable ICU 2.0
395  */
396 U_CAPI int32_t U_EXPORT2
397 ubrk_previous(UBreakIterator *bi);
398 
399 /**
400  * Set the iterator position to zero, the start of the text being scanned.
401  * @param bi The break iterator to use.
402  * @return The new iterator position (zero).
403  * @see ubrk_last
404  * @stable ICU 2.0
405  */
406 U_CAPI int32_t U_EXPORT2
407 ubrk_first(UBreakIterator *bi);
408 
409 /**
410  * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
411  * This is not the same as the last character.
412  * @param bi The break iterator to use.
413  * @return The character offset immediately <EM>beyond</EM> the last character in the
414  * text being scanned.
415  * @see ubrk_first
416  * @stable ICU 2.0
417  */
418 U_CAPI int32_t U_EXPORT2
419 ubrk_last(UBreakIterator *bi);
420 
421 /**
422  * Set the iterator position to the first boundary preceding the specified offset.
423  * The new position is always smaller than offset, or UBRK_DONE.
424  * @param bi The break iterator to use.
425  * @param offset The offset to begin scanning.
426  * @return The text boundary preceding offset, or UBRK_DONE.
427  * @see ubrk_following
428  * @stable ICU 2.0
429  */
430 U_CAPI int32_t U_EXPORT2
431 ubrk_preceding(UBreakIterator *bi,
432            int32_t offset);
433 
434 /**
435  * Advance the iterator to the first boundary following the specified offset.
436  * The value returned is always greater than offset, or UBRK_DONE.
437  * @param bi The break iterator to use.
438  * @param offset The offset to begin scanning.
439  * @return The text boundary following offset, or UBRK_DONE.
440  * @see ubrk_preceding
441  * @stable ICU 2.0
442  */
443 U_CAPI int32_t U_EXPORT2
444 ubrk_following(UBreakIterator *bi,
445            int32_t offset);
446 
447 /**
448 * Get a locale for which text breaking information is available.
449 * A UBreakIterator in a locale returned by this function will perform the correct
450 * text breaking for the locale.
451 * @param index The index of the desired locale.
452 * @return A locale for which number text breaking information is available, or 0 if none.
453 * @see ubrk_countAvailable
454 * @stable ICU 2.0
455 */
456 U_CAPI const char* U_EXPORT2
457 ubrk_getAvailable(int32_t index);
458 
459 /**
460 * Determine how many locales have text breaking information available.
461 * This function is most useful as determining the loop ending condition for
462 * calls to \ref ubrk_getAvailable.
463 * @return The number of locales for which text breaking information is available.
464 * @see ubrk_getAvailable
465 * @stable ICU 2.0
466 */
467 U_CAPI int32_t U_EXPORT2
468 ubrk_countAvailable(void);
469 
470 
471 /**
472 * Returns true if the specified position is a boundary position.  As a side
473 * effect, leaves the iterator pointing to the first boundary position at
474 * or after "offset".
475 * @param bi The break iterator to use.
476 * @param offset the offset to check.
477 * @return True if "offset" is a boundary position.
478 * @stable ICU 2.0
479 */
480 U_CAPI  UBool U_EXPORT2
481 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
482 
483 /**
484  * Return the status from the break rule that determined the most recently
485  * returned break position.  The values appear in the rule source
486  * within brackets, {123}, for example.  For rules that do not specify a
487  * status, a default value of 0 is returned.
488  * <p>
489  * For word break iterators, the possible values are defined in enum UWordBreak.
490  * @stable ICU 2.2
491  */
492 U_CAPI  int32_t U_EXPORT2
493 ubrk_getRuleStatus(UBreakIterator *bi);
494 
495 /**
496  * Get the statuses from the break rules that determined the most recently
497  * returned break position.  The values appear in the rule source
498  * within brackets, {123}, for example.  The default status value for rules
499  * that do not explicitly provide one is zero.
500  * <p>
501  * For word break iterators, the possible values are defined in enum UWordBreak.
502  * @param bi        The break iterator to use
503  * @param fillInVec an array to be filled in with the status values.
504  * @param capacity  the length of the supplied vector.  A length of zero causes
505  *                  the function to return the number of status values, in the
506  *                  normal way, without attempting to store any values.
507  * @param status    receives error codes.
508  * @return          The number of rule status values from rules that determined
509  *                  the most recent boundary returned by the break iterator.
510  * @stable ICU 3.0
511  */
512 U_CAPI  int32_t U_EXPORT2
513 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
514 
515 /**
516  * Return the locale of the break iterator. You can choose between the valid and
517  * the actual locale.
518  * @param bi break iterator
519  * @param type locale type (valid or actual)
520  * @param status error code
521  * @return locale string
522  * @stable ICU 2.8
523  */
524 U_CAPI const char* U_EXPORT2
525 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
526 
527 /**
528  * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
529  * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
530  * more quickly than using ubrk_openRules. The compiled rules are not compatible across
531  * different major versions of ICU, nor across platforms of different endianness or
532  * different base character set family (ASCII vs EBCDIC). Supports preflighting (with
533  * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
534  * the binaryRules buffer. However, whether preflighting or not, if the actual length
535  * is greater than INT32_MAX, then the function returns 0 and sets *status to
536  * U_INDEX_OUTOFBOUNDS_ERROR.
537 
538  * @param bi            The break iterator to use.
539  * @param binaryRules   Buffer to receive the compiled binary rules; set to NULL for
540  *                      preflighting.
541  * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
542  *                      preflighting. Must be >= 0.
543  * @param status        Pointer to UErrorCode to receive any errors, such as
544  *                      U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or
545  *                      U_ILLEGAL_ARGUMENT_ERROR.
546  * @return              The actual byte length of the binary rules, if <= INT32_MAX;
547  *                      otherwise 0. If not preflighting and this is larger than
548  *                      rulesCapacity, *status will be set to an error.
549  * @see ubrk_openBinaryRules
550  * @stable ICU 59
551  */
552 U_CAPI int32_t U_EXPORT2
553 ubrk_getBinaryRules(UBreakIterator *bi,
554                     uint8_t *       binaryRules, int32_t rulesCapacity,
555                     UErrorCode *    status);
556 
557 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
558 
559 #endif
560