• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 1996, 2006, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 /*
28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
30  *
31  * The original version of this source code and documentation
32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
33  * subsidiary of IBM. These materials are provided under terms
34  * of a License Agreement between Taligent and Sun. This technology
35  * is protected by multiple US and International patents.
36  *
37  * This notice and attribution to Taligent may not be removed.
38  * Taligent is a registered trademark of Taligent, Inc.
39  *
40  */
41 
42 package java.text;
43 
44 import java.util.Locale;
45 
46 
47 /**
48  * The <code>BreakIterator</code> class implements methods for finding
49  * the location of boundaries in text. Instances of <code>BreakIterator</code>
50  * maintain a current position and scan over text
51  * returning the index of characters where boundaries occur.
52  * Internally, <code>BreakIterator</code> scans text using a
53  * <code>CharacterIterator</code>, and is thus able to scan text held
54  * by any object implementing that protocol. A <code>StringCharacterIterator</code>
55  * is used to scan <code>String</code> objects passed to <code>setText</code>.
56  *
57  * <p>
58  * You use the factory methods provided by this class to create
59  * instances of various types of break iterators. In particular,
60  * use <code>getWordInstance</code>, <code>getLineInstance</code>,
61  * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
62  * to create <code>BreakIterator</code>s that perform
63  * word, line, sentence, and character boundary analysis respectively.
64  * A single <code>BreakIterator</code> can work only on one unit
65  * (word, line, sentence, and so on). You must use a different iterator
66  * for each unit boundary analysis you wish to perform.
67  *
68  * <p><a name="line"></a>
69  * Line boundary analysis determines where a text string can be
70  * broken when line-wrapping. The mechanism correctly handles
71  * punctuation and hyphenated words. Actual line breaking needs
72  * to also consider the available line width and is handled by
73  * higher-level software.
74  *
75  * <p><a name="sentence"></a>
76  * Sentence boundary analysis allows selection with correct interpretation
77  * of periods within numbers and abbreviations, and trailing punctuation
78  * marks such as quotation marks and parentheses.
79  *
80  * <p><a name="word"></a>
81  * Word boundary analysis is used by search and replace functions, as
82  * well as within text editing applications that allow the user to
83  * select words with a double click. Word selection provides correct
84  * interpretation of punctuation marks within and following
85  * words. Characters that are not part of a word, such as symbols
86  * or punctuation marks, have word-breaks on both sides.
87  *
88  * <p><a name="character"></a>
89  * Character boundary analysis allows users to interact with characters
90  * as they expect to, for example, when moving the cursor through a text
91  * string. Character boundary analysis provides correct navigation
92  * through character strings, regardless of how the character is stored.
93  * The boundaries returned may be those of supplementary characters,
94  * combining character sequences, or ligature clusters.
95  * For example, an accented character might be stored as a base character
96  * and a diacritical mark. What users consider to be a character can
97  * differ between languages.
98  *
99  * <p>
100  * The <code>BreakIterator</code> instances returned by the factory methods
101  * of this class are intended for use with natural languages only, not for
102  * programming language text. It is however possible to define subclasses
103  * that tokenize a programming language.
104  *
105  * <P>
106  * <strong>Examples</strong>:<P>
107  * Creating and using text boundaries:
108  * <blockquote>
109  * <pre>
110  * public static void main(String args[]) {
111  *      if (args.length == 1) {
112  *          String stringToExamine = args[0];
113  *          //print each word in order
114  *          BreakIterator boundary = BreakIterator.getWordInstance();
115  *          boundary.setText(stringToExamine);
116  *          printEachForward(boundary, stringToExamine);
117  *          //print each sentence in reverse order
118  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
119  *          boundary.setText(stringToExamine);
120  *          printEachBackward(boundary, stringToExamine);
121  *          printFirst(boundary, stringToExamine);
122  *          printLast(boundary, stringToExamine);
123  *      }
124  * }
125  * </pre>
126  * </blockquote>
127  *
128  * Print each element in order:
129  * <blockquote>
130  * <pre>
131  * public static void printEachForward(BreakIterator boundary, String source) {
132  *     int start = boundary.first();
133  *     for (int end = boundary.next();
134  *          end != BreakIterator.DONE;
135  *          start = end, end = boundary.next()) {
136  *          System.out.println(source.substring(start,end));
137  *     }
138  * }
139  * </pre>
140  * </blockquote>
141  *
142  * Print each element in reverse order:
143  * <blockquote>
144  * <pre>
145  * public static void printEachBackward(BreakIterator boundary, String source) {
146  *     int end = boundary.last();
147  *     for (int start = boundary.previous();
148  *          start != BreakIterator.DONE;
149  *          end = start, start = boundary.previous()) {
150  *         System.out.println(source.substring(start,end));
151  *     }
152  * }
153  * </pre>
154  * </blockquote>
155  *
156  * Print first element:
157  * <blockquote>
158  * <pre>
159  * public static void printFirst(BreakIterator boundary, String source) {
160  *     int start = boundary.first();
161  *     int end = boundary.next();
162  *     System.out.println(source.substring(start,end));
163  * }
164  * </pre>
165  * </blockquote>
166  *
167  * Print last element:
168  * <blockquote>
169  * <pre>
170  * public static void printLast(BreakIterator boundary, String source) {
171  *     int end = boundary.last();
172  *     int start = boundary.previous();
173  *     System.out.println(source.substring(start,end));
174  * }
175  * </pre>
176  * </blockquote>
177  *
178  * Print the element at a specified position:
179  * <blockquote>
180  * <pre>
181  * public static void printAt(BreakIterator boundary, int pos, String source) {
182  *     int end = boundary.following(pos);
183  *     int start = boundary.previous();
184  *     System.out.println(source.substring(start,end));
185  * }
186  * </pre>
187  * </blockquote>
188  *
189  * Find the next word:
190  * <blockquote>
191  * <pre>
192  * public static int nextWordStartAfter(int pos, String text) {
193  *     BreakIterator wb = BreakIterator.getWordInstance();
194  *     wb.setText(text);
195  *     int last = wb.following(pos);
196  *     int current = wb.next();
197  *     while (current != BreakIterator.DONE) {
198  *         for (int p = last; p < current; p++) {
199  *             if (Character.isLetter(text.codePointAt(p)))
200  *                 return last;
201  *         }
202  *         last = current;
203  *         current = wb.next();
204  *     }
205  *     return BreakIterator.DONE;
206  * }
207  * </pre>
208  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
209  * the break positions it returns don't represent both the start and end of the
210  * thing being iterated over.  That is, a sentence-break iterator returns breaks
211  * that each represent the end of one sentence and the beginning of the next.
212  * With the word-break iterator, the characters between two boundaries might be a
213  * word, or they might be the punctuation or whitespace between two words.  The
214  * above code uses a simple heuristic to determine which boundary is the beginning
215  * of a word: If the characters between this boundary and the next boundary
216  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
217  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
218  * and the next is a word; otherwise, it's the material between words.)
219  * </blockquote>
220  *
221  * @see CharacterIterator
222  *
223  */
224 
225 public abstract class BreakIterator implements Cloneable {
226 
227     /**
228      * Constructor. BreakIterator is stateless and has no default behavior.
229      */
BreakIterator()230     protected BreakIterator() {
231     }
232 
233     /**
234      * Create a copy of this iterator
235      *
236      * @return A copy of this
237      */
238     @Override
clone()239     public Object clone() {
240         try {
241             return super.clone();
242         } catch (CloneNotSupportedException e) {
243             throw new AssertionError(e);
244         }
245     }
246 
247     /**
248      * DONE is returned by previous(), next(), next(int), preceding(int)
249      * and following(int) when either the first or last text boundary has been
250      * reached.
251      */
252     public static final int DONE = -1;
253 
254     /**
255      * Returns the first boundary. The iterator's current position is set
256      * to the first text boundary.
257      *
258      * @return The character index of the first text boundary.
259      */
first()260     public abstract int first();
261 
262     /**
263      * Returns the last boundary. The iterator's current position is set
264      * to the last text boundary.
265      *
266      * @return The character index of the last text boundary.
267      */
last()268     public abstract int last();
269 
270     /**
271      * Returns the nth boundary from the current boundary. If either
272      * the first or last text boundary has been reached, it returns
273      * <code>BreakIterator.DONE</code> and the current position is set to either
274      * the first or last text boundary depending on which one is reached. Otherwise,
275      * the iterator's current position is set to the new boundary.
276      * For example, if the iterator's current position is the mth text boundary
277      * and three more boundaries exist from the current boundary to the last text
278      * boundary, the next(2) call will return m + 2. The new text position is set
279      * to the (m + 2)th text boundary. A next(4) call would return
280      * <code>BreakIterator.DONE</code> and the last text boundary would become the
281      * new text position.
282      *
283      * @param n which boundary to return.  A value of 0
284      *          does nothing.  Negative values move to previous boundaries
285      *          and positive values move to later boundaries.
286      * @return The character index of the nth boundary from the current position
287      * or <code>BreakIterator.DONE</code> if either first or last text boundary
288      * has been reached.
289      */
next(int n)290     public abstract int next(int n);
291 
292     /**
293      * Returns the boundary following the current boundary. If the current boundary
294      * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
295      * the iterator's current position is unchanged. Otherwise, the iterator's
296      * current position is set to the boundary following the current boundary.
297      *
298      * @return The character index of the next text boundary or
299      * <code>BreakIterator.DONE</code> if the current boundary is the last text
300      * boundary.
301      * Equivalent to next(1).
302      * @see #next(int)
303      */
next()304     public abstract int next();
305 
306     /**
307      * Returns the boundary preceding the current boundary. If the current boundary
308      * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
309      * the iterator's current position is unchanged. Otherwise, the iterator's
310      * current position is set to the boundary preceding the current boundary.
311      *
312      * @return The character index of the previous text boundary or
313      * <code>BreakIterator.DONE</code> if the current boundary is the first text
314      * boundary.
315      */
previous()316     public abstract int previous();
317 
318     /**
319      * Returns the first boundary following the specified character offset. If the
320      * specified offset equals to the last text boundary, it returns
321      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
322      * Otherwise, the iterator's current position is set to the returned boundary.
323      * The value returned is always greater than the offset or the value
324      * <code>BreakIterator.DONE</code>.
325      *
326      * @param offset the character offset to begin scanning.
327      * @return The first boundary after the specified offset or
328      * <code>BreakIterator.DONE</code> if the last text boundary is passed in
329      * as the offset.
330      * @throws IllegalArgumentException if the specified offset is less than
331      *                                  the first text boundary or greater than the last text
332      *                                  boundary.
333      */
following(int offset)334     public abstract int following(int offset);
335 
336     /**
337      * Returns the last boundary preceding the specified character offset. If the
338      * specified offset equals to the first text boundary, it returns
339      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
340      * Otherwise, the iterator's current position is set to the returned boundary.
341      * The value returned is always less than the offset or the value
342      * <code>BreakIterator.DONE</code>.
343      * @param offset the characater offset to begin scanning.
344      * @return The last boundary before the specified offset or
345      * <code>BreakIterator.DONE</code> if the first text boundary is passed in
346      * as the offset.
347      * @exception   IllegalArgumentException if the specified offset is less than
348      * the first text boundary or greater than the last text boundary.
349      * @since 1.2
350      */
preceding(int offset)351     public int preceding(int offset) {
352         // NOTE:  This implementation is here solely because we can't add new
353         // abstract methods to an existing class.  There is almost ALWAYS a
354         // better, faster way to do this.
355         int pos = following(offset);
356         while (pos >= offset && pos != DONE)
357             pos = previous();
358         return pos;
359     }
360 
361     /**
362      * Returns true if the specified character offset is a text boundary.
363      * @param offset the character offset to check.
364      * @return <code>true</code> if "offset" is a boundary position,
365      * <code>false</code> otherwise.
366      * @exception   IllegalArgumentException if the specified offset is less than
367      * the first text boundary or greater than the last text boundary.
368      * @since 1.2
369      */
isBoundary(int offset)370     public boolean isBoundary(int offset) {
371         // NOTE: This implementation probably is wrong for most situations
372         // because it fails to take into account the possibility that a
373         // CharacterIterator passed to setText() may not have a begin offset
374         // of 0.  But since the abstract BreakIterator doesn't have that
375         // knowledge, it assumes the begin offset is 0.  If you subclass
376         // BreakIterator, copy the SimpleTextBoundary implementation of this
377         // function into your subclass.  [This should have been abstract at
378         // this level, but it's too late to fix that now.]
379         if (offset == 0) {
380             return true;
381         }
382         int boundary = following(offset - 1);
383         if (boundary == DONE) {
384             throw new IllegalArgumentException();
385         }
386         return boundary == offset;
387     }
388 
389     /**
390      * Returns character index of the text boundary that was most
391      * recently returned by next(), next(int), previous(), first(), last(),
392      * following(int) or preceding(int). If any of these methods returns
393      * <code>BreakIterator.DONE</code> because either first or last text boundary
394      * has been reached, it returns the first or last text boundary depending on
395      * which one is reached.
396      *
397      * @return The text boundary returned from the above methods, first or last
398      * text boundary.
399      * @see #next()
400      * @see #next(int)
401      * @see #previous()
402      * @see #first()
403      * @see #last()
404      * @see #following(int)
405      * @see #preceding(int)
406      */
current()407     public abstract int current();
408 
409     /**
410      * Get the text being scanned
411      *
412      * @return the text being scanned
413      */
getText()414     public abstract CharacterIterator getText();
415 
416     /**
417      * Set a new text string to be scanned.  The current scan
418      * position is reset to first().
419      *
420      * @param newText new text to scan.
421      */
setText(String newText)422     public void setText(String newText) {
423         setText(new StringCharacterIterator(newText));
424     }
425 
426     /**
427      * Set a new text for scanning.  The current scan
428      * position is reset to first().
429      *
430      * @param newText new text to scan.
431      */
setText(CharacterIterator newText)432     public abstract void setText(CharacterIterator newText);
433 
434     /**
435      * Returns a new <code>BreakIterator</code> instance
436      * for <a href="#word">word breaks</a>
437      * for the {@linkplain Locale#getDefault() default locale}.
438      *
439      * @return A break iterator for word breaks
440      */
getWordInstance()441     public static BreakIterator getWordInstance() {
442         return getWordInstance(Locale.getDefault());
443     }
444 
445     /**
446      * Returns a new <code>BreakIterator</code> instance
447      * for <a href="#word">word breaks</a>
448      * for the given locale.
449      *
450      * @param locale the desired locale
451      * @return A break iterator for word breaks
452      * @throws NullPointerException if <code>locale</code> is null
453      */
getWordInstance(Locale locale)454     public static BreakIterator getWordInstance(Locale locale) {
455         return new IcuIteratorWrapper(
456                 android.icu.text.BreakIterator.getWordInstance(locale));
457     }
458 
459     /**
460      * Returns a new <code>BreakIterator</code> instance
461      * for <a href="#line">line breaks</a>
462      * for the {@linkplain Locale#getDefault() default locale}.
463      *
464      * @return A break iterator for line breaks
465      */
getLineInstance()466     public static BreakIterator getLineInstance() {
467         return getLineInstance(Locale.getDefault());
468     }
469 
470     /**
471      * Returns a new <code>BreakIterator</code> instance
472      * for <a href="#line">line breaks</a>
473      * for the given locale.
474      *
475      * @param locale the desired locale
476      * @return A break iterator for line breaks
477      * @throws NullPointerException if <code>locale</code> is null
478      */
getLineInstance(Locale locale)479     public static BreakIterator getLineInstance(Locale locale) {
480         return new IcuIteratorWrapper(
481                 android.icu.text.BreakIterator.getLineInstance(locale));
482     }
483 
484     /**
485      * Returns a new <code>BreakIterator</code> instance
486      * for <a href="#character">character breaks</a>
487      * for the {@linkplain Locale#getDefault() default locale}.
488      *
489      * @return A break iterator for character breaks
490      */
getCharacterInstance()491     public static BreakIterator getCharacterInstance() {
492         return getCharacterInstance(Locale.getDefault());
493     }
494 
495     /**
496      * Returns a new <code>BreakIterator</code> instance
497      * for <a href="#character">character breaks</a>
498      * for the given locale.
499      *
500      * @param locale the desired locale
501      * @return A break iterator for character breaks
502      * @throws NullPointerException if <code>locale</code> is null
503      */
getCharacterInstance(Locale locale)504     public static BreakIterator getCharacterInstance(Locale locale) {
505         return new IcuIteratorWrapper(
506                 android.icu.text.BreakIterator.getCharacterInstance(locale));
507     }
508 
509     /**
510      * Returns a new <code>BreakIterator</code> instance
511      * for <a href="#sentence">sentence breaks</a>
512      * for the {@linkplain Locale#getDefault() default locale}.
513      *
514      * @return A break iterator for sentence breaks
515      */
getSentenceInstance()516     public static BreakIterator getSentenceInstance() {
517         return getSentenceInstance(Locale.getDefault());
518     }
519 
520     /**
521      * Returns a new <code>BreakIterator</code> instance
522      * for <a href="#sentence">sentence breaks</a>
523      * for the given locale.
524      *
525      * @param locale the desired locale
526      * @return A break iterator for sentence breaks
527      * @throws NullPointerException if <code>locale</code> is null
528      */
getSentenceInstance(Locale locale)529     public static BreakIterator getSentenceInstance(Locale locale) {
530         return new IcuIteratorWrapper(
531                 android.icu.text.BreakIterator.getSentenceInstance(locale));
532     }
533 
534     /**
535      * Returns an array of all locales for which the
536      * <code>get*Instance</code> methods of this class can return
537      * localized instances.
538      * The returned array represents the union of locales supported by the Java
539      * runtime and by installed
540      * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
541      * It must contain at least a <code>Locale</code>
542      * instance equal to {@link java.util.Locale#US Locale.US}.
543      *
544      * @return An array of locales for which localized
545      * <code>BreakIterator</code> instances are available.
546      */
getAvailableLocales()547     public static synchronized Locale[] getAvailableLocales() {
548         return android.icu.text.BreakIterator.getAvailableLocales();
549     }
550 }
551