• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 /*
28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
30  *
31  * The original version of this source code and documentation
32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
33  * subsidiary of IBM. These materials are provided under terms
34  * of a License Agreement between Taligent and Sun. This technology
35  * is protected by multiple US and International patents.
36  *
37  * This notice and attribution to Taligent may not be removed.
38  * Taligent is a registered trademark of Taligent, Inc.
39  *
40  */
41 
42 package java.text;
43 
44 import java.util.Locale;
45 
46 
47 // Android-changed: Discourage modification on CharacterIterator after setText. http://b/80456574
48 /**
49  * The <code>BreakIterator</code> class implements methods for finding
50  * the location of boundaries in text. Instances of <code>BreakIterator</code>
51  * maintain a current position and scan over text
52  * returning the index of characters where boundaries occur.
53  * Internally, <code>BreakIterator</code> scans text using a
54  * <code>CharacterIterator</code>, and is thus able to scan text held
55  * by any object implementing that protocol. A <code>StringCharacterIterator</code>
56  * is used to scan <code>String</code> objects passed to <code>setText</code>.
57  * The <code>CharacterIterator</code> object must not be modified after having been
58  * passed to <code>setText</code>. If the text in the <code>CharacterIterator</code> object
59  * is changed, the caller must reset <code>BreakIterator</code> by calling
60  * <code>setText</code>.
61  *
62  * <p>
63  * You use the factory methods provided by this class to create
64  * instances of various types of break iterators. In particular,
65  * use <code>getWordInstance</code>, <code>getLineInstance</code>,
66  * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
67  * to create <code>BreakIterator</code>s that perform
68  * word, line, sentence, and character boundary analysis respectively.
69  * A single <code>BreakIterator</code> can work only on one unit
70  * (word, line, sentence, and so on). You must use a different iterator
71  * for each unit boundary analysis you wish to perform.
72  *
73  * <p><a name="line"></a>
74  * Line boundary analysis determines where a text string can be
75  * broken when line-wrapping. The mechanism correctly handles
76  * punctuation and hyphenated words. Actual line breaking needs
77  * to also consider the available line width and is handled by
78  * higher-level software.
79  *
80  * <p><a name="sentence"></a>
81  * Sentence boundary analysis allows selection with correct interpretation
82  * of periods within numbers and abbreviations, and trailing punctuation
83  * marks such as quotation marks and parentheses.
84  *
85  * <p><a name="word"></a>
86  * Word boundary analysis is used by search and replace functions, as
87  * well as within text editing applications that allow the user to
88  * select words with a double click. Word selection provides correct
89  * interpretation of punctuation marks within and following
90  * words. Characters that are not part of a word, such as symbols
91  * or punctuation marks, have word-breaks on both sides.
92  *
93  * <p><a name="character"></a>
94  * Character boundary analysis allows users to interact with characters
95  * as they expect to, for example, when moving the cursor through a text
96  * string. Character boundary analysis provides correct navigation
97  * through character strings, regardless of how the character is stored.
98  * The boundaries returned may be those of supplementary characters,
99  * combining character sequences, or ligature clusters.
100  * For example, an accented character might be stored as a base character
101  * and a diacritical mark. What users consider to be a character can
102  * differ between languages.
103  *
104  * <p>
105  * The <code>BreakIterator</code> instances returned by the factory methods
106  * of this class are intended for use with natural languages only, not for
107  * programming language text. It is however possible to define subclasses
108  * that tokenize a programming language.
109  *
110  * <P>
111  * <strong>Examples</strong>:<P>
112  * Creating and using text boundaries:
113  * <blockquote>
114  * <pre>
115  * public static void main(String args[]) {
116  *      if (args.length == 1) {
117  *          String stringToExamine = args[0];
118  *          //print each word in order
119  *          BreakIterator boundary = BreakIterator.getWordInstance();
120  *          boundary.setText(stringToExamine);
121  *          printEachForward(boundary, stringToExamine);
122  *          //print each sentence in reverse order
123  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
124  *          boundary.setText(stringToExamine);
125  *          printEachBackward(boundary, stringToExamine);
126  *          printFirst(boundary, stringToExamine);
127  *          printLast(boundary, stringToExamine);
128  *      }
129  * }
130  * </pre>
131  * </blockquote>
132  *
133  * Print each element in order:
134  * <blockquote>
135  * <pre>
136  * public static void printEachForward(BreakIterator boundary, String source) {
137  *     int start = boundary.first();
138  *     for (int end = boundary.next();
139  *          end != BreakIterator.DONE;
140  *          start = end, end = boundary.next()) {
141  *          System.out.println(source.substring(start,end));
142  *     }
143  * }
144  * </pre>
145  * </blockquote>
146  *
147  * Print each element in reverse order:
148  * <blockquote>
149  * <pre>
150  * public static void printEachBackward(BreakIterator boundary, String source) {
151  *     int end = boundary.last();
152  *     for (int start = boundary.previous();
153  *          start != BreakIterator.DONE;
154  *          end = start, start = boundary.previous()) {
155  *         System.out.println(source.substring(start,end));
156  *     }
157  * }
158  * </pre>
159  * </blockquote>
160  *
161  * Print first element:
162  * <blockquote>
163  * <pre>
164  * public static void printFirst(BreakIterator boundary, String source) {
165  *     int start = boundary.first();
166  *     int end = boundary.next();
167  *     System.out.println(source.substring(start,end));
168  * }
169  * </pre>
170  * </blockquote>
171  *
172  * Print last element:
173  * <blockquote>
174  * <pre>
175  * public static void printLast(BreakIterator boundary, String source) {
176  *     int end = boundary.last();
177  *     int start = boundary.previous();
178  *     System.out.println(source.substring(start,end));
179  * }
180  * </pre>
181  * </blockquote>
182  *
183  * Print the element at a specified position:
184  * <blockquote>
185  * <pre>
186  * public static void printAt(BreakIterator boundary, int pos, String source) {
187  *     int end = boundary.following(pos);
188  *     int start = boundary.previous();
189  *     System.out.println(source.substring(start,end));
190  * }
191  * </pre>
192  * </blockquote>
193  *
194  * Find the next word:
195  * <blockquote>
196  * <pre>{@code
197  * public static int nextWordStartAfter(int pos, String text) {
198  *     BreakIterator wb = BreakIterator.getWordInstance();
199  *     wb.setText(text);
200  *     int last = wb.following(pos);
201  *     int current = wb.next();
202  *     while (current != BreakIterator.DONE) {
203  *         for (int p = last; p < current; p++) {
204  *             if (Character.isLetter(text.codePointAt(p)))
205  *                 return last;
206  *         }
207  *         last = current;
208  *         current = wb.next();
209  *     }
210  *     return BreakIterator.DONE;
211  * }
212  * }</pre>
213  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
214  * the break positions it returns don't represent both the start and end of the
215  * thing being iterated over.  That is, a sentence-break iterator returns breaks
216  * that each represent the end of one sentence and the beginning of the next.
217  * With the word-break iterator, the characters between two boundaries might be a
218  * word, or they might be the punctuation or whitespace between two words.  The
219  * above code uses a simple heuristic to determine which boundary is the beginning
220  * of a word: If the characters between this boundary and the next boundary
221  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
222  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
223  * and the next is a word; otherwise, it's the material between words.)
224  * </blockquote>
225  *
226  * @see CharacterIterator
227  *
228  */
229 
230 public abstract class BreakIterator implements Cloneable
231 {
232     /**
233      * Constructor. BreakIterator is stateless and has no default behavior.
234      */
BreakIterator()235     protected BreakIterator()
236     {
237     }
238 
239     /**
240      * Create a copy of this iterator
241      * @return A copy of this
242      */
243     @Override
clone()244     public Object clone()
245     {
246         try {
247             return super.clone();
248         }
249         catch (CloneNotSupportedException e) {
250             throw new InternalError(e);
251         }
252     }
253 
254     /**
255      * DONE is returned by previous(), next(), next(int), preceding(int)
256      * and following(int) when either the first or last text boundary has been
257      * reached.
258      */
259     public static final int DONE = -1;
260 
261     /**
262      * Returns the first boundary. The iterator's current position is set
263      * to the first text boundary.
264      * @return The character index of the first text boundary.
265      */
first()266     public abstract int first();
267 
268     /**
269      * Returns the last boundary. The iterator's current position is set
270      * to the last text boundary.
271      * @return The character index of the last text boundary.
272      */
last()273     public abstract int last();
274 
275     /**
276      * Returns the nth boundary from the current boundary. If either
277      * the first or last text boundary has been reached, it returns
278      * <code>BreakIterator.DONE</code> and the current position is set to either
279      * the first or last text boundary depending on which one is reached. Otherwise,
280      * the iterator's current position is set to the new boundary.
281      * For example, if the iterator's current position is the mth text boundary
282      * and three more boundaries exist from the current boundary to the last text
283      * boundary, the next(2) call will return m + 2. The new text position is set
284      * to the (m + 2)th text boundary. A next(4) call would return
285      * <code>BreakIterator.DONE</code> and the last text boundary would become the
286      * new text position.
287      * @param n which boundary to return.  A value of 0
288      * does nothing.  Negative values move to previous boundaries
289      * and positive values move to later boundaries.
290      * @return The character index of the nth boundary from the current position
291      * or <code>BreakIterator.DONE</code> if either first or last text boundary
292      * has been reached.
293      */
next(int n)294     public abstract int next(int n);
295 
296     /**
297      * Returns the boundary following the current boundary. If the current boundary
298      * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
299      * the iterator's current position is unchanged. Otherwise, the iterator's
300      * current position is set to the boundary following the current boundary.
301      * @return The character index of the next text boundary or
302      * <code>BreakIterator.DONE</code> if the current boundary is the last text
303      * boundary.
304      * Equivalent to next(1).
305      * @see #next(int)
306      */
next()307     public abstract int next();
308 
309     /**
310      * Returns the boundary preceding the current boundary. If the current boundary
311      * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
312      * the iterator's current position is unchanged. Otherwise, the iterator's
313      * current position is set to the boundary preceding the current boundary.
314      * @return The character index of the previous text boundary or
315      * <code>BreakIterator.DONE</code> if the current boundary is the first text
316      * boundary.
317      */
previous()318     public abstract int previous();
319 
320     /**
321      * Returns the first boundary following the specified character offset. If the
322      * specified offset equals to the last text boundary, it returns
323      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
324      * Otherwise, the iterator's current position is set to the returned boundary.
325      * The value returned is always greater than the offset or the value
326      * <code>BreakIterator.DONE</code>.
327      * @param offset the character offset to begin scanning.
328      * @return The first boundary after the specified offset or
329      * <code>BreakIterator.DONE</code> if the last text boundary is passed in
330      * as the offset.
331      * @exception  IllegalArgumentException if the specified offset is less than
332      * the first text boundary or greater than the last text boundary.
333      */
following(int offset)334     public abstract int following(int offset);
335 
336     /**
337      * Returns the last boundary preceding the specified character offset. If the
338      * specified offset equals to the first text boundary, it returns
339      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
340      * Otherwise, the iterator's current position is set to the returned boundary.
341      * The value returned is always less than the offset or the value
342      * <code>BreakIterator.DONE</code>.
343      * @param offset the character offset to begin scanning.
344      * @return The last boundary before the specified offset or
345      * <code>BreakIterator.DONE</code> if the first text boundary is passed in
346      * as the offset.
347      * @exception   IllegalArgumentException if the specified offset is less than
348      * the first text boundary or greater than the last text boundary.
349      * @since 1.2
350      */
preceding(int offset)351     public int preceding(int offset) {
352         // NOTE:  This implementation is here solely because we can't add new
353         // abstract methods to an existing class.  There is almost ALWAYS a
354         // better, faster way to do this.
355         int pos = following(offset);
356         while (pos >= offset && pos != DONE) {
357             pos = previous();
358         }
359         return pos;
360     }
361 
362     /**
363      * Returns true if the specified character offset is a text boundary.
364      * @param offset the character offset to check.
365      * @return <code>true</code> if "offset" is a boundary position,
366      * <code>false</code> otherwise.
367      * @exception   IllegalArgumentException if the specified offset is less than
368      * the first text boundary or greater than the last text boundary.
369      * @since 1.2
370      */
isBoundary(int offset)371     public boolean isBoundary(int offset) {
372         // NOTE: This implementation probably is wrong for most situations
373         // because it fails to take into account the possibility that a
374         // CharacterIterator passed to setText() may not have a begin offset
375         // of 0.  But since the abstract BreakIterator doesn't have that
376         // knowledge, it assumes the begin offset is 0.  If you subclass
377         // BreakIterator, copy the SimpleTextBoundary implementation of this
378         // function into your subclass.  [This should have been abstract at
379         // this level, but it's too late to fix that now.]
380         if (offset == 0) {
381             return true;
382         }
383         int boundary = following(offset - 1);
384         if (boundary == DONE) {
385             throw new IllegalArgumentException();
386         }
387         return boundary == offset;
388     }
389 
390     /**
391      * Returns character index of the text boundary that was most
392      * recently returned by next(), next(int), previous(), first(), last(),
393      * following(int) or preceding(int). If any of these methods returns
394      * <code>BreakIterator.DONE</code> because either first or last text boundary
395      * has been reached, it returns the first or last text boundary depending on
396      * which one is reached.
397      * @return The text boundary returned from the above methods, first or last
398      * text boundary.
399      * @see #next()
400      * @see #next(int)
401      * @see #previous()
402      * @see #first()
403      * @see #last()
404      * @see #following(int)
405      * @see #preceding(int)
406      */
current()407     public abstract int current();
408 
409     /**
410      * Get the text being scanned
411      * @return the text being scanned
412      */
getText()413     public abstract CharacterIterator getText();
414 
415     /**
416      * Set a new text string to be scanned.  The current scan
417      * position is reset to first().
418      * @param newText new text to scan.
419      */
setText(String newText)420     public void setText(String newText)
421     {
422         setText(new StringCharacterIterator(newText));
423     }
424 
425     /**
426      * Set a new text for scanning.  The current scan
427      * position is reset to first().
428      * @param newText new text to scan.
429      */
setText(CharacterIterator newText)430     public abstract void setText(CharacterIterator newText);
431 
432     // Android-removed: Removed code related to BreakIteratorProvider support.
433 
434     /**
435      * Returns a new <code>BreakIterator</code> instance
436      * for <a href="BreakIterator.html#word">word breaks</a>
437      * for the {@linkplain Locale#getDefault() default locale}.
438      * @return A break iterator for word breaks
439      */
getWordInstance()440     public static BreakIterator getWordInstance()
441     {
442         return getWordInstance(Locale.getDefault());
443     }
444 
445     /**
446      * Returns a new <code>BreakIterator</code> instance
447      * for <a href="BreakIterator.html#word">word breaks</a>
448      * for the given locale.
449      * @param locale the desired locale
450      * @return A break iterator for word breaks
451      * @exception NullPointerException if <code>locale</code> is null
452      */
getWordInstance(Locale locale)453     public static BreakIterator getWordInstance(Locale locale)
454     {
455         // Android-changed: Switched to ICU.
456         return new IcuIteratorWrapper(
457                 android.icu.text.BreakIterator.getWordInstance(locale));
458     }
459 
460     /**
461      * Returns a new <code>BreakIterator</code> instance
462      * for <a href="BreakIterator.html#line">line breaks</a>
463      * for the {@linkplain Locale#getDefault() default locale}.
464      * @return A break iterator for line breaks
465      */
getLineInstance()466     public static BreakIterator getLineInstance()
467     {
468         return getLineInstance(Locale.getDefault());
469     }
470 
471     /**
472      * Returns a new <code>BreakIterator</code> instance
473      * for <a href="BreakIterator.html#line">line breaks</a>
474      * for the given locale.
475      * @param locale the desired locale
476      * @return A break iterator for line breaks
477      * @exception NullPointerException if <code>locale</code> is null
478      */
getLineInstance(Locale locale)479     public static BreakIterator getLineInstance(Locale locale)
480     {
481         // Android-changed: Switched to ICU.
482         return new IcuIteratorWrapper(
483                 android.icu.text.BreakIterator.getLineInstance(locale));
484     }
485 
486     /**
487      * Returns a new <code>BreakIterator</code> instance
488      * for <a href="BreakIterator.html#character">character breaks</a>
489      * for the {@linkplain Locale#getDefault() default locale}.
490      * @return A break iterator for character breaks
491      */
getCharacterInstance()492     public static BreakIterator getCharacterInstance()
493     {
494         return getCharacterInstance(Locale.getDefault());
495     }
496 
497     /**
498      * Returns a new <code>BreakIterator</code> instance
499      * for <a href="BreakIterator.html#character">character breaks</a>
500      * for the given locale.
501      * @param locale the desired locale
502      * @return A break iterator for character breaks
503      * @exception NullPointerException if <code>locale</code> is null
504      */
getCharacterInstance(Locale locale)505     public static BreakIterator getCharacterInstance(Locale locale)
506     {
507         // Android-changed: Switched to ICU.
508         return new IcuIteratorWrapper(
509                 android.icu.text.BreakIterator.getCharacterInstance(locale));
510     }
511 
512     /**
513      * Returns a new <code>BreakIterator</code> instance
514      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
515      * for the {@linkplain Locale#getDefault() default locale}.
516      * @return A break iterator for sentence breaks
517      */
getSentenceInstance()518     public static BreakIterator getSentenceInstance()
519     {
520         return getSentenceInstance(Locale.getDefault());
521     }
522 
523     /**
524      * Returns a new <code>BreakIterator</code> instance
525      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
526      * for the given locale.
527      * @param locale the desired locale
528      * @return A break iterator for sentence breaks
529      * @exception NullPointerException if <code>locale</code> is null
530      */
getSentenceInstance(Locale locale)531     public static BreakIterator getSentenceInstance(Locale locale)
532     {
533         // Android-changed: Switched to ICU.
534         return new IcuIteratorWrapper(
535                 android.icu.text.BreakIterator.getSentenceInstance(locale));
536     }
537 
538     // Android-removed: Removed code related to BreakIteratorProvider support.
539 
540     // Android-changed: Removed references to BreakIteratorProvider from JavaDoc.
541     /**
542      * Returns an array of all locales for which the
543      * <code>get*Instance</code> methods of this class can return
544      * localized instances.
545      *
546      * @return An array of locales for which localized
547      *         <code>BreakIterator</code> instances are available.
548      */
getAvailableLocales()549     public static synchronized Locale[] getAvailableLocales()
550     {
551         // Android-changed: Switched to ICU.
552         return android.icu.text.BreakIterator.getAvailableLocales();
553     }
554 }
555