/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package android.text.method;
18 
19 import android.annotation.NonNull;
20 import android.compat.annotation.UnsupportedAppUsage;
21 import android.icu.lang.UCharacter;
22 import android.icu.lang.UProperty;
23 import android.icu.text.BreakIterator;
24 import android.os.Build;
25 import android.text.CharSequenceCharacterIterator;
26 import android.text.Selection;
27 
28 import java.util.Locale;
29 
30 /**
31  * Walks through cursor positions at word boundaries. Internally uses
32  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
33  * for performance reasons.
34  *
35  * Also provides methods to determine word boundaries.
36  * {@hide}
37  */
38 public class WordIterator implements Selection.PositionIterator {
39     // Size of the window for the word iterator, should be greater than the longest word's length
40     private static final int WINDOW_WIDTH = 50;
41 
42     private int mStart, mEnd;
43     private CharSequence mCharSeq;
44     private final BreakIterator mIterator;
45 
46     /**
47      * Constructs a WordIterator using the default locale.
48      */
WordIterator()49     public WordIterator() {
50         this(Locale.getDefault());
51     }
52 
53     /**
54      * Constructs a new WordIterator for the specified locale.
55      * @param locale The locale to be used for analyzing the text.
56      */
57     @UnsupportedAppUsage
WordIterator(Locale locale)58     public WordIterator(Locale locale) {
59         mIterator = BreakIterator.getWordInstance(locale);
60     }
61 
62     @UnsupportedAppUsage
setCharSequence(@onNull CharSequence charSequence, int start, int end)63     public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
64         if (0 <= start && end <= charSequence.length()) {
65             mCharSeq = charSequence;
66             mStart = Math.max(0, start - WINDOW_WIDTH);
67             mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
68             mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
69         } else {
70             throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
71         }
72     }
73 
74     /** {@inheritDoc} */
75     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
preceding(int offset)76     public int preceding(int offset) {
77         checkOffsetIsValid(offset);
78         while (true) {
79             offset = mIterator.preceding(offset);
80             if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
81                 return offset;
82             }
83         }
84     }
85 
86     /** {@inheritDoc} */
87     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
following(int offset)88     public int following(int offset) {
89         checkOffsetIsValid(offset);
90         while (true) {
91             offset = mIterator.following(offset);
92             if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
93                 return offset;
94             }
95         }
96     }
97 
98     /** {@inheritDoc} */
99     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isBoundary(int offset)100     public boolean isBoundary(int offset) {
101         checkOffsetIsValid(offset);
102         return mIterator.isBoundary(offset);
103     }
104 
105     /**
106      * Returns the position of next boundary after the given offset. Returns
107      * {@code DONE} if there is no boundary after the given offset.
108      *
109      * @param offset the given start position to search from.
110      * @return the position of the last boundary preceding the given offset.
111      */
112     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
nextBoundary(int offset)113     public int nextBoundary(int offset) {
114         checkOffsetIsValid(offset);
115         return mIterator.following(offset);
116     }
117 
118     /**
119      * Returns the position of boundary preceding the given offset or
120      * {@code DONE} if the given offset specifies the starting position.
121      *
122      * @param offset the given start position to search from.
123      * @return the position of the last boundary preceding the given offset.
124      */
125     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
prevBoundary(int offset)126     public int prevBoundary(int offset) {
127         checkOffsetIsValid(offset);
128         return mIterator.preceding(offset);
129     }
130 
131     /** If <code>offset</code> is within a word, returns the index of the first character of that
132      * word, otherwise returns BreakIterator.DONE.
133      *
134      * The offsets that are considered to be part of a word are the indexes of its characters,
135      * <i>as well as</i> the index of its last character plus one.
136      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
137      *
138      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
139      * The returned value is within [0..offset] or BreakIterator.DONE.
140      *
141      * @throws IllegalArgumentException is offset is not valid.
142      */
143     @UnsupportedAppUsage
getBeginning(int offset)144     public int getBeginning(int offset) {
145         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
146         // so this method can be removed.
147         return getBeginning(offset, false);
148     }
149 
150     /**
151      * If <code>offset</code> is within a word, returns the index of the last character of that
152      * word plus one, otherwise returns BreakIterator.DONE.
153      *
154      * The offsets that are considered to be part of a word are the indexes of its characters,
155      * <i>as well as</i> the index of its last character plus one.
156      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
157      *
158      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
159      * The returned value is within [offset..textLength] or BreakIterator.DONE.
160      *
161      * @throws IllegalArgumentException is offset is not valid.
162      */
163     @UnsupportedAppUsage
getEnd(int offset)164     public int getEnd(int offset) {
165         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
166         // so this method can be removed.
167         return getEnd(offset, false);
168     }
169 
170     /**
171      * If the <code>offset</code> is within a word or on a word boundary that can only be
172      * considered the start of a word (e.g. _word where "_" is any character that would not
173      * be considered part of the word) then this returns the index of the first character of
174      * that word.
175      *
176      * If the offset is on a word boundary that can be considered the start and end of a
177      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
178      * between AA and BB, this would return the start of the previous word, AA.
179      *
180      * Returns BreakIterator.DONE if there is no previous boundary.
181      *
182      * @throws IllegalArgumentException is offset is not valid.
183      */
184     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPrevWordBeginningOnTwoWordsBoundary(int offset)185     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
186         return getBeginning(offset, true);
187     }
188 
189     /**
190      * If the <code>offset</code> is within a word or on a word boundary that can only be
191      * considered the end of a word (e.g. word_ where "_" is any character that would not
192      * be considered part of the word) then this returns the index of the last character
193      * plus one of that word.
194      *
195      * If the offset is on a word boundary that can be considered the start and end of a
196      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
197      * between AA and BB, this would return the end of the next word, BB.
198      *
199      * Returns BreakIterator.DONE if there is no next boundary.
200      *
201      * @throws IllegalArgumentException is offset is not valid.
202      */
203     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getNextWordEndOnTwoWordBoundary(int offset)204     public int getNextWordEndOnTwoWordBoundary(int offset) {
205         return getEnd(offset, true);
206     }
207 
208     /**
209      * If the <code>offset</code> is within a word or on a word boundary that can only be
210      * considered the start of a word (e.g. _word where "_" is any character that would not
211      * be considered part of the word) then this returns the index of the first character of
212      * that word.
213      *
214      * If the offset is on a word boundary that can be considered the start and end of a
215      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
216      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
217      * return the start of the previous word, AA. Otherwise it would return the current offset,
218      * the start of BB.
219      *
220      * Returns BreakIterator.DONE if there is no previous boundary.
221      *
222      * @throws IllegalArgumentException is offset is not valid.
223      */
getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary)224     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
225         checkOffsetIsValid(offset);
226 
227         if (isOnLetterOrDigit(offset)) {
228             if (mIterator.isBoundary(offset)
229                     && (!isAfterLetterOrDigit(offset)
230                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
231                 return offset;
232             } else {
233                 return mIterator.preceding(offset);
234             }
235         } else {
236             if (isAfterLetterOrDigit(offset)) {
237                 return mIterator.preceding(offset);
238             }
239         }
240         return BreakIterator.DONE;
241     }
242 
243     /**
244      * If the <code>offset</code> is within a word or on a word boundary that can only be
245      * considered the end of a word (e.g. word_ where "_" is any character that would not be
246      * considered part of the word) then this returns the index of the last character plus one
247      * of that word.
248      *
249      * If the offset is on a word boundary that can be considered the start and end of a
250      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
251      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
252      * the end of the next word, BB. Otherwise it would return the current offset, the end
253      * of AA.
254      *
255      * Returns BreakIterator.DONE if there is no next boundary.
256      *
257      * @throws IllegalArgumentException is offset is not valid.
258      */
getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary)259     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
260         checkOffsetIsValid(offset);
261 
262         if (isAfterLetterOrDigit(offset)) {
263             if (mIterator.isBoundary(offset)
264                     && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
265                 return offset;
266             } else {
267                 return mIterator.following(offset);
268             }
269         } else {
270             if (isOnLetterOrDigit(offset)) {
271                 return mIterator.following(offset);
272             }
273         }
274         return BreakIterator.DONE;
275     }
276 
277     /**
278      * If <code>offset</code> is within a group of punctuation as defined
279      * by {@link #isPunctuation(int)}, returns the index of the first character
280      * of that group, otherwise returns BreakIterator.DONE.
281      *
282      * @param offset the offset to search from.
283      */
284     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPunctuationBeginning(int offset)285     public int getPunctuationBeginning(int offset) {
286         checkOffsetIsValid(offset);
287         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
288             offset = prevBoundary(offset);
289         }
290         // No need to shift offset, prevBoundary handles that.
291         return offset;
292     }
293 
294     /**
295      * If <code>offset</code> is within a group of punctuation as defined
296      * by {@link #isPunctuation(int)}, returns the index of the last character
297      * of that group plus one, otherwise returns BreakIterator.DONE.
298      *
299      * @param offset the offset to search from.
300      */
301     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPunctuationEnd(int offset)302     public int getPunctuationEnd(int offset) {
303         checkOffsetIsValid(offset);
304         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
305             offset = nextBoundary(offset);
306         }
307         // No need to shift offset, nextBoundary handles that.
308         return offset;
309     }
310 
311     /**
312      * Indicates if the provided offset is after a punctuation character
313      * as defined by {@link #isPunctuation(int)}.
314      *
315      * @param offset the offset to check from.
316      * @return Whether the offset is after a punctuation character.
317      */
318     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isAfterPunctuation(int offset)319     public boolean isAfterPunctuation(int offset) {
320         if (mStart < offset && offset <= mEnd) {
321             final int codePoint = Character.codePointBefore(mCharSeq, offset);
322             return isPunctuation(codePoint);
323         }
324         return false;
325     }
326 
327     /**
328      * Indicates if the provided offset is at a punctuation character
329      * as defined by {@link #isPunctuation(int)}.
330      *
331      * @param offset the offset to check from.
332      * @return Whether the offset is at a punctuation character.
333      */
334     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isOnPunctuation(int offset)335     public boolean isOnPunctuation(int offset) {
336         if (mStart <= offset && offset < mEnd) {
337             final int codePoint = Character.codePointAt(mCharSeq, offset);
338             return isPunctuation(codePoint);
339         }
340         return false;
341     }
342 
343     /**
344      * Indicates if the codepoint is a mid-word-only punctuation.
345      *
346      * At the moment, this is locale-independent, and includes all the characters in
347      * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
348      * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
349      * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
350      * in the middle of a word, but they become word breaks if they happen at the end of a word
351      * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise).
352      *
353      * @param locale the locale to consider the codepoint in. Presently ignored.
354      * @param codePoint the codepoint to check.
355      * @return True if the codepoint is a mid-word punctuation.
356      */
isMidWordPunctuation(Locale locale, int codePoint)357     public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
358         final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
359         return (wb == UCharacter.WordBreak.MIDLETTER
360                 || wb == UCharacter.WordBreak.MIDNUMLET
361                 || wb == UCharacter.WordBreak.SINGLE_QUOTE);
362     }
363 
isPunctuationStartBoundary(int offset)364     private boolean isPunctuationStartBoundary(int offset) {
365         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
366     }
367 
isPunctuationEndBoundary(int offset)368     private boolean isPunctuationEndBoundary(int offset) {
369         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
370     }
371 
isPunctuation(int cp)372     private static boolean isPunctuation(int cp) {
373         final int type = Character.getType(cp);
374         return (type == Character.CONNECTOR_PUNCTUATION
375                 || type == Character.DASH_PUNCTUATION
376                 || type == Character.END_PUNCTUATION
377                 || type == Character.FINAL_QUOTE_PUNCTUATION
378                 || type == Character.INITIAL_QUOTE_PUNCTUATION
379                 || type == Character.OTHER_PUNCTUATION
380                 || type == Character.START_PUNCTUATION);
381     }
382 
isAfterLetterOrDigit(int offset)383     private boolean isAfterLetterOrDigit(int offset) {
384         if (mStart < offset && offset <= mEnd) {
385             final int codePoint = Character.codePointBefore(mCharSeq, offset);
386             if (Character.isLetterOrDigit(codePoint)) return true;
387         }
388         return false;
389     }
390 
isOnLetterOrDigit(int offset)391     private boolean isOnLetterOrDigit(int offset) {
392         if (mStart <= offset && offset < mEnd) {
393             final int codePoint = Character.codePointAt(mCharSeq, offset);
394             if (Character.isLetterOrDigit(codePoint)) return true;
395         }
396         return false;
397     }
398 
checkOffsetIsValid(int offset)399     private void checkOffsetIsValid(int offset) {
400         if (!(mStart <= offset && offset <= mEnd)) {
401             throw new IllegalArgumentException("Invalid offset: " + (offset) +
402                     ". Valid range is [" + mStart + ", " + mEnd + "]");
403         }
404     }
405 }
406