• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.text.method;
18 
19 import android.annotation.NonNull;
20 import android.compat.annotation.UnsupportedAppUsage;
21 import android.icu.lang.UCharacter;
22 import android.icu.lang.UProperty;
23 import android.icu.text.BreakIterator;
24 import android.icu.util.ULocale;
25 import android.os.Build;
26 import android.text.CharSequenceCharacterIterator;
27 import android.text.Selection;
28 import android.text.TextUtils;
29 
30 import java.util.Locale;
31 
32 /**
33  * Walks through cursor positions at word boundaries. Internally uses
34  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
35  * for performance reasons.
36  *
37  * Also provides methods to determine word boundaries.
38  * {@hide}
39  */
40 @android.ravenwood.annotation.RavenwoodKeepWholeClass
41 public class WordIterator implements Selection.PositionIterator {
42     // Size of the window for the word iterator, should be greater than the longest word's length
43     private static final int WINDOW_WIDTH = 50;
44 
45     private int mStart, mEnd;
46     private CharSequence mCharSeq;
47     private final BreakIterator mIterator;
48 
49     /**
50      * Constructs a WordIterator using the default locale.
51      */
WordIterator()52     public WordIterator() {
53         this(Locale.getDefault());
54     }
55 
56     /**
57      * Constructs a new WordIterator for the specified locale.
58      * @param locale The locale to be used for analyzing the text.
59      */
60     @UnsupportedAppUsage
WordIterator(Locale locale)61     public WordIterator(Locale locale) {
62         mIterator = BreakIterator.getWordInstance(locale);
63     }
64 
65     /**
66      * Constructs a new WordIterator for the specified locale.
67      * @param locale The locale to be used for analyzing the text.
68      */
WordIterator(ULocale locale)69     public WordIterator(ULocale locale) {
70         mIterator = BreakIterator.getWordInstance(locale);
71     }
72 
73     @UnsupportedAppUsage
setCharSequence(@onNull CharSequence charSequence, int start, int end)74     public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
75         if (0 <= start && end <= charSequence.length()) {
76             mCharSeq = charSequence;
77             mStart = Math.max(0, start - WINDOW_WIDTH);
78             mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
79             mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
80         } else {
81             throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
82         }
83     }
84 
85     /** {@inheritDoc} */
86     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
preceding(int offset)87     public int preceding(int offset) {
88         checkOffsetIsValid(offset);
89         while (true) {
90             offset = mIterator.preceding(offset);
91             if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
92                 return offset;
93             }
94         }
95     }
96 
97     /** {@inheritDoc} */
98     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
following(int offset)99     public int following(int offset) {
100         checkOffsetIsValid(offset);
101         while (true) {
102             offset = mIterator.following(offset);
103             if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
104                 return offset;
105             }
106         }
107     }
108 
109     /** {@inheritDoc} */
110     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isBoundary(int offset)111     public boolean isBoundary(int offset) {
112         checkOffsetIsValid(offset);
113         return mIterator.isBoundary(offset);
114     }
115 
116     /**
117      * Returns the position of next boundary after the given offset. Returns
118      * {@code DONE} if there is no boundary after the given offset.
119      *
120      * @param offset the given start position to search from.
121      * @return the position of the last boundary preceding the given offset.
122      */
123     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
nextBoundary(int offset)124     public int nextBoundary(int offset) {
125         checkOffsetIsValid(offset);
126         return mIterator.following(offset);
127     }
128 
129     /**
130      * Returns the position of boundary preceding the given offset or
131      * {@code DONE} if the given offset specifies the starting position.
132      *
133      * @param offset the given start position to search from.
134      * @return the position of the last boundary preceding the given offset.
135      */
136     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
prevBoundary(int offset)137     public int prevBoundary(int offset) {
138         checkOffsetIsValid(offset);
139         return mIterator.preceding(offset);
140     }
141 
142     /** If <code>offset</code> is within a word, returns the index of the first character of that
143      * word, otherwise returns BreakIterator.DONE.
144      *
145      * The offsets that are considered to be part of a word are the indexes of its characters,
146      * <i>as well as</i> the index of its last character plus one.
147      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
148      *
149      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
150      * The returned value is within [0..offset] or BreakIterator.DONE.
151      *
152      * @throws IllegalArgumentException is offset is not valid.
153      */
154     @UnsupportedAppUsage
getBeginning(int offset)155     public int getBeginning(int offset) {
156         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
157         // so this method can be removed.
158         return getBeginning(offset, false);
159     }
160 
161     /**
162      * If <code>offset</code> is within a word, returns the index of the last character of that
163      * word plus one, otherwise returns BreakIterator.DONE.
164      *
165      * The offsets that are considered to be part of a word are the indexes of its characters,
166      * <i>as well as</i> the index of its last character plus one.
167      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
168      *
169      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
170      * The returned value is within [offset..textLength] or BreakIterator.DONE.
171      *
172      * @throws IllegalArgumentException is offset is not valid.
173      */
174     @UnsupportedAppUsage
getEnd(int offset)175     public int getEnd(int offset) {
176         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
177         // so this method can be removed.
178         return getEnd(offset, false);
179     }
180 
181     /**
182      * If the <code>offset</code> is within a word or on a word boundary that can only be
183      * considered the start of a word (e.g. _word where "_" is any character that would not
184      * be considered part of the word) then this returns the index of the first character of
185      * that word.
186      *
187      * If the offset is on a word boundary that can be considered the start and end of a
188      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
189      * between AA and BB, this would return the start of the previous word, AA.
190      *
191      * Returns BreakIterator.DONE if there is no previous boundary.
192      *
193      * @throws IllegalArgumentException is offset is not valid.
194      */
195     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPrevWordBeginningOnTwoWordsBoundary(int offset)196     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
197         return getBeginning(offset, true);
198     }
199 
200     /**
201      * If the <code>offset</code> is within a word or on a word boundary that can only be
202      * considered the end of a word (e.g. word_ where "_" is any character that would not
203      * be considered part of the word) then this returns the index of the last character
204      * plus one of that word.
205      *
206      * If the offset is on a word boundary that can be considered the start and end of a
207      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
208      * between AA and BB, this would return the end of the next word, BB.
209      *
210      * Returns BreakIterator.DONE if there is no next boundary.
211      *
212      * @throws IllegalArgumentException is offset is not valid.
213      */
214     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getNextWordEndOnTwoWordBoundary(int offset)215     public int getNextWordEndOnTwoWordBoundary(int offset) {
216         return getEnd(offset, true);
217     }
218 
219     /**
220      * If the <code>offset</code> is within a word or on a word boundary that can only be
221      * considered the start of a word (e.g. _word where "_" is any character that would not
222      * be considered part of the word) then this returns the index of the first character of
223      * that word.
224      *
225      * If the offset is on a word boundary that can be considered the start and end of a
226      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
227      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
228      * return the start of the previous word, AA. Otherwise it would return the current offset,
229      * the start of BB.
230      *
231      * Returns BreakIterator.DONE if there is no previous boundary.
232      *
233      * @throws IllegalArgumentException is offset is not valid.
234      */
getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary)235     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
236         checkOffsetIsValid(offset);
237 
238         if (isOnLetterOrDigit(offset)) {
239             if (mIterator.isBoundary(offset)
240                     && (!isAfterLetterOrDigit(offset)
241                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
242                 return offset;
243             } else {
244                 return mIterator.preceding(offset);
245             }
246         } else {
247             if (isAfterLetterOrDigit(offset)) {
248                 return mIterator.preceding(offset);
249             }
250         }
251         return BreakIterator.DONE;
252     }
253 
254     /**
255      * If the <code>offset</code> is within a word or on a word boundary that can only be
256      * considered the end of a word (e.g. word_ where "_" is any character that would not be
257      * considered part of the word) then this returns the index of the last character plus one
258      * of that word.
259      *
260      * If the offset is on a word boundary that can be considered the start and end of a
261      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
262      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
263      * the end of the next word, BB. Otherwise it would return the current offset, the end
264      * of AA.
265      *
266      * Returns BreakIterator.DONE if there is no next boundary.
267      *
268      * @throws IllegalArgumentException is offset is not valid.
269      */
getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary)270     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
271         checkOffsetIsValid(offset);
272 
273         if (isAfterLetterOrDigit(offset)) {
274             if (mIterator.isBoundary(offset)
275                     && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
276                 return offset;
277             } else {
278                 return mIterator.following(offset);
279             }
280         } else {
281             if (isOnLetterOrDigit(offset)) {
282                 return mIterator.following(offset);
283             }
284         }
285         return BreakIterator.DONE;
286     }
287 
288     /**
289      * If <code>offset</code> is within a group of punctuation as defined by {@link
290      * TextUtils#isPunctuation(int)}, returns the index of the first character of that group,
291      * otherwise returns BreakIterator.DONE.
292      *
293      * @param offset the offset to search from.
294      */
295     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPunctuationBeginning(int offset)296     public int getPunctuationBeginning(int offset) {
297         checkOffsetIsValid(offset);
298         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
299             offset = prevBoundary(offset);
300         }
301         // No need to shift offset, prevBoundary handles that.
302         return offset;
303     }
304 
305     /**
306      * If <code>offset</code> is within a group of punctuation as defined by {@link
307      * TextUtils#isPunctuation(int)}, returns the index of the last character of that group plus
308      * one, otherwise returns BreakIterator.DONE.
309      *
310      * @param offset the offset to search from.
311      */
312     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPunctuationEnd(int offset)313     public int getPunctuationEnd(int offset) {
314         checkOffsetIsValid(offset);
315         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
316             offset = nextBoundary(offset);
317         }
318         // No need to shift offset, nextBoundary handles that.
319         return offset;
320     }
321 
322     /**
323      * Indicates if the provided offset is after a punctuation character as defined by {@link
324      * TextUtils#isPunctuation(int)}.
325      *
326      * @param offset the offset to check from.
327      * @return Whether the offset is after a punctuation character.
328      */
329     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isAfterPunctuation(int offset)330     public boolean isAfterPunctuation(int offset) {
331         if (mStart < offset && offset <= mEnd) {
332             final int codePoint = Character.codePointBefore(mCharSeq, offset);
333             return TextUtils.isPunctuation(codePoint);
334         }
335         return false;
336     }
337 
338     /**
339      * Indicates if the provided offset is at a punctuation character as defined by {@link
340      * TextUtils#isPunctuation(int)}.
341      *
342      * @param offset the offset to check from.
343      * @return Whether the offset is at a punctuation character.
344      */
345     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isOnPunctuation(int offset)346     public boolean isOnPunctuation(int offset) {
347         if (mStart <= offset && offset < mEnd) {
348             final int codePoint = Character.codePointAt(mCharSeq, offset);
349             return TextUtils.isPunctuation(codePoint);
350         }
351         return false;
352     }
353 
354     /**
355      * Indicates if the codepoint is a mid-word-only punctuation.
356      *
357      * At the moment, this is locale-independent, and includes all the characters in
358      * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
359      * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
360      * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
361      * in the middle of a word, but they become word breaks if they happen at the end of a word
362      * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise).
363      *
364      * @param locale the locale to consider the codepoint in. Presently ignored.
365      * @param codePoint the codepoint to check.
366      * @return True if the codepoint is a mid-word punctuation.
367      */
isMidWordPunctuation(Locale locale, int codePoint)368     public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
369         final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
370         return (wb == UCharacter.WordBreak.MIDLETTER
371                 || wb == UCharacter.WordBreak.MIDNUMLET
372                 || wb == UCharacter.WordBreak.SINGLE_QUOTE);
373     }
374 
isPunctuationStartBoundary(int offset)375     private boolean isPunctuationStartBoundary(int offset) {
376         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
377     }
378 
isPunctuationEndBoundary(int offset)379     private boolean isPunctuationEndBoundary(int offset) {
380         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
381     }
382 
isAfterLetterOrDigit(int offset)383     private boolean isAfterLetterOrDigit(int offset) {
384         if (mStart < offset && offset <= mEnd) {
385             final int codePoint = Character.codePointBefore(mCharSeq, offset);
386             if (Character.isLetterOrDigit(codePoint)) return true;
387         }
388         return false;
389     }
390 
isOnLetterOrDigit(int offset)391     private boolean isOnLetterOrDigit(int offset) {
392         if (mStart <= offset && offset < mEnd) {
393             final int codePoint = Character.codePointAt(mCharSeq, offset);
394             if (Character.isLetterOrDigit(codePoint)) return true;
395         }
396         return false;
397     }
398 
checkOffsetIsValid(int offset)399     private void checkOffsetIsValid(int offset) {
400         if (!(mStart <= offset && offset <= mEnd)) {
401             throw new IllegalArgumentException("Invalid offset: " + (offset) +
402                     ". Valid range is [" + mStart + ", " + mEnd + "]");
403         }
404     }
405 }
406