/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.text.method;

import android.annotation.NonNull;
import android.compat.annotation.UnsupportedAppUsage;
import android.icu.lang.UCharacter;
import android.icu.lang.UProperty;
import android.icu.text.BreakIterator;
import android.icu.util.ULocale;
import android.os.Build;
import android.text.CharSequenceCharacterIterator;
import android.text.Selection;
import android.text.TextUtils;

import java.util.Locale;

/**
 * Walks through cursor positions at word boundaries. Internally uses
 * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
 * for performance reasons.
 *
 * <p>Also provides methods to determine word boundaries.
 *
 * <p>All offset-taking methods operate on the window {@code [mStart, mEnd]} that was set up by
 * the last call to {@link #setCharSequence(CharSequence, int, int)}.
 * {@hide}
 */
@android.ravenwood.annotation.RavenwoodKeepWholeClass
public class WordIterator implements Selection.PositionIterator {
    // Size of the window for the word iterator, should be greater than the longest word's length
    private static final int WINDOW_WIDTH = 50;

    // Inclusive bounds, in mCharSeq coordinates, of the region handed to mIterator.
    private int mStart;
    private int mEnd;
    private CharSequence mCharSeq;
    private final BreakIterator mIterator;

    /**
     * Constructs a WordIterator using the default locale.
     */
    public WordIterator() {
        this(Locale.getDefault());
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     * @param locale The locale to be used for analyzing the text.
     */
    @UnsupportedAppUsage
    public WordIterator(Locale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     * @param locale The locale to be used for analyzing the text.
     */
    public WordIterator(ULocale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Sets the text to analyze and the region of interest inside it.
     *
     * <p>The region actually handed to the underlying {@link BreakIterator} is the requested
     * range widened by {@code WINDOW_WIDTH} characters on each side and clamped to the bounds of
     * {@code charSequence}. That widened region becomes the valid offset range for every other
     * method of this class.
     *
     * @param charSequence the text to iterate over.
     * @param start the start of the region of interest.
     * @param end the end of the region of interest.
     * @throws IndexOutOfBoundsException if {@code start} is negative or {@code end} is greater
     *         than the length of {@code charSequence}.
     */
    @UnsupportedAppUsage
    public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
        if (start < 0 || end > charSequence.length()) {
            throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
        }
        mCharSeq = charSequence;
        mStart = Math.max(0, start - WINDOW_WIDTH);
        mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
        mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int preceding(int offset) {
        checkOffsetIsValid(offset);
        // Keep stepping back until the boundary sits right before a letter or digit, or the
        // iterator runs out of boundaries.
        do {
            offset = mIterator.preceding(offset);
        } while (offset != BreakIterator.DONE && !isOnLetterOrDigit(offset));
        return offset;
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int following(int offset) {
        checkOffsetIsValid(offset);
        // Keep stepping forward until the boundary sits right after a letter or digit, or the
        // iterator runs out of boundaries.
        do {
            offset = mIterator.following(offset);
        } while (offset != BreakIterator.DONE && !isAfterLetterOrDigit(offset));
        return offset;
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public boolean isBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.isBoundary(offset);
    }

    /**
     * Returns the position of next boundary after the given offset. Returns
     * {@code DONE} if there is no boundary after the given offset.
     *
     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int nextBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.following(offset);
    }

    /**
     * Returns the position of boundary preceding the given offset or
     * {@code DONE} if the given offset specifies the starting position.
     *
     * @param offset the given start position to search from.
     * @return the position of the last boundary preceding the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int prevBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.preceding(offset);
    }

    /** If <code>offset</code> is within a word, returns the index of the first character of that
     * word, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by
     * {@link #setCharSequence(CharSequence, int, int)} (note the inclusive upper bound).
     * The returned value is not greater than offset, or is BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getBeginning(int offset) {
        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
        // so this method can be removed.
        return getBeginning(offset, false);
    }

    /**
     * If <code>offset</code> is within a word, returns the index of the last character of that
     * word plus one, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by
     * {@link #setCharSequence(CharSequence, int, int)} (note the inclusive upper bound).
     * The returned value is not smaller than offset, or is BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getEnd(int offset) {
        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
        // so this method can be removed.
        return getEnd(offset, false);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the start of the previous word, AA.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
        return getBeginning(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not
     * be considered part of the word) then this returns the index of the last character
     * plus one of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the end of the next word, BB.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getNextWordEndOnTwoWordBoundary(int offset) {
        return getEnd(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
     * return the start of the previous word, AA. Otherwise it would return the current offset,
     * the start of BB.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
        checkOffsetIsValid(offset);

        if (isOnLetterOrDigit(offset)) {
            // The offset itself is the answer when it is a boundary and either nothing
            // word-like precedes it or the caller does not want the previous word.
            final boolean wordStartsHere = mIterator.isBoundary(offset)
                    && (!isAfterLetterOrDigit(offset)
                            || !getPrevWordBeginningOnTwoWordsBoundary);
            return wordStartsHere ? offset : mIterator.preceding(offset);
        }
        if (isAfterLetterOrDigit(offset)) {
            // Offset is just past the last character of a word.
            return mIterator.preceding(offset);
        }
        return BreakIterator.DONE;
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not be
     * considered part of the word) then this returns the index of the last character plus one
     * of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
     * the end of the next word, BB. Otherwise it would return the current offset, the end
     * of AA.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
        checkOffsetIsValid(offset);

        if (isAfterLetterOrDigit(offset)) {
            // The offset itself is the answer when it is a boundary and either nothing
            // word-like follows it or the caller does not want the next word.
            final boolean wordEndsHere = mIterator.isBoundary(offset)
                    && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary);
            return wordEndsHere ? offset : mIterator.following(offset);
        }
        if (isOnLetterOrDigit(offset)) {
            // Offset is right before the first character of a word.
            return mIterator.following(offset);
        }
        return BreakIterator.DONE;
    }

    /**
     * Searches backwards for the start of a group of punctuation as defined by {@link
     * TextUtils#isPunctuation(int)}.
     *
     * If <code>offset</code> is already the index of the first character of a punctuation group
     * it is returned as is. Otherwise the search moves to the previous boundary, repeatedly,
     * until such a position is found. Returns BreakIterator.DONE if the boundaries are exhausted
     * first.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getPunctuationBeginning(int offset) {
        checkOffsetIsValid(offset);
        while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
            offset = prevBoundary(offset);
        }
        // No need to shift offset, prevBoundary handles that.
        return offset;
    }

    /**
     * Searches forwards for the end of a group of punctuation as defined by {@link
     * TextUtils#isPunctuation(int)}.
     *
     * If <code>offset</code> is already the index of the last character of a punctuation group
     * plus one it is returned as is. Otherwise the search moves to the next boundary,
     * repeatedly, until such a position is found. Returns BreakIterator.DONE if the boundaries
     * are exhausted first.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getPunctuationEnd(int offset) {
        checkOffsetIsValid(offset);
        while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
            offset = nextBoundary(offset);
        }
        // No need to shift offset, nextBoundary handles that.
        return offset;
    }

    /**
     * Indicates if the provided offset is after a punctuation character as defined by {@link
     * TextUtils#isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is after a punctuation character. Always {@code false} when
     *         there is no character before the offset inside the current window.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public boolean isAfterPunctuation(int offset) {
        if (mStart < offset && offset <= mEnd) {
            final int codePoint = Character.codePointBefore(mCharSeq, offset);
            return TextUtils.isPunctuation(codePoint);
        }
        return false;
    }

    /**
     * Indicates if the provided offset is at a punctuation character as defined by {@link
     * TextUtils#isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is at a punctuation character. Always {@code false} when
     *         there is no character at the offset inside the current window.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public boolean isOnPunctuation(int offset) {
        if (mStart <= offset && offset < mEnd) {
            final int codePoint = Character.codePointAt(mCharSeq, offset);
            return TextUtils.isPunctuation(codePoint);
        }
        return false;
    }

    /**
     * Indicates if the codepoint is a mid-word-only punctuation.
     *
     * At the moment, this is locale-independent, and includes all the characters in
     * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
     * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
     * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
     * in the middle of a word, but they become word breaks if they happen at the end of a word
     * (according to rule WB999 that breaks word in any place that is not prohibited otherwise).
     *
     * @param locale the locale to consider the codepoint in. Presently ignored.
     * @param codePoint the codepoint to check.
     * @return True if the codepoint is a mid-word punctuation.
     */
    public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
        final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
        return (wb == UCharacter.WordBreak.MIDLETTER
                || wb == UCharacter.WordBreak.MIDNUMLET
                || wb == UCharacter.WordBreak.SINGLE_QUOTE);
    }

    /** Returns true if a punctuation character starts at offset and none ends there. */
    private boolean isPunctuationStartBoundary(int offset) {
        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    }

    /** Returns true if a punctuation character ends at offset and none starts there. */
    private boolean isPunctuationEndBoundary(int offset) {
        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    }

    /**
     * Returns true if the code point just before offset is a letter or digit. Returns false when
     * there is no character before the offset inside the current window.
     */
    private boolean isAfterLetterOrDigit(int offset) {
        if (mStart < offset && offset <= mEnd) {
            final int codePoint = Character.codePointBefore(mCharSeq, offset);
            return Character.isLetterOrDigit(codePoint);
        }
        return false;
    }

    /**
     * Returns true if the code point at offset is a letter or digit. Returns false when there is
     * no character at the offset inside the current window.
     */
    private boolean isOnLetterOrDigit(int offset) {
        if (mStart <= offset && offset < mEnd) {
            final int codePoint = Character.codePointAt(mCharSeq, offset);
            return Character.isLetterOrDigit(codePoint);
        }
        return false;
    }

    /**
     * Verifies that offset lies inside the current window, both ends included.
     *
     * @throws IllegalArgumentException if offset is smaller than mStart or greater than mEnd.
     */
    private void checkOffsetIsValid(int offset) {
        if (offset < mStart || offset > mEnd) {
            throw new IllegalArgumentException("Invalid offset: " + (offset) +
                    ". Valid range is [" + mStart + ", " + mEnd + "]");
        }
    }
}