/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.text.method;

import android.annotation.NonNull;
import android.compat.annotation.UnsupportedAppUsage;
import android.icu.lang.UCharacter;
import android.icu.lang.UProperty;
import android.icu.text.BreakIterator;
import android.os.Build;
import android.text.CharSequenceCharacterIterator;
import android.text.Selection;

import java.util.Locale;

/**
 * Walks through cursor positions at word boundaries. Internally uses
 * {@link BreakIterator#getWordInstance(Locale)}, and caches the {@link CharSequence}
 * for performance reasons.
 *
 * <p>Also provides methods to determine word and punctuation boundaries.
 *
 * <p>All offsets accepted and returned by this class are indexes into the character sequence
 * passed to {@link #setCharSequence(CharSequence, int, int)}; only offsets inside the window
 * computed by that method are considered valid.
 *
 * {@hide}
 */
public class WordIterator implements Selection.PositionIterator {
    // Size of the window for the word iterator, should be greater than the longest word's length
    private static final int WINDOW_WIDTH = 50;

    // Inclusive lower bound of the window of mCharSeq that mIterator analyzes.
    private int mStart;
    // Upper bound of the window; a valid offset, but no character is read at this index.
    private int mEnd;
    private CharSequence mCharSeq;
    private final BreakIterator mIterator;

    /**
     * Constructs a WordIterator using the default locale.
     */
    public WordIterator() {
        this(Locale.getDefault());
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     *
     * @param locale The locale to be used for analyzing the text.
     */
    @UnsupportedAppUsage
    public WordIterator(Locale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Sets the text to analyze. The iterator works on a window of the sequence that extends
     * {@code WINDOW_WIDTH} characters before {@code start} and after {@code end}, clamped to
     * the bounds of the sequence.
     *
     * @param charSequence the text to iterate over.
     * @param start the start of the range of interest.
     * @param end the end of the range of interest.
     * @throws IndexOutOfBoundsException if {@code start} is negative or {@code end} is greater
     *         than the length of {@code charSequence}.
     */
    @UnsupportedAppUsage
    public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
        // NOTE(review): start <= end is not verified here; a start far beyond end yields an
        // empty/inverted window for which every later offset check fails. Confirm with callers
        // before tightening this validation.
        if (start < 0 || end > charSequence.length()) {
            throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
        }
        mCharSeq = charSequence;
        mStart = Math.max(0, start - WINDOW_WIDTH);
        mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
        mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int preceding(int offset) {
        checkOffsetIsValid(offset);
        // Skip boundaries that are not followed by a letter or digit (e.g. between spaces).
        int position = offset;
        do {
            position = mIterator.preceding(position);
        } while (position != BreakIterator.DONE && !isOnLetterOrDigit(position));
        return position;
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int following(int offset) {
        checkOffsetIsValid(offset);
        // Skip boundaries that are not preceded by a letter or digit.
        int position = offset;
        do {
            position = mIterator.following(position);
        } while (position != BreakIterator.DONE && !isAfterLetterOrDigit(position));
        return position;
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public boolean isBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.isBoundary(offset);
    }

    /**
     * Returns the position of next boundary after the given offset. Returns
     * {@code DONE} if there is no boundary after the given offset.
     *
     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int nextBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.following(offset);
    }

    /**
     * Returns the position of boundary preceding the given offset or
     * {@code DONE} if the given offset specifies the starting position.
     *
     * @param offset the given start position to search from.
     * @return the position of the last boundary preceding the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int prevBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.preceding(offset);
    }

    /** If <code>offset</code> is within a word, returns the index of the first character of that
     * word, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by
     * {@link #setCharSequence(CharSequence, int, int)} (note the inclusive upper bound).
     * The returned value is less than or equal to offset, or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getBeginning(int offset) {
        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
        // so this method can be removed.
        return getBeginning(offset, false);
    }

    /**
     * If <code>offset</code> is within a word, returns the index of the last character of that
     * word plus one, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by
     * {@link #setCharSequence(CharSequence, int, int)} (note the inclusive upper bound).
     * The returned value is greater than or equal to offset, or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getEnd(int offset) {
        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
        // so this method can be removed.
        return getEnd(offset, false);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the start of the previous word, AA.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
        return getBeginning(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not
     * be considered part of the word) then this returns the index of the last character
     * plus one of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the end of the next word, BB.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getNextWordEndOnTwoWordBoundary(int offset) {
        return getEnd(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
     * return the start of the previous word, AA. Otherwise it would return the current offset,
     * the start of BB.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
        checkOffsetIsValid(offset);

        if (isOnLetterOrDigit(offset)) {
            // The offset itself is the answer when it is a boundary and either nothing
            // word-like precedes it or the caller does not want the previous word.
            final boolean isWordStart = mIterator.isBoundary(offset)
                    && (!isAfterLetterOrDigit(offset)
                            || !getPrevWordBeginningOnTwoWordsBoundary);
            return isWordStart ? offset : mIterator.preceding(offset);
        }
        if (isAfterLetterOrDigit(offset)) {
            // Offset sits right after the last character of a word.
            return mIterator.preceding(offset);
        }
        return BreakIterator.DONE;
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not be
     * considered part of the word) then this returns the index of the last character plus one
     * of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
     * the end of the next word, BB. Otherwise it would return the current offset, the end
     * of AA.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
        checkOffsetIsValid(offset);

        if (isAfterLetterOrDigit(offset)) {
            // The offset itself is the answer when it is a boundary and either nothing
            // word-like follows it or the caller does not want the next word.
            final boolean isWordEnd = mIterator.isBoundary(offset)
                    && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary);
            return isWordEnd ? offset : mIterator.following(offset);
        }
        if (isOnLetterOrDigit(offset)) {
            // Offset sits right on the first character of a word.
            return mIterator.following(offset);
        }
        return BreakIterator.DONE;
    }

    /**
     * Searches backwards from <code>offset</code>, one boundary at a time, for the start of a
     * group of punctuation as defined by {@link #isPunctuation(int)}: a position that is on a
     * punctuation character and not preceded by one. Returns <code>offset</code> itself if it
     * already is such a position, or BreakIterator.DONE if no such position is found.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getPunctuationBeginning(int offset) {
        checkOffsetIsValid(offset);
        int position = offset;
        while (position != BreakIterator.DONE && !isPunctuationStartBoundary(position)) {
            // No need to shift the position manually, prevBoundary handles that.
            position = prevBoundary(position);
        }
        return position;
    }

    /**
     * Searches forwards from <code>offset</code>, one boundary at a time, for the end of a
     * group of punctuation as defined by {@link #isPunctuation(int)}: a position that is
     * preceded by a punctuation character and not on one. Returns <code>offset</code> itself if
     * it already is such a position, or BreakIterator.DONE if no such position is found.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public int getPunctuationEnd(int offset) {
        checkOffsetIsValid(offset);
        int position = offset;
        while (position != BreakIterator.DONE && !isPunctuationEndBoundary(position)) {
            // No need to shift the position manually, nextBoundary handles that.
            position = nextBoundary(position);
        }
        return position;
    }

    /**
     * Indicates if the provided offset is after a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is after a punctuation character. Always {@code false} when
     *         the offset has no preceding character inside the current window.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public boolean isAfterPunctuation(int offset) {
        return mStart < offset && offset <= mEnd
                && isPunctuation(Character.codePointBefore(mCharSeq, offset));
    }

    /**
     * Indicates if the provided offset is at a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is at a punctuation character. Always {@code false} when
     *         the offset does not index a character inside the current window.
     */
    @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
    public boolean isOnPunctuation(int offset) {
        return mStart <= offset && offset < mEnd
                && isPunctuation(Character.codePointAt(mCharSeq, offset));
    }

    /**
     * Indicates if the codepoint is a mid-word-only punctuation.
     *
     * At the moment, this is locale-independent, and includes all the characters in
     * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
     * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
     * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
     * in the middle of a word, but they become word breaks if they happen at the end of a word
     * (according to rule WB999 that breaks word in any place that is not prohibited otherwise).
     *
     * @param locale the locale to consider the codepoint in. Presently ignored.
     * @param codePoint the codepoint to check.
     * @return True if the codepoint is a mid-word punctuation.
     */
    public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
        final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
        return wb == UCharacter.WordBreak.MIDLETTER
                || wb == UCharacter.WordBreak.MIDNUMLET
                || wb == UCharacter.WordBreak.SINGLE_QUOTE;
    }

    /** Returns whether a punctuation group starts at offset (punctuation after, none before). */
    private boolean isPunctuationStartBoundary(int offset) {
        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    }

    /** Returns whether a punctuation group ends at offset (punctuation before, none after). */
    private boolean isPunctuationEndBoundary(int offset) {
        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    }

    /** Returns whether the code point belongs to one of the punctuation general categories. */
    private static boolean isPunctuation(int cp) {
        switch (Character.getType(cp)) {
            case Character.CONNECTOR_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.START_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Returns whether the code point ending right before offset is a letter or digit; false
     * when offset has no preceding character inside the current window.
     */
    private boolean isAfterLetterOrDigit(int offset) {
        return mStart < offset && offset <= mEnd
                && Character.isLetterOrDigit(Character.codePointBefore(mCharSeq, offset));
    }

    /**
     * Returns whether the code point starting at offset is a letter or digit; false when
     * offset does not index a character inside the current window.
     */
    private boolean isOnLetterOrDigit(int offset) {
        return mStart <= offset && offset < mEnd
                && Character.isLetterOrDigit(Character.codePointAt(mCharSeq, offset));
    }

    /**
     * Verifies that offset lies inside the current window, both bounds included.
     *
     * @throws IllegalArgumentException if offset is outside of [mStart, mEnd].
     */
    private void checkOffsetIsValid(int offset) {
        if (offset < mStart || offset > mEnd) {
            throw new IllegalArgumentException("Invalid offset: " + offset
                    + ". Valid range is [" + mStart + ", " + mEnd + "]");
        }
    }
}