1 /* 2 * Copyright (C) 2022 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package android.text; 18 19 import android.annotation.IntRange; 20 import android.annotation.NonNull; 21 import android.icu.text.BreakIterator; 22 import android.icu.util.ULocale; 23 import android.text.method.WordIterator; 24 25 /** 26 * Implementation of {@link SegmentFinder} using words as the text segment. Word boundaries are 27 * found using {@code WordIterator}. Whitespace characters are excluded, so they are not included in 28 * any text segments. 29 * 30 * <p>For example, the text "Hello, World!" would be subdivided into four text segments: "Hello", 31 * ",", "World", "!". The space character does not belong to any text segments. 32 * 33 * @see <a href="https://unicode.org/reports/tr29/#Word_Boundaries">Unicode Text Segmentation - Word 34 * Boundaries</a> 35 */ 36 @android.ravenwood.annotation.RavenwoodKeepWholeClass 37 public class WordSegmentFinder extends SegmentFinder { 38 private final CharSequence mText; 39 private final WordIterator mWordIterator; 40 41 /** 42 * Constructs a WordSegmentFinder instance for the specified text which uses the provided locale 43 * to determine word boundaries. 44 * 45 * @param text text to be segmented 46 * @param locale locale used for analyzing the text 47 */ WordSegmentFinder( @onNull CharSequence text, @NonNull ULocale locale)48 public WordSegmentFinder( 49 @NonNull CharSequence text, @NonNull ULocale locale) { 50 mText = text; 51 mWordIterator = new WordIterator(locale); 52 mWordIterator.setCharSequence(text, 0, text.length()); 53 } 54 55 /** 56 * Constructs a WordSegmentFinder instance for the specified text which uses the provided 57 * WordIterator to determine word boundaries. 58 * 59 * @param text text to be segmented 60 * @param wordIterator word iterator used to find word boundaries in the text 61 * @hide 62 */ WordSegmentFinder(@onNull CharSequence text, @NonNull WordIterator wordIterator)63 public WordSegmentFinder(@NonNull CharSequence text, @NonNull WordIterator wordIterator) { 64 mText = text; 65 mWordIterator = wordIterator; 66 } 67 68 @Override previousStartBoundary(@ntRangefrom = 0) int offset)69 public int previousStartBoundary(@IntRange(from = 0) int offset) { 70 int boundary = offset; 71 do { 72 boundary = mWordIterator.prevBoundary(boundary); 73 if (boundary == BreakIterator.DONE) { 74 return DONE; 75 } 76 } while (Character.isWhitespace(mText.charAt(boundary))); 77 return boundary; 78 } 79 80 @Override previousEndBoundary(@ntRangefrom = 0) int offset)81 public int previousEndBoundary(@IntRange(from = 0) int offset) { 82 int boundary = offset; 83 do { 84 boundary = mWordIterator.prevBoundary(boundary); 85 if (boundary == BreakIterator.DONE || boundary == 0) { 86 return DONE; 87 } 88 } while (Character.isWhitespace(mText.charAt(boundary - 1))); 89 return boundary; 90 } 91 92 @Override nextStartBoundary(@ntRangefrom = 0) int offset)93 public int nextStartBoundary(@IntRange(from = 0) int offset) { 94 int boundary = offset; 95 do { 96 boundary = mWordIterator.nextBoundary(boundary); 97 if (boundary == BreakIterator.DONE || boundary == mText.length()) { 98 return DONE; 99 } 100 } while (Character.isWhitespace(mText.charAt(boundary))); 101 return boundary; 102 } 103 104 @Override nextEndBoundary(@ntRangefrom = 0) int offset)105 public int nextEndBoundary(@IntRange(from = 0) int offset) { 106 int boundary = offset; 107 do { 108 boundary = mWordIterator.nextBoundary(boundary); 109 if (boundary == BreakIterator.DONE) { 110 return DONE; 111 } 112 } while (Character.isWhitespace(mText.charAt(boundary - 1))); 113 return boundary; 114 } 115 } 116