// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.android.icing;

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

/**
 * A simple wrapper around {@link BreakIterator} that allows batching of multiple
 * {@link BreakIterator#next()} calls to reduce the number of necessary reverse JNI calls.
 *
 * <p>Example: The text "我每天走路去上班。" has a length of 9 code units in UTF-16 and a length of 27
 * bytes in UTF-8. The text should be broken up into the following six terms when properly
 * segmented: "我", "每天", "走路", "去", "上班", "。"
 *
 * <pre>{@code
 * BreakIteratorBatcher brkItrBatcher = new BreakIteratorBatcher(Locale.US);
 * brkItrBatcher.setText("我每天走路去上班。");
 * int[] utf16Boundaries = brkItrBatcher.next(5);
 * assertThat(utf16Boundaries).asList().containsExactly(1, 3, 5, 6, 8);
 * utf16Boundaries = brkItrBatcher.next(5);
 * assertThat(utf16Boundaries).asList().containsExactly(9);
 * }</pre>
 */
public class BreakIteratorBatcher {

  /** Word-level break iterator that every method of this class delegates to. */
  private final BreakIterator iterator;

  /**
   * Creates a batcher backed by a word-instance {@link BreakIterator} for the given locale.
   *
   * @param locale the locale passed to {@link BreakIterator#getWordInstance(Locale)}
   */
  public BreakIteratorBatcher(Locale locale) {
    this.iterator = BreakIterator.getWordInstance(locale);
  }

  /* Direct calls to BreakIterator */

  /**
   * Sets the text to be segmented. Delegates to {@link BreakIterator#setText(String)}.
   *
   * @param text the text to scan for word boundaries
   */
  public void setText(String text) {
    iterator.setText(text);
  }

  /**
   * Delegates to {@link BreakIterator#first()}.
   *
   * @return the value returned by the underlying iterator (the first boundary offset)
   */
  public int first() {
    return iterator.first();
  }

  /**
   * Delegates to {@link BreakIterator#preceding(int)}.
   *
   * @param utf16Offset offset into the text, in UTF-16 code units
   * @return the value returned by the underlying iterator for {@code utf16Offset}
   */
  public int preceding(int utf16Offset) {
    return iterator.preceding(utf16Offset);
  }

  /**
   * Delegates to {@link BreakIterator#following(int)}.
   *
   * @param utf16Offset offset into the text, in UTF-16 code units
   * @return the value returned by the underlying iterator for {@code utf16Offset}
   */
  public int following(int utf16Offset) {
    return iterator.following(utf16Offset);
  }

  /**
   * Batched version of next. Returns an array of ints of up to size batchSize, reflecting the
   * return values of batchSize successful calls to {@link BreakIterator#next()}. If the
   * BreakIterator reaches the end of the text (returns {@link BreakIterator#DONE}), then only the
   * results of the previous calls in that batch will be returned.
   *
   * @param batchSize the maximum number of boundaries to collect; must not be negative
   * @return the collected boundaries in iteration order; empty when {@code batchSize} is zero or
   *     the iterator is already exhausted
   * @throws IllegalArgumentException if {@code batchSize} is negative
   */
  public int[] next(int batchSize) {
    if (batchSize < 0) {
      throw new IllegalArgumentException("batchSize must not be negative: " + batchSize);
    }
    // Collect straight into a primitive array; no Integer boxing is needed.
    int[] breakIndices = new int[batchSize];
    int count = 0;
    while (count < batchSize) {
      int boundary = iterator.next();
      if (boundary == BreakIterator.DONE) {
        break;
      }
      breakIndices[count++] = boundary;
    }
    // Trim only when the text ran out before the batch was filled.
    return count == batchSize ? breakIndices : Arrays.copyOf(breakIndices, count);
  }
}