• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 package com.google.android.icing;
16 
17 import java.text.BreakIterator;
18 import java.util.ArrayList;
19 import java.util.List;
20 import java.util.Locale;
21 
22 /**
23  * A simple wrapper around BreakIterator that allows batching of multiple BreakIterator#next calls
24  * to reduce the number of necessary reverse jni calls.
25  *
26  * <p>Example: The text "我每天走路去上班。" has a length of 9 code units in UTF-16 and a length of 27 bytes in
27  * UTF-8. The text should be broken up into the following six terms when properly segmented: "我",
28  * "每天", "走路", "去", "上班", "。"
29  *
30  * <pre>{@code
31  * BreakIteratorBatcher brkItrBatcher = new BreakIteratorBatcher(Locale.US);
32  * brkItrBatcher.setText("我每天走路去上班。");
33  * int[] utf16Boundaries = brkItrBatcher.next(5);
34  * assertThat(utf16Boundaries).asList().containsExactly(1, 3, 5, 6, 8);
35  * utf16Boundaries = brkItrBatcher.next(5);
36  * assertThat(utf16Boundaries).asList().containsExactly(9);
37  * }</pre>
38  */
public class BreakIteratorBatcher {

  /** Word-boundary iterator that every method of this class delegates to. */
  private final BreakIterator iterator;

  /**
   * Creates a batcher backed by {@link BreakIterator#getWordInstance(Locale)}.
   *
   * @param locale the locale handed to the word-instance factory
   */
  public BreakIteratorBatcher(Locale locale) {
    this.iterator = BreakIterator.getWordInstance(locale);
  }

  /* Direct calls to BreakIterator */

  /**
   * Forwards to {@link BreakIterator#setText(String)}.
   *
   * @param text the text to be segmented by subsequent calls
   */
  public void setText(String text) {
    iterator.setText(text);
  }

  /**
   * Forwards to {@link BreakIterator#first()}.
   *
   * @return the value returned by the underlying iterator
   */
  public int first() {
    return iterator.first();
  }

  /**
   * Forwards to {@link BreakIterator#preceding(int)}.
   *
   * @param utf16Offset offset into the text, passed through unchanged
   * @return the value returned by the underlying iterator
   */
  public int preceding(int utf16Offset) {
    return iterator.preceding(utf16Offset);
  }

  /**
   * Forwards to {@link BreakIterator#following(int)}.
   *
   * @param utf16Offset offset into the text, passed through unchanged
   * @return the value returned by the underlying iterator
   */
  public int following(int utf16Offset) {
    return iterator.following(utf16Offset);
  }

  /**
   * Batched version of next. Calls {@link BreakIterator#next()} up to {@code batchSize} times and
   * returns the boundaries it produced, in order. If the iterator reports {@link
   * BreakIterator#DONE} before the batch is full, only the boundaries collected so far are
   * returned, so the result may be shorter than {@code batchSize} (and is empty when no boundary
   * is left).
   *
   * @param batchSize maximum number of boundaries to return; must not be negative
   * @return a newly allocated array holding between 0 and {@code batchSize} boundaries
   * @throws IllegalArgumentException if {@code batchSize} is negative
   */
  public int[] next(int batchSize) {
    if (batchSize < 0) {
      throw new IllegalArgumentException("batchSize must be non-negative: " + batchSize);
    }

    // Collect straight into a primitive array so no boundary is boxed into an Integer.
    int[] boundaries = new int[batchSize];
    int count = 0;
    while (count < batchSize) {
      int boundary = iterator.next();
      if (boundary == BreakIterator.DONE) {
        break;
      }
      boundaries[count++] = boundary;
    }

    if (count == batchSize) {
      // Full batch: the buffer already has exactly the right length.
      return boundaries;
    }

    // The text ended mid-batch; return only the filled prefix.
    int[] trimmed = new int[count];
    System.arraycopy(boundaries, 0, trimmed, 0, count);
    return trimmed;
  }
}
86