1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/tokenization/icu/icu-language-segmenter.h"
16
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/legacy/core/icing-string-util.h"
28 #include "icing/util/character-iterator.h"
29 #include "icing/util/i18n-utils.h"
30 #include "icing/util/status-macros.h"
31 #include "unicode/ubrk.h"
32 #include "unicode/uchar.h"
33 #include "unicode/umachine.h"
34
35 namespace icing {
36 namespace lib {
37
38 namespace {
39 constexpr char kASCIISpace = ' ';
40 } // namespace
41
42 class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
43 public:
44 // Factory function to create a segment iterator based on the given locale.
45 //
46 // Returns:
47 // An iterator on success
48 // INTERNAL_ERROR if unable to create
49 static libtextclassifier3::StatusOr<
50 std::unique_ptr<LanguageSegmenter::Iterator>>
Create(std::string_view text,std::string_view locale)51 Create(std::string_view text, std::string_view locale) {
52 std::unique_ptr<IcuLanguageSegmenterIterator> iterator(
53 new IcuLanguageSegmenterIterator(text, locale));
54 if (iterator->Initialize()) {
55 return iterator;
56 }
57 return absl_ports::InternalError("Unable to create a term iterator");
58 }
59
~IcuLanguageSegmenterIterator()60 ~IcuLanguageSegmenterIterator() {
61 ubrk_close(break_iterator_);
62 utext_close(&u_text_);
63 }
64
65 // Advances to the next term. Returns false if it has reached the end.
Advance()66 bool Advance() override {
67 // Prerequisite check
68 if (term_end_index_exclusive_ == UBRK_DONE) {
69 return false;
70 }
71
72 if (term_end_index_exclusive_ == 0) {
73 // First Advance() call
74 term_start_index_ = ubrk_first(break_iterator_);
75 } else {
76 term_start_index_ = term_end_index_exclusive_;
77 }
78 term_end_index_exclusive_ = ubrk_next(break_iterator_);
79
80 // Reached the end
81 if (term_end_index_exclusive_ == UBRK_DONE) {
82 MarkAsDone();
83 return false;
84 }
85
86 if (!IsValidSegment()) {
87 return Advance();
88 }
89 return true;
90 }
91
92 // Returns the current term. It can be called only when Advance() returns
93 // true.
GetTerm() const94 std::string_view GetTerm() const override {
95 int term_length = term_end_index_exclusive_ - term_start_index_;
96 if (term_end_index_exclusive_ == UBRK_DONE) {
97 term_length = 0;
98 } else if (text_[term_start_index_] == kASCIISpace) {
99 // Rule 3: multiple continuous whitespaces are treated as one.
100 term_length = 1;
101 }
102 return text_.substr(term_start_index_, term_length);
103 }
104
CalculateTermStart()105 libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
106 override {
107 if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
108 return absl_ports::AbortedError(
109 "Could not retrieve valid utf8 character!");
110 }
111 return offset_iterator_;
112 }
113
CalculateTermEndExclusive()114 libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
115 override {
116 if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) {
117 return absl_ports::AbortedError(
118 "Could not retrieve valid utf8 character!");
119 }
120 return offset_iterator_;
121 }
122
ResetToTermStartingAfterUtf32(int32_t offset)123 libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
124 int32_t offset) override {
125 if (offset < 0) {
126 // Very simple. The first term start after a negative offset is the first
127 // term. So just reset to start and Advance.
128 return ResetToStartUtf32();
129 }
130
131 // 1. Find the unicode character that contains the byte at offset.
132 if (!offset_iterator_.MoveToUtf32(offset)) {
133 // An error occurred. Mark as DONE
134 if (offset_iterator_.utf8_index() != text_.length()) {
135 // We returned false for some reason other than hitting the end. This is
136 // a real error. Just return.
137 MarkAsDone();
138 return absl_ports::AbortedError(
139 "Could not retrieve valid utf8 character!");
140 }
141 }
142 if (offset_iterator_.utf8_index() == text_.length()) {
143 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
144 "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
145 "bounds of string of length %zu",
146 offset_iterator_.utf32_index(), offset_iterator_.utf8_index(),
147 text_.length()));
148 }
149
150 // 2. We've got the unicode character containing byte offset. Now, we need
151 // to point to the segment that starts after this character.
152 int following_utf8_index =
153 ubrk_following(break_iterator_, offset_iterator_.utf8_index());
154 if (following_utf8_index == UBRK_DONE) {
155 MarkAsDone();
156 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
157 "No segments begin after provided offset %d.", offset));
158 }
159 term_end_index_exclusive_ = following_utf8_index;
160
161 // 3. The term_end_exclusive_ points to the start of the term that we want
162 // to return. We need to Advance so that term_start_ will now point to this
163 // term.
164 if (!Advance()) {
165 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
166 "No segments begin after provided offset %d.", offset));
167 }
168 if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
169 return absl_ports::AbortedError(
170 "Could not retrieve valid utf8 character!");
171 }
172 return offset_iterator_.utf32_index();
173 }
174
ResetToTermEndingBeforeUtf32(int32_t offset)175 libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
176 int32_t offset) override {
177 if (offset < 0) {
178 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
179 "Illegal offset provided! Offset %d is not within bounds of string "
180 "of length %zu",
181 offset, text_.length()));
182 }
183
184 if (!offset_iterator_.MoveToUtf32(offset)) {
185 // An error occurred. Mark as DONE
186 if (offset_iterator_.utf8_index() != text_.length()) {
187 // We returned false for some reason other than hitting the end. This is
188 // a real error. Just return.
189 MarkAsDone();
190 return absl_ports::AbortedError(
191 "Could not retrieve valid utf8 character!");
192 }
193 // If it returned false because we hit the end. Then that's fine. We'll
194 // just treat it as if the request was for the end.
195 }
196
197 // 2. We've got the unicode character containing byte offset. Now, we need
198 // to point to the segment that ends before this character.
199 int starting_utf8_index =
200 ubrk_preceding(break_iterator_, offset_iterator_.utf8_index());
201 if (starting_utf8_index == UBRK_DONE) {
202 // Rewind the end indices.
203 MarkAsDone();
204 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
205 "No segments end before provided offset %d.", offset));
206 }
207 term_start_index_ = starting_utf8_index;
208
209 // 3. We've correctly set the start index and the iterator currently points
210 // to that position. Now we need to find the correct end position and
211 // advance the iterator to that position.
212 int ending_utf8_index = ubrk_next(break_iterator_);
213 if (ending_utf8_index == UBRK_DONE) {
214 // This shouldn't ever happen.
215 MarkAsDone();
216 return absl_ports::AbortedError(IcingStringUtil::StringPrintf(
217 "No segments end before provided offset %d.", offset));
218 }
219 term_end_index_exclusive_ = ending_utf8_index;
220
221 // 4. The start and end indices point to a segment, but we need to ensure
222 // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
223 // need a segment prior to this one.
224 CharacterIterator term_start_iterator = offset_iterator_;
225 if (!term_start_iterator.MoveToUtf8(term_start_index_)) {
226 return absl_ports::AbortedError(
227 "Could not retrieve valid utf8 character!");
228 }
229 if (term_end_index_exclusive_ > offset_iterator_.utf8_index() ||
230 !IsValidSegment()) {
231 return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
232 }
233 return term_start_iterator.utf32_index();
234 }
235
ResetToStartUtf32()236 libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
237 term_start_index_ = 0;
238 term_end_index_exclusive_ = 0;
239 if (!Advance()) {
240 return absl_ports::NotFoundError(
241 "Unable to find any valid terms in text.");
242 }
243 if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
244 return absl_ports::AbortedError(
245 "Could not retrieve valid utf8 character!");
246 }
247 return offset_iterator_.utf32_index();
248 }
249
250 private:
IcuLanguageSegmenterIterator(std::string_view text,std::string_view locale)251 explicit IcuLanguageSegmenterIterator(std::string_view text,
252 std::string_view locale)
253 : break_iterator_(nullptr),
254 text_(text),
255 locale_(locale),
256 u_text_(UTEXT_INITIALIZER),
257 offset_iterator_(text),
258 term_start_index_(0),
259 term_end_index_exclusive_(0) {}
260
261 // Returns true on success
Initialize()262 bool Initialize() {
263 UErrorCode status = U_ZERO_ERROR;
264 utext_openUTF8(&u_text_, text_.data(), text_.length(), &status);
265 break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
266 /*textLength=*/0, &status);
267 ubrk_setUText(break_iterator_, &u_text_, &status);
268 return !U_FAILURE(status);
269 }
270
ResetToTermStartingBefore(int32_t offset)271 libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) {
272 term_start_index_ = ubrk_preceding(break_iterator_, offset);
273 if (term_start_index_ == UBRK_DONE) {
274 MarkAsDone();
275 return absl_ports::NotFoundError("");
276 }
277 term_end_index_exclusive_ = ubrk_next(break_iterator_);
278 if (term_end_index_exclusive_ == UBRK_DONE) {
279 MarkAsDone();
280 return absl_ports::NotFoundError("");
281 }
282 return libtextclassifier3::Status::OK;
283 }
284
285 // Ensures that all members are consistent with the 'Done' state.
286 // In the 'Done' state, term_start_index_ will point to the first character
287 // and term_end_index_exclusive_ will be marked with the kDone value.
288 // break_iterator_ may be in any state.
MarkAsDone()289 void MarkAsDone() {
290 term_end_index_exclusive_ = UBRK_DONE;
291 term_start_index_ = 0;
292 }
293
IsValidSegment() const294 bool IsValidSegment() const {
295 // Rule 1: all ASCII terms will be returned.
296 // We know it's a ASCII term by checking the first char.
297 if (i18n_utils::IsAscii(text_[term_start_index_])) {
298 return true;
299 }
300
301 UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
302 term_start_index_);
303 // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
304 // We know it's an alphabetic term by checking the first unicode character.
305 if (u_isUAlphabetic(uchar32)) {
306 return true;
307 }
308 return false;
309 }
310
311 // The underlying class that does the segmentation, ubrk_close() must be
312 // called after using.
313 UBreakIterator* break_iterator_;
314
315 // Text to be segmented
316 std::string_view text_;
317
318 // Locale of the input text, used to help segment more accurately. If a
319 // wrong locale is set, text could probably still be segmented correctly
320 // because the default break iterator behavior is used for most locales.
321 std::string_view locale_;
322
323 // A thin wrapper around the input UTF8 text, needed by break_iterator_.
324 // utext_close() must be called after using.
325 UText u_text_;
326
327 // Offset iterator. This iterator is not guaranteed to point to any particular
328 // character, but is guaranteed to point to a valid UTF character sequence.
329 //
330 // This iterator is used to save some amount of linear traversal when seeking
331 // to a specific UTF-32 offset. Each function that uses it could just create
332 // a CharacterIterator starting at the beginning of the text and traverse
333 // forward from there.
334 CharacterIterator offset_iterator_;
335
336 // The start and end indices are used to track the positions of current
337 // term.
338 int term_start_index_;
339 int term_end_index_exclusive_;
340 };
341
IcuLanguageSegmenter(std::string locale)342 IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale)
343 : locale_(std::move(locale)) {}
344
345 libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
Segment(const std::string_view text) const346 IcuLanguageSegmenter::Segment(const std::string_view text) const {
347 return IcuLanguageSegmenterIterator::Create(text, locale_);
348 }
349
350 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetAllTerms(const std::string_view text) const351 IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
352 ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
353 Segment(text));
354 std::vector<std::string_view> terms;
355 while (iterator->Advance()) {
356 terms.push_back(iterator->GetTerm());
357 }
358 return terms;
359 }
360
361 } // namespace lib
362 } // namespace icing
363