1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/tokenization/icu/icu-language-segmenter.h"
16
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/absl_ports/mutex.h"
28 #include "icing/legacy/core/icing-string-util.h"
29 #include "icing/util/character-iterator.h"
30 #include "icing/util/i18n-utils.h"
31 #include "icing/util/status-macros.h"
32 #include "unicode/ubrk.h"
33 #include "unicode/uchar.h"
34 #include "unicode/umachine.h"
35
36 namespace icing {
37 namespace lib {
38
39 namespace {
40 constexpr char kASCIISpace = ' ';
41 } // namespace
42
43 class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
44 public:
45 // Factory function to create a segment iterator based on the given locale.
46 //
47 // Returns:
48 // An iterator on success
49 // INTERNAL_ERROR if unable to create
50 static libtextclassifier3::StatusOr<
51 std::unique_ptr<LanguageSegmenter::Iterator>>
Create(const IcuLanguageSegmenter * creator,UBreakIterator * break_iterator,std::string_view text,std::string_view locale)52 Create(const IcuLanguageSegmenter* creator, UBreakIterator* break_iterator,
53 std::string_view text, std::string_view locale) {
54 std::unique_ptr<IcuLanguageSegmenterIterator> iterator(
55 new IcuLanguageSegmenterIterator(creator, break_iterator, text,
56 locale));
57 if (iterator->Initialize()) {
58 return iterator;
59 }
60 return absl_ports::InternalError("Unable to create a term iterator");
61 }
62
~IcuLanguageSegmenterIterator()63 ~IcuLanguageSegmenterIterator() {
64 utext_close(u_text_);
65 creator_.ReturnBreakIterator(break_iterator_);
66 }
67
68 // Advances to the next term. Returns false if it has reached the end.
Advance()69 bool Advance() override {
70 // Prerequisite check
71 if (term_end_index_exclusive_ == UBRK_DONE) {
72 return false;
73 }
74
75 if (term_end_index_exclusive_ == 0) {
76 // First Advance() call
77 term_start_index_ = ubrk_first(break_iterator_);
78 } else {
79 term_start_index_ = term_end_index_exclusive_;
80 }
81 term_end_index_exclusive_ = ubrk_next(break_iterator_);
82
83 // Reached the end
84 if (term_end_index_exclusive_ == UBRK_DONE) {
85 MarkAsDone();
86 return false;
87 }
88
89 return true;
90 }
91
92 // Returns the current term. It can be called only when Advance() returns
93 // true.
GetTerm() const94 std::string_view GetTerm() const override {
95 int term_length = term_end_index_exclusive_ - term_start_index_;
96 if (term_end_index_exclusive_ == UBRK_DONE) {
97 term_length = 0;
98 } else if (text_[term_start_index_] == kASCIISpace) {
99 // Rule 3: multiple continuous whitespaces are treated as one.
100 term_length = 1;
101 }
102 return text_.substr(term_start_index_, term_length);
103 }
104
CalculateTermStart()105 libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
106 override {
107 if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
108 return absl_ports::AbortedError(
109 "Could not retrieve valid utf8 character!");
110 }
111 return offset_iterator_;
112 }
113
CalculateTermEndExclusive()114 libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
115 override {
116 if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) {
117 return absl_ports::AbortedError(
118 "Could not retrieve valid utf8 character!");
119 }
120 return offset_iterator_;
121 }
122
ResetToTermStartingAfterUtf32(int32_t offset)123 libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
124 int32_t offset) override {
125 if (offset < 0) {
126 // Very simple. The first term start after a negative offset is the first
127 // term. So just reset to start and Advance.
128 return ResetToStartUtf32();
129 }
130
131 // 1. Find the unicode character that contains the byte at offset.
132 if (!offset_iterator_.MoveToUtf32(offset)) {
133 // An error occurred. Mark as DONE
134 if (offset_iterator_.utf8_index() != text_.length()) {
135 // We returned false for some reason other than hitting the end. This is
136 // a real error. Just return.
137 MarkAsDone();
138 return absl_ports::AbortedError(
139 "Could not retrieve valid utf8 character!");
140 }
141 }
142 if (offset_iterator_.utf8_index() == text_.length()) {
143 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
144 "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
145 "bounds of string of length %zu",
146 offset_iterator_.utf32_index(), offset_iterator_.utf8_index(),
147 text_.length()));
148 }
149
150 // 2. We've got the unicode character containing byte offset. Now, we need
151 // to point to the segment that starts after this character.
152 int following_utf8_index =
153 ubrk_following(break_iterator_, offset_iterator_.utf8_index());
154 if (following_utf8_index == UBRK_DONE) {
155 MarkAsDone();
156 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
157 "No segments begin after provided offset %d.", offset));
158 }
159 term_end_index_exclusive_ = following_utf8_index;
160
161 // 3. The term_end_exclusive_ points to the start of the term that we want
162 // to return. We need to Advance so that term_start_ will now point to this
163 // term.
164 if (!Advance()) {
165 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
166 "No segments begin after provided offset %d.", offset));
167 }
168 if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
169 return absl_ports::AbortedError(
170 "Could not retrieve valid utf8 character!");
171 }
172 return offset_iterator_.utf32_index();
173 }
174
ResetToTermEndingBeforeUtf32(int32_t offset)175 libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
176 int32_t offset) override {
177 if (offset < 0) {
178 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
179 "Illegal offset provided! Offset %d is not within bounds of string "
180 "of length %zu",
181 offset, text_.length()));
182 }
183
184 if (!offset_iterator_.MoveToUtf32(offset)) {
185 // An error occurred. Mark as DONE
186 if (offset_iterator_.utf8_index() != text_.length()) {
187 // We returned false for some reason other than hitting the end. This is
188 // a real error. Just return.
189 MarkAsDone();
190 return absl_ports::AbortedError(
191 "Could not retrieve valid utf8 character!");
192 }
193 // If it returned false because we hit the end. Then that's fine. We'll
194 // just treat it as if the request was for the end.
195 }
196
197 // 2. We've got the unicode character containing byte offset. Now, we need
198 // to point to the segment that ends before this character.
199 int starting_utf8_index =
200 ubrk_preceding(break_iterator_, offset_iterator_.utf8_index());
201 if (starting_utf8_index == UBRK_DONE) {
202 // Rewind the end indices.
203 MarkAsDone();
204 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
205 "No segments end before provided offset %d.", offset));
206 }
207 term_start_index_ = starting_utf8_index;
208
209 // 3. We've correctly set the start index and the iterator currently points
210 // to that position. Now we need to find the correct end position and
211 // advance the iterator to that position.
212 int ending_utf8_index = ubrk_next(break_iterator_);
213 if (ending_utf8_index == UBRK_DONE) {
214 // This shouldn't ever happen.
215 MarkAsDone();
216 return absl_ports::AbortedError(IcingStringUtil::StringPrintf(
217 "No segments end before provided offset %d.", offset));
218 }
219 term_end_index_exclusive_ = ending_utf8_index;
220
221 // 4. The start and end indices point to a segment, but we need to ensure
222 // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
223 // need a segment prior to this one.
224 CharacterIterator term_start_iterator = offset_iterator_;
225 if (!term_start_iterator.MoveToUtf8(term_start_index_)) {
226 return absl_ports::AbortedError(
227 "Could not retrieve valid utf8 character!");
228 }
229 if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) {
230 return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
231 }
232 return term_start_iterator.utf32_index();
233 }
234
ResetToStartUtf32()235 libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
236 term_start_index_ = 0;
237 term_end_index_exclusive_ = 0;
238 if (!Advance()) {
239 return absl_ports::NotFoundError(
240 "Unable to find any valid terms in text.");
241 }
242 if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
243 return absl_ports::AbortedError(
244 "Could not retrieve valid utf8 character!");
245 }
246 return offset_iterator_.utf32_index();
247 }
248
249 private:
IcuLanguageSegmenterIterator(const IcuLanguageSegmenter * creator,UBreakIterator * break_iterator,std::string_view text,std::string_view locale)250 explicit IcuLanguageSegmenterIterator(const IcuLanguageSegmenter* creator,
251 UBreakIterator* break_iterator,
252 std::string_view text,
253 std::string_view locale)
254 : creator_(*creator),
255 break_iterator_(break_iterator),
256 text_(text),
257 locale_(locale),
258 u_text_(nullptr),
259 offset_iterator_(text),
260 term_start_index_(0),
261 term_end_index_exclusive_(0) {}
262
263 // Returns true on success
Initialize()264 bool Initialize() {
265 if (break_iterator_ == nullptr) {
266 return false;
267 }
268 UErrorCode status = U_ZERO_ERROR;
269 u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status);
270 if (u_text_ == nullptr) {
271 return false;
272 }
273 ubrk_setUText(break_iterator_, u_text_, &status);
274 return !U_FAILURE(status);
275 }
276
ResetToTermStartingBefore(int32_t offset)277 libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) {
278 term_start_index_ = ubrk_preceding(break_iterator_, offset);
279 if (term_start_index_ == UBRK_DONE) {
280 MarkAsDone();
281 return absl_ports::NotFoundError("");
282 }
283 term_end_index_exclusive_ = ubrk_next(break_iterator_);
284 if (term_end_index_exclusive_ == UBRK_DONE) {
285 MarkAsDone();
286 return absl_ports::NotFoundError("");
287 }
288 return libtextclassifier3::Status::OK;
289 }
290
291 // Ensures that all members are consistent with the 'Done' state.
292 // In the 'Done' state, term_start_index_ will point to the first character
293 // and term_end_index_exclusive_ will be marked with the kDone value.
294 // break_iterator_ may be in any state.
MarkAsDone()295 void MarkAsDone() {
296 term_end_index_exclusive_ = UBRK_DONE;
297 term_start_index_ = 0;
298 }
299
300 const IcuLanguageSegmenter& creator_; // Does not own.
301
302 // The underlying class that does the segmentation, ubrk_close() must be
303 // called after using.
304 UBreakIterator* break_iterator_; // Does not own
305
306 // Text to be segmented
307 std::string_view text_;
308
309 // Locale of the input text, used to help segment more accurately. If a
310 // wrong locale is set, text could probably still be segmented correctly
311 // because the default break iterator behavior is used for most locales.
312 std::string_view locale_;
313
314 // A thin wrapper around the input UTF8 text, needed by break_iterator_.
315 // Allocated by calling utext_openUtf8() and freed by calling utext_close().
316 UText* u_text_;
317
318 // Offset iterator. This iterator is not guaranteed to point to any particular
319 // character, but is guaranteed to point to a valid UTF character sequence.
320 //
321 // This iterator is used to save some amount of linear traversal when seeking
322 // to a specific UTF-32 offset. Each function that uses it could just create
323 // a CharacterIterator starting at the beginning of the text and traverse
324 // forward from there.
325 CharacterIterator offset_iterator_;
326
327 // The start and end indices are used to track the positions of current
328 // term.
329 int term_start_index_;
330 int term_end_index_exclusive_;
331 };
332
333 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>>
Create(std::string && locale)334 IcuLanguageSegmenter::Create(std::string&& locale) {
335 UErrorCode status = U_ZERO_ERROR;
336 UBreakIterator* break_iterator = ubrk_open(
337 UBRK_WORD, locale.c_str(), /*text=*/nullptr, /*textLength=*/0, &status);
338 if (U_FAILURE(status) || break_iterator == nullptr) {
339 return absl_ports::AbortedError(
340 "Unable to create ICU break_iterator for language segmentation");
341 }
342 return std::unique_ptr<IcuLanguageSegmenter>(
343 new IcuLanguageSegmenter(std::move(locale), break_iterator));
344 }
345
ProduceBreakIterator() const346 UBreakIterator* IcuLanguageSegmenter::ProduceBreakIterator() const {
347 UBreakIterator* itr = nullptr;
348 {
349 absl_ports::unique_lock l(&mutex_);
350 if (cached_break_iterator_ != nullptr) {
351 itr = cached_break_iterator_;
352 cached_break_iterator_ = nullptr;
353 }
354 }
355 if (itr == nullptr) {
356 UErrorCode status = U_ZERO_ERROR;
357 itr = ubrk_open(UBRK_WORD, locale_.c_str(), /*text=*/nullptr,
358 /*textLength=*/0, &status);
359 if (U_FAILURE(status)) {
360 itr = nullptr;
361 }
362 }
363 return itr;
364 }
365
ReturnBreakIterator(UBreakIterator * itr) const366 void IcuLanguageSegmenter::ReturnBreakIterator(UBreakIterator* itr) const {
367 {
368 absl_ports::unique_lock l(&mutex_);
369 if (cached_break_iterator_ == nullptr) {
370 cached_break_iterator_ = itr;
371 return;
372 }
373 }
374 ubrk_close(itr);
375 }
376
377 libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
Segment(const std::string_view text) const378 IcuLanguageSegmenter::Segment(const std::string_view text) const {
379 return IcuLanguageSegmenterIterator::Create(this, ProduceBreakIterator(),
380 text, locale_);
381 }
382
383 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetAllTerms(const std::string_view text) const384 IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
385 ICING_ASSIGN_OR_RETURN(
386 std::unique_ptr<LanguageSegmenter::Iterator> iterator,
387 Segment(text));
388 std::vector<std::string_view> terms;
389 while (iterator->Advance()) {
390 terms.push_back(iterator->GetTerm());
391 }
392 return terms;
393 }
394
395 } // namespace lib
396 } // namespace icing
397