• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/tokenization/icu/icu-language-segmenter.h"
16 
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/absl_ports/mutex.h"
28 #include "icing/legacy/core/icing-string-util.h"
29 #include "icing/util/character-iterator.h"
30 #include "icing/util/i18n-utils.h"
31 #include "icing/util/status-macros.h"
32 #include "unicode/ubrk.h"
33 #include "unicode/uchar.h"
34 #include "unicode/umachine.h"
35 
36 namespace icing {
37 namespace lib {
38 
39 namespace {
40 constexpr char kASCIISpace = ' ';
41 }  // namespace
42 
43 class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
44  public:
45   // Factory function to create a segment iterator based on the given locale.
46   //
47   // Returns:
48   //   An iterator on success
49   //   INTERNAL_ERROR if unable to create
50   static libtextclassifier3::StatusOr<
51       std::unique_ptr<LanguageSegmenter::Iterator>>
Create(const IcuLanguageSegmenter * creator,UBreakIterator * break_iterator,std::string_view text,std::string_view locale)52   Create(const IcuLanguageSegmenter* creator, UBreakIterator* break_iterator,
53          std::string_view text, std::string_view locale) {
54     std::unique_ptr<IcuLanguageSegmenterIterator> iterator(
55         new IcuLanguageSegmenterIterator(creator, break_iterator, text,
56                                          locale));
57     if (iterator->Initialize()) {
58       return iterator;
59     }
60     return absl_ports::InternalError("Unable to create a term iterator");
61   }
62 
~IcuLanguageSegmenterIterator()63   ~IcuLanguageSegmenterIterator() {
64     utext_close(u_text_);
65     creator_.ReturnBreakIterator(break_iterator_);
66   }
67 
68   // Advances to the next term. Returns false if it has reached the end.
Advance()69   bool Advance() override {
70     // Prerequisite check
71     if (term_end_index_exclusive_ == UBRK_DONE) {
72       return false;
73     }
74 
75     if (term_end_index_exclusive_ == 0) {
76       // First Advance() call
77       term_start_index_ = ubrk_first(break_iterator_);
78     } else {
79       term_start_index_ = term_end_index_exclusive_;
80     }
81     term_end_index_exclusive_ = ubrk_next(break_iterator_);
82 
83     // Reached the end
84     if (term_end_index_exclusive_ == UBRK_DONE) {
85       MarkAsDone();
86       return false;
87     }
88 
89     return true;
90   }
91 
92   // Returns the current term. It can be called only when Advance() returns
93   // true.
GetTerm() const94   std::string_view GetTerm() const override {
95     int term_length = term_end_index_exclusive_ - term_start_index_;
96     if (term_end_index_exclusive_ == UBRK_DONE) {
97       term_length = 0;
98     } else if (text_[term_start_index_] == kASCIISpace) {
99       // Rule 3: multiple continuous whitespaces are treated as one.
100       term_length = 1;
101     }
102     return text_.substr(term_start_index_, term_length);
103   }
104 
CalculateTermStart()105   libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
106       override {
107     if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
108       return absl_ports::AbortedError(
109           "Could not retrieve valid utf8 character!");
110     }
111     return offset_iterator_;
112   }
113 
CalculateTermEndExclusive()114   libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
115       override {
116     if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) {
117       return absl_ports::AbortedError(
118           "Could not retrieve valid utf8 character!");
119     }
120     return offset_iterator_;
121   }
122 
ResetToTermStartingAfterUtf32(int32_t offset)123   libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
124       int32_t offset) override {
125     if (offset < 0) {
126       // Very simple. The first term start after a negative offset is the first
127       // term. So just reset to start and Advance.
128       return ResetToStartUtf32();
129     }
130 
131     // 1. Find the unicode character that contains the byte at offset.
132     if (!offset_iterator_.MoveToUtf32(offset)) {
133       // An error occurred. Mark as DONE
134       if (offset_iterator_.utf8_index() != text_.length()) {
135         // We returned false for some reason other than hitting the end. This is
136         // a real error. Just return.
137         MarkAsDone();
138         return absl_ports::AbortedError(
139             "Could not retrieve valid utf8 character!");
140       }
141     }
142     if (offset_iterator_.utf8_index() == text_.length()) {
143       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
144           "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
145           "bounds of string of length %zu",
146           offset_iterator_.utf32_index(), offset_iterator_.utf8_index(),
147           text_.length()));
148     }
149 
150     // 2. We've got the unicode character containing byte offset. Now, we need
151     // to point to the segment that starts after this character.
152     int following_utf8_index =
153         ubrk_following(break_iterator_, offset_iterator_.utf8_index());
154     if (following_utf8_index == UBRK_DONE) {
155       MarkAsDone();
156       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
157           "No segments begin after provided offset %d.", offset));
158     }
159     term_end_index_exclusive_ = following_utf8_index;
160 
161     // 3. The term_end_exclusive_ points to the start of the term that we want
162     // to return. We need to Advance so that term_start_ will now point to this
163     // term.
164     if (!Advance()) {
165       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
166           "No segments begin after provided offset %d.", offset));
167     }
168     if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
169       return absl_ports::AbortedError(
170           "Could not retrieve valid utf8 character!");
171     }
172     return offset_iterator_.utf32_index();
173   }
174 
ResetToTermEndingBeforeUtf32(int32_t offset)175   libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
176       int32_t offset) override {
177     if (offset < 0) {
178       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
179           "Illegal offset provided! Offset %d is not within bounds of string "
180           "of length %zu",
181           offset, text_.length()));
182     }
183 
184     if (!offset_iterator_.MoveToUtf32(offset)) {
185       // An error occurred. Mark as DONE
186       if (offset_iterator_.utf8_index() != text_.length()) {
187         // We returned false for some reason other than hitting the end. This is
188         // a real error. Just return.
189         MarkAsDone();
190         return absl_ports::AbortedError(
191             "Could not retrieve valid utf8 character!");
192       }
193       // If it returned false because we hit the end. Then that's fine. We'll
194       // just treat it as if the request was for the end.
195     }
196 
197     // 2. We've got the unicode character containing byte offset. Now, we need
198     // to point to the segment that ends before this character.
199     int starting_utf8_index =
200         ubrk_preceding(break_iterator_, offset_iterator_.utf8_index());
201     if (starting_utf8_index == UBRK_DONE) {
202       // Rewind the end indices.
203       MarkAsDone();
204       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
205           "No segments end before provided offset %d.", offset));
206     }
207     term_start_index_ = starting_utf8_index;
208 
209     // 3. We've correctly set the start index and the iterator currently points
210     // to that position. Now we need to find the correct end position and
211     // advance the iterator to that position.
212     int ending_utf8_index = ubrk_next(break_iterator_);
213     if (ending_utf8_index == UBRK_DONE) {
214       // This shouldn't ever happen.
215       MarkAsDone();
216       return absl_ports::AbortedError(IcingStringUtil::StringPrintf(
217           "No segments end before provided offset %d.", offset));
218     }
219     term_end_index_exclusive_ = ending_utf8_index;
220 
221     // 4. The start and end indices point to a segment, but we need to ensure
222     // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
223     // need a segment prior to this one.
224     CharacterIterator term_start_iterator = offset_iterator_;
225     if (!term_start_iterator.MoveToUtf8(term_start_index_)) {
226       return absl_ports::AbortedError(
227           "Could not retrieve valid utf8 character!");
228     }
229     if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) {
230       return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
231     }
232     return term_start_iterator.utf32_index();
233   }
234 
ResetToStartUtf32()235   libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
236     term_start_index_ = 0;
237     term_end_index_exclusive_ = 0;
238     if (!Advance()) {
239       return absl_ports::NotFoundError(
240           "Unable to find any valid terms in text.");
241     }
242     if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
243       return absl_ports::AbortedError(
244           "Could not retrieve valid utf8 character!");
245     }
246     return offset_iterator_.utf32_index();
247   }
248 
249  private:
IcuLanguageSegmenterIterator(const IcuLanguageSegmenter * creator,UBreakIterator * break_iterator,std::string_view text,std::string_view locale)250   explicit IcuLanguageSegmenterIterator(const IcuLanguageSegmenter* creator,
251                                         UBreakIterator* break_iterator,
252                                         std::string_view text,
253                                         std::string_view locale)
254       : creator_(*creator),
255         break_iterator_(break_iterator),
256         text_(text),
257         locale_(locale),
258         u_text_(nullptr),
259         offset_iterator_(text),
260         term_start_index_(0),
261         term_end_index_exclusive_(0) {}
262 
263   // Returns true on success
Initialize()264   bool Initialize() {
265     if (break_iterator_ == nullptr) {
266       return false;
267     }
268     UErrorCode status = U_ZERO_ERROR;
269     u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status);
270     if (u_text_ == nullptr) {
271       return false;
272     }
273     ubrk_setUText(break_iterator_, u_text_, &status);
274     return !U_FAILURE(status);
275   }
276 
ResetToTermStartingBefore(int32_t offset)277   libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) {
278     term_start_index_ = ubrk_preceding(break_iterator_, offset);
279     if (term_start_index_ == UBRK_DONE) {
280       MarkAsDone();
281       return absl_ports::NotFoundError("");
282     }
283     term_end_index_exclusive_ = ubrk_next(break_iterator_);
284     if (term_end_index_exclusive_ == UBRK_DONE) {
285       MarkAsDone();
286       return absl_ports::NotFoundError("");
287     }
288     return libtextclassifier3::Status::OK;
289   }
290 
291   // Ensures that all members are consistent with the 'Done' state.
292   // In the 'Done' state, term_start_index_ will point to the first character
293   // and term_end_index_exclusive_ will be marked with the kDone value.
294   // break_iterator_ may be in any state.
MarkAsDone()295   void MarkAsDone() {
296     term_end_index_exclusive_ = UBRK_DONE;
297     term_start_index_ = 0;
298   }
299 
300   const IcuLanguageSegmenter& creator_;  // Does not own.
301 
302   // The underlying class that does the segmentation, ubrk_close() must be
303   // called after using.
304   UBreakIterator* break_iterator_;  // Does not own
305 
306   // Text to be segmented
307   std::string_view text_;
308 
309   // Locale of the input text, used to help segment more accurately. If a
310   // wrong locale is set, text could probably still be segmented correctly
311   // because the default break iterator behavior is used for most locales.
312   std::string_view locale_;
313 
314   // A thin wrapper around the input UTF8 text, needed by break_iterator_.
315   // Allocated by calling utext_openUtf8() and freed by calling utext_close().
316   UText* u_text_;
317 
318   // Offset iterator. This iterator is not guaranteed to point to any particular
319   // character, but is guaranteed to point to a valid UTF character sequence.
320   //
321   // This iterator is used to save some amount of linear traversal when seeking
322   // to a specific UTF-32 offset. Each function that uses it could just create
323   // a CharacterIterator starting at the beginning of the text and traverse
324   // forward from there.
325   CharacterIterator offset_iterator_;
326 
327   // The start and end indices are used to track the positions of current
328   // term.
329   int term_start_index_;
330   int term_end_index_exclusive_;
331 };
332 
333 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>>
Create(std::string && locale)334 IcuLanguageSegmenter::Create(std::string&& locale) {
335   UErrorCode status = U_ZERO_ERROR;
336   UBreakIterator* break_iterator = ubrk_open(
337       UBRK_WORD, locale.c_str(), /*text=*/nullptr, /*textLength=*/0, &status);
338   if (U_FAILURE(status) || break_iterator == nullptr) {
339     return absl_ports::AbortedError(
340         "Unable to create ICU break_iterator for language segmentation");
341   }
342   return std::unique_ptr<IcuLanguageSegmenter>(
343       new IcuLanguageSegmenter(std::move(locale), break_iterator));
344 }
345 
ProduceBreakIterator() const346 UBreakIterator* IcuLanguageSegmenter::ProduceBreakIterator() const {
347   UBreakIterator* itr = nullptr;
348   {
349     absl_ports::unique_lock l(&mutex_);
350     if (cached_break_iterator_ != nullptr) {
351       itr = cached_break_iterator_;
352       cached_break_iterator_ = nullptr;
353     }
354   }
355   if (itr == nullptr) {
356     UErrorCode status = U_ZERO_ERROR;
357     itr = ubrk_open(UBRK_WORD, locale_.c_str(), /*text=*/nullptr,
358                     /*textLength=*/0, &status);
359     if (U_FAILURE(status)) {
360       itr = nullptr;
361     }
362   }
363   return itr;
364 }
365 
ReturnBreakIterator(UBreakIterator * itr) const366 void IcuLanguageSegmenter::ReturnBreakIterator(UBreakIterator* itr) const {
367   {
368     absl_ports::unique_lock l(&mutex_);
369     if (cached_break_iterator_ == nullptr) {
370       cached_break_iterator_ = itr;
371       return;
372     }
373   }
374   ubrk_close(itr);
375 }
376 
377 libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
Segment(const std::string_view text) const378 IcuLanguageSegmenter::Segment(const std::string_view text) const {
379   return IcuLanguageSegmenterIterator::Create(this, ProduceBreakIterator(),
380                                               text, locale_);
381 }
382 
383 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetAllTerms(const std::string_view text) const384 IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
385   ICING_ASSIGN_OR_RETURN(
386       std::unique_ptr<LanguageSegmenter::Iterator> iterator,
387       Segment(text));
388   std::vector<std::string_view> terms;
389   while (iterator->Advance()) {
390     terms.push_back(iterator->GetTerm());
391   }
392   return terms;
393 }
394 
395 }  // namespace lib
396 }  // namespace icing
397