• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/tokenization/icu/icu-language-segmenter.h"
16 
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/legacy/core/icing-string-util.h"
28 #include "icing/util/character-iterator.h"
29 #include "icing/util/i18n-utils.h"
30 #include "icing/util/status-macros.h"
31 #include "unicode/ubrk.h"
32 #include "unicode/uchar.h"
33 #include "unicode/umachine.h"
34 
35 namespace icing {
36 namespace lib {
37 
38 namespace {
39 constexpr char kASCIISpace = ' ';
40 }  // namespace
41 
42 class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
43  public:
44   // Factory function to create a segment iterator based on the given locale.
45   //
46   // Returns:
47   //   An iterator on success
48   //   INTERNAL_ERROR if unable to create
49   static libtextclassifier3::StatusOr<
50       std::unique_ptr<LanguageSegmenter::Iterator>>
Create(std::string_view text,std::string_view locale)51   Create(std::string_view text, std::string_view locale) {
52     std::unique_ptr<IcuLanguageSegmenterIterator> iterator(
53         new IcuLanguageSegmenterIterator(text, locale));
54     if (iterator->Initialize()) {
55       return iterator;
56     }
57     return absl_ports::InternalError("Unable to create a term iterator");
58   }
59 
~IcuLanguageSegmenterIterator()60   ~IcuLanguageSegmenterIterator() {
61     ubrk_close(break_iterator_);
62     utext_close(&u_text_);
63   }
64 
65   // Advances to the next term. Returns false if it has reached the end.
Advance()66   bool Advance() override {
67     // Prerequisite check
68     if (term_end_index_exclusive_ == UBRK_DONE) {
69       return false;
70     }
71 
72     if (term_end_index_exclusive_ == 0) {
73       // First Advance() call
74       term_start_index_ = ubrk_first(break_iterator_);
75     } else {
76       term_start_index_ = term_end_index_exclusive_;
77     }
78     term_end_index_exclusive_ = ubrk_next(break_iterator_);
79 
80     // Reached the end
81     if (term_end_index_exclusive_ == UBRK_DONE) {
82       MarkAsDone();
83       return false;
84     }
85 
86     if (!IsValidSegment()) {
87       return Advance();
88     }
89     return true;
90   }
91 
92   // Returns the current term. It can be called only when Advance() returns
93   // true.
GetTerm() const94   std::string_view GetTerm() const override {
95     int term_length = term_end_index_exclusive_ - term_start_index_;
96     if (term_end_index_exclusive_ == UBRK_DONE) {
97       term_length = 0;
98     } else if (text_[term_start_index_] == kASCIISpace) {
99       // Rule 3: multiple continuous whitespaces are treated as one.
100       term_length = 1;
101     }
102     return text_.substr(term_start_index_, term_length);
103   }
104 
CalculateTermStart()105   libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
106       override {
107     if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
108       return absl_ports::AbortedError(
109           "Could not retrieve valid utf8 character!");
110     }
111     return offset_iterator_;
112   }
113 
CalculateTermEndExclusive()114   libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
115       override {
116     if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) {
117       return absl_ports::AbortedError(
118           "Could not retrieve valid utf8 character!");
119     }
120     return offset_iterator_;
121   }
122 
ResetToTermStartingAfterUtf32(int32_t offset)123   libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
124       int32_t offset) override {
125     if (offset < 0) {
126       // Very simple. The first term start after a negative offset is the first
127       // term. So just reset to start and Advance.
128       return ResetToStartUtf32();
129     }
130 
131     // 1. Find the unicode character that contains the byte at offset.
132     if (!offset_iterator_.MoveToUtf32(offset)) {
133       // An error occurred. Mark as DONE
134       if (offset_iterator_.utf8_index() != text_.length()) {
135         // We returned false for some reason other than hitting the end. This is
136         // a real error. Just return.
137         MarkAsDone();
138         return absl_ports::AbortedError(
139             "Could not retrieve valid utf8 character!");
140       }
141     }
142     if (offset_iterator_.utf8_index() == text_.length()) {
143       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
144           "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
145           "bounds of string of length %zu",
146           offset_iterator_.utf32_index(), offset_iterator_.utf8_index(),
147           text_.length()));
148     }
149 
150     // 2. We've got the unicode character containing byte offset. Now, we need
151     // to point to the segment that starts after this character.
152     int following_utf8_index =
153         ubrk_following(break_iterator_, offset_iterator_.utf8_index());
154     if (following_utf8_index == UBRK_DONE) {
155       MarkAsDone();
156       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
157           "No segments begin after provided offset %d.", offset));
158     }
159     term_end_index_exclusive_ = following_utf8_index;
160 
161     // 3. The term_end_exclusive_ points to the start of the term that we want
162     // to return. We need to Advance so that term_start_ will now point to this
163     // term.
164     if (!Advance()) {
165       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
166           "No segments begin after provided offset %d.", offset));
167     }
168     if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
169       return absl_ports::AbortedError(
170           "Could not retrieve valid utf8 character!");
171     }
172     return offset_iterator_.utf32_index();
173   }
174 
ResetToTermEndingBeforeUtf32(int32_t offset)175   libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
176       int32_t offset) override {
177     if (offset < 0) {
178       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
179           "Illegal offset provided! Offset %d is not within bounds of string "
180           "of length %zu",
181           offset, text_.length()));
182     }
183 
184     if (!offset_iterator_.MoveToUtf32(offset)) {
185       // An error occurred. Mark as DONE
186       if (offset_iterator_.utf8_index() != text_.length()) {
187         // We returned false for some reason other than hitting the end. This is
188         // a real error. Just return.
189         MarkAsDone();
190         return absl_ports::AbortedError(
191             "Could not retrieve valid utf8 character!");
192       }
193       // If it returned false because we hit the end. Then that's fine. We'll
194       // just treat it as if the request was for the end.
195     }
196 
197     // 2. We've got the unicode character containing byte offset. Now, we need
198     // to point to the segment that ends before this character.
199     int starting_utf8_index =
200         ubrk_preceding(break_iterator_, offset_iterator_.utf8_index());
201     if (starting_utf8_index == UBRK_DONE) {
202       // Rewind the end indices.
203       MarkAsDone();
204       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
205           "No segments end before provided offset %d.", offset));
206     }
207     term_start_index_ = starting_utf8_index;
208 
209     // 3. We've correctly set the start index and the iterator currently points
210     // to that position. Now we need to find the correct end position and
211     // advance the iterator to that position.
212     int ending_utf8_index = ubrk_next(break_iterator_);
213     if (ending_utf8_index == UBRK_DONE) {
214       // This shouldn't ever happen.
215       MarkAsDone();
216       return absl_ports::AbortedError(IcingStringUtil::StringPrintf(
217           "No segments end before provided offset %d.", offset));
218     }
219     term_end_index_exclusive_ = ending_utf8_index;
220 
221     // 4. The start and end indices point to a segment, but we need to ensure
222     // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
223     // need a segment prior to this one.
224     CharacterIterator term_start_iterator = offset_iterator_;
225     if (!term_start_iterator.MoveToUtf8(term_start_index_)) {
226       return absl_ports::AbortedError(
227           "Could not retrieve valid utf8 character!");
228     }
229     if (term_end_index_exclusive_ > offset_iterator_.utf8_index() ||
230         !IsValidSegment()) {
231       return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
232     }
233     return term_start_iterator.utf32_index();
234   }
235 
ResetToStartUtf32()236   libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
237     term_start_index_ = 0;
238     term_end_index_exclusive_ = 0;
239     if (!Advance()) {
240       return absl_ports::NotFoundError(
241           "Unable to find any valid terms in text.");
242     }
243     if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
244       return absl_ports::AbortedError(
245           "Could not retrieve valid utf8 character!");
246     }
247     return offset_iterator_.utf32_index();
248   }
249 
250  private:
IcuLanguageSegmenterIterator(std::string_view text,std::string_view locale)251   explicit IcuLanguageSegmenterIterator(std::string_view text,
252                                         std::string_view locale)
253       : break_iterator_(nullptr),
254         text_(text),
255         locale_(locale),
256         u_text_(UTEXT_INITIALIZER),
257         offset_iterator_(text),
258         term_start_index_(0),
259         term_end_index_exclusive_(0) {}
260 
261   // Returns true on success
Initialize()262   bool Initialize() {
263     UErrorCode status = U_ZERO_ERROR;
264     utext_openUTF8(&u_text_, text_.data(), text_.length(), &status);
265     break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
266                                 /*textLength=*/0, &status);
267     ubrk_setUText(break_iterator_, &u_text_, &status);
268     return !U_FAILURE(status);
269   }
270 
ResetToTermStartingBefore(int32_t offset)271   libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) {
272     term_start_index_ = ubrk_preceding(break_iterator_, offset);
273     if (term_start_index_ == UBRK_DONE) {
274       MarkAsDone();
275       return absl_ports::NotFoundError("");
276     }
277     term_end_index_exclusive_ = ubrk_next(break_iterator_);
278     if (term_end_index_exclusive_ == UBRK_DONE) {
279       MarkAsDone();
280       return absl_ports::NotFoundError("");
281     }
282     return libtextclassifier3::Status::OK;
283   }
284 
285   // Ensures that all members are consistent with the 'Done' state.
286   // In the 'Done' state, term_start_index_ will point to the first character
287   // and term_end_index_exclusive_ will be marked with the kDone value.
288   // break_iterator_ may be in any state.
MarkAsDone()289   void MarkAsDone() {
290     term_end_index_exclusive_ = UBRK_DONE;
291     term_start_index_ = 0;
292   }
293 
IsValidSegment() const294   bool IsValidSegment() const {
295     // Rule 1: all ASCII terms will be returned.
296     // We know it's a ASCII term by checking the first char.
297     if (i18n_utils::IsAscii(text_[term_start_index_])) {
298       return true;
299     }
300 
301     UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
302                                                term_start_index_);
303     // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
304     // We know it's an alphabetic term by checking the first unicode character.
305     if (u_isUAlphabetic(uchar32)) {
306       return true;
307     }
308     return false;
309   }
310 
311   // The underlying class that does the segmentation, ubrk_close() must be
312   // called after using.
313   UBreakIterator* break_iterator_;
314 
315   // Text to be segmented
316   std::string_view text_;
317 
318   // Locale of the input text, used to help segment more accurately. If a
319   // wrong locale is set, text could probably still be segmented correctly
320   // because the default break iterator behavior is used for most locales.
321   std::string_view locale_;
322 
323   // A thin wrapper around the input UTF8 text, needed by break_iterator_.
324   // utext_close() must be called after using.
325   UText u_text_;
326 
327   // Offset iterator. This iterator is not guaranteed to point to any particular
328   // character, but is guaranteed to point to a valid UTF character sequence.
329   //
330   // This iterator is used to save some amount of linear traversal when seeking
331   // to a specific UTF-32 offset. Each function that uses it could just create
332   // a CharacterIterator starting at the beginning of the text and traverse
333   // forward from there.
334   CharacterIterator offset_iterator_;
335 
336   // The start and end indices are used to track the positions of current
337   // term.
338   int term_start_index_;
339   int term_end_index_exclusive_;
340 };
341 
IcuLanguageSegmenter(std::string locale)342 IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale)
343     : locale_(std::move(locale)) {}
344 
345 libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
Segment(const std::string_view text) const346 IcuLanguageSegmenter::Segment(const std::string_view text) const {
347   return IcuLanguageSegmenterIterator::Create(text, locale_);
348 }
349 
350 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetAllTerms(const std::string_view text) const351 IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
352   ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
353                          Segment(text));
354   std::vector<std::string_view> terms;
355   while (iterator->Advance()) {
356     terms.push_back(iterator->GetTerm());
357   }
358   return terms;
359 }
360 
361 }  // namespace lib
362 }  // namespace icing
363