• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/tokenizer.h"
18 
19 #include <algorithm>
20 
21 #include "utils/base/logging.h"
22 #include "utils/base/macros.h"
23 #include "utils/strings/utf8.h"
24 #include "utils/utf8/unicodetext.h"
25 
26 namespace libtextclassifier3 {
27 
Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens,const bool preserve_floating_numbers)28 Tokenizer::Tokenizer(
29     const TokenizationType type, const UniLib* unilib,
30     const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
31     const std::vector<const CodepointRange*>&
32         internal_tokenizer_codepoint_ranges,
33     const bool split_on_script_change,
34     const bool icu_preserve_whitespace_tokens,
35     const bool preserve_floating_numbers)
36     : type_(type),
37       unilib_(unilib),
38       split_on_script_change_(split_on_script_change),
39       icu_preserve_whitespace_tokens_(icu_preserve_whitespace_tokens),
40       preserve_floating_numbers_(preserve_floating_numbers) {
41   for (const TokenizationCodepointRange* range : codepoint_ranges) {
42     codepoint_ranges_.emplace_back(range->UnPack());
43   }
44 
45   std::sort(codepoint_ranges_.begin(), codepoint_ranges_.end(),
46             [](const std::unique_ptr<const TokenizationCodepointRangeT>& a,
47                const std::unique_ptr<const TokenizationCodepointRangeT>& b) {
48               return a->start < b->start;
49             });
50 
51   SortCodepointRanges(internal_tokenizer_codepoint_ranges,
52                       &internal_tokenizer_codepoint_ranges_);
53   if (type_ == TokenizationType_MIXED && split_on_script_change) {
54     TC3_LOG(ERROR) << "The option `split_on_script_change` is unavailable for "
55                       "the selected tokenizer type (mixed).";
56   }
57 }
58 
FindTokenizationRange(int codepoint) const59 const TokenizationCodepointRangeT* Tokenizer::FindTokenizationRange(
60     int codepoint) const {
61   auto it = std::lower_bound(
62       codepoint_ranges_.begin(), codepoint_ranges_.end(), codepoint,
63       [](const std::unique_ptr<const TokenizationCodepointRangeT>& range,
64          int codepoint) {
65         // This function compares range with the codepoint for the purpose of
66         // finding the first greater or equal range. Because of the use of
67         // std::lower_bound it needs to return true when range < codepoint;
68         // the first time it will return false the lower bound is found and
69         // returned.
70         //
71         // It might seem weird that the condition is range.end <= codepoint
72         // here but when codepoint == range.end it means it's actually just
73         // outside of the range, thus the range is less than the codepoint.
74         return range->end <= codepoint;
75       });
76   if (it != codepoint_ranges_.end() && (*it)->start <= codepoint &&
77       (*it)->end > codepoint) {
78     return it->get();
79   } else {
80     return nullptr;
81   }
82 }
83 
GetScriptAndRole(char32 codepoint,TokenizationCodepointRange_::Role * role,int * script) const84 void Tokenizer::GetScriptAndRole(char32 codepoint,
85                                  TokenizationCodepointRange_::Role* role,
86                                  int* script) const {
87   const TokenizationCodepointRangeT* range = FindTokenizationRange(codepoint);
88   if (range) {
89     *role = range->role;
90     *script = range->script_id;
91   } else {
92     *role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
93     *script = kUnknownScript;
94   }
95 }
96 
Tokenize(const std::string & text) const97 std::vector<Token> Tokenizer::Tokenize(const std::string& text) const {
98   UnicodeText text_unicode = UTF8ToUnicodeText(text, /*do_copy=*/false);
99   return Tokenize(text_unicode);
100 }
101 
Tokenize(const UnicodeText & text_unicode) const102 std::vector<Token> Tokenizer::Tokenize(const UnicodeText& text_unicode) const {
103   switch (type_) {
104     case TokenizationType_INTERNAL_TOKENIZER:
105       return InternalTokenize(text_unicode);
106     case TokenizationType_ICU:
107       TC3_FALLTHROUGH_INTENDED;
108     case TokenizationType_MIXED: {
109       std::vector<Token> result;
110       if (!ICUTokenize(text_unicode, &result)) {
111         return {};
112       }
113       if (type_ == TokenizationType_MIXED) {
114         InternalRetokenize(text_unicode, &result);
115       }
116       return result;
117     }
118     case TokenizationType_LETTER_DIGIT: {
119       std::vector<Token> result;
120       if (!NumberTokenize(text_unicode, &result)) {
121         return {};
122       }
123       return result;
124     }
125     default:
126       TC3_LOG(ERROR) << "Unknown tokenization type specified. Using internal.";
127       return InternalTokenize(text_unicode);
128   }
129 }
130 
AppendCodepointToToken(UnicodeText::const_iterator it,Token * token)131 void AppendCodepointToToken(UnicodeText::const_iterator it, Token* token) {
132   token->value += std::string(
133       it.utf8_data(), it.utf8_data() + GetNumBytesForUTF8Char(it.utf8_data()));
134 }
135 
// Tokenizes `text_unicode` using the configured codepoint-range table.
// Each codepoint's role decides whether it forces a split before it
// (SPLIT_BEFORE), after it (SPLIT_AFTER), and whether it is dropped from the
// token text (DISCARD_CODEPOINT). When `split_on_script_change_` is set, a
// change in script id also starts a new token. Token start/end are codepoint
// indices into the input; empty tokens are never emitted.
std::vector<Token> Tokenizer::InternalTokenize(
    const UnicodeText& text_unicode) const {
  std::vector<Token> result;
  Token new_token("", 0, 0);
  int codepoint_index = 0;

  int last_script = kInvalidScript;
  for (auto it = text_unicode.begin(); it != text_unicode.end();
       ++it, ++codepoint_index) {
    TokenizationCodepointRange_::Role role;
    int script;
    GetScriptAndRole(*it, &role, &script);

    if (role & TokenizationCodepointRange_::Role_SPLIT_BEFORE ||
        (split_on_script_change_ && last_script != kInvalidScript &&
         last_script != script)) {
      // Close the accumulated token (if non-empty) and start a fresh one at
      // the current codepoint.
      if (!new_token.value.empty()) {
        result.push_back(new_token);
      }
      new_token = Token("", codepoint_index, codepoint_index);
    }
    if (!(role & TokenizationCodepointRange_::Role_DISCARD_CODEPOINT)) {
      // Keep this codepoint: extend the token span and its text.
      new_token.end += 1;
      AppendCodepointToToken(it, &new_token);
    }
    if (role & TokenizationCodepointRange_::Role_SPLIT_AFTER) {
      // Close the token including this codepoint; the next token starts
      // just after it.
      if (!new_token.value.empty()) {
        result.push_back(new_token);
      }
      new_token = Token("", codepoint_index + 1, codepoint_index + 1);
    }

    last_script = script;
  }
  // Flush the trailing token, if any text was accumulated.
  if (!new_token.value.empty()) {
    result.push_back(new_token);
  }

  return result;
}
176 
TokenizeSubstring(const UnicodeText & unicode_text,CodepointSpan span,std::vector<Token> * result) const177 void Tokenizer::TokenizeSubstring(const UnicodeText& unicode_text,
178                                   CodepointSpan span,
179                                   std::vector<Token>* result) const {
180   if (span.first < 0) {
181     // There is no span to tokenize.
182     return;
183   }
184 
185   // Extract the substring.
186   UnicodeText text = UnicodeText::Substring(unicode_text, span.first,
187                                             span.second, /*do_copy=*/false);
188 
189   // Run the tokenizer and update the token bounds to reflect the offset of the
190   // substring.
191   std::vector<Token> tokens = InternalTokenize(text);
192 
193   // Avoids progressive capacity increases in the for loop.
194   result->reserve(result->size() + tokens.size());
195   for (Token& token : tokens) {
196     token.start += span.first;
197     token.end += span.first;
198     result->emplace_back(std::move(token));
199   }
200 }
201 
// Post-processes ICU tokens: maximal runs of consecutive tokens whose every
// codepoint lies inside `internal_tokenizer_codepoint_ranges_` are merged
// into one span and re-tokenized with the internal tokenizer; all other
// tokens pass through unchanged. Replaces `*tokens` with the result.
void Tokenizer::InternalRetokenize(const UnicodeText& unicode_text,
                                   std::vector<Token>* tokens) const {
  std::vector<Token> result;
  // Codepoint span (into `unicode_text`) of the currently open run of
  // re-tokenizable tokens; first == -1 means no run is open.
  CodepointSpan span(-1, -1);
  for (Token& token : *tokens) {
    const UnicodeText unicode_token_value =
        UTF8ToUnicodeText(token.value, /*do_copy=*/false);
    bool should_retokenize = true;
    for (const int codepoint : unicode_token_value) {
      // One codepoint outside the configured ranges disqualifies the token.
      if (!IsCodepointInRanges(codepoint,
                               internal_tokenizer_codepoint_ranges_)) {
        should_retokenize = false;
        break;
      }
    }

    if (should_retokenize) {
      // Open a run at this token, or extend the current one to its end.
      if (span.first < 0) {
        span.first = token.start;
      }
      span.second = token.end;
    } else {
      // Flush the open run (TokenizeSubstring is a no-op when span.first is
      // negative), then keep this token verbatim.
      TokenizeSubstring(unicode_text, span, &result);
      span.first = -1;
      result.emplace_back(std::move(token));
    }
  }
  // Flush a run that extends to the end of the token list.
  TokenizeSubstring(unicode_text, span, &result);

  *tokens = std::move(result);
}
233 
// Tokenizes `context_unicode` at the boundaries reported by the UniLib ICU
// break iterator, appending tokens to `result`. Whitespace-only tokens are
// dropped unless `icu_preserve_whitespace_tokens_` is set. Returns false if
// the break iterator cannot be created or reports an out-of-bounds break.
bool Tokenizer::ICUTokenize(const UnicodeText& context_unicode,
                            std::vector<Token>* result) const {
  std::unique_ptr<UniLib::BreakIterator> break_iterator =
      unilib_->CreateBreakIterator(context_unicode);
  if (!break_iterator) {
    return false;
  }
  const int context_unicode_size = context_unicode.size_codepoints();
  int last_unicode_index = 0;
  int unicode_index = 0;
  auto token_begin_it = context_unicode.begin();
  while ((unicode_index = break_iterator->Next()) !=
         UniLib::BreakIterator::kDone) {
    // Each token spans [last_unicode_index, unicode_index) in codepoints.
    const int token_length = unicode_index - last_unicode_index;
    // Defensive bounds check: bail out if the iterator reports a break
    // position past the end of the input.
    if (token_length + last_unicode_index > context_unicode_size) {
      return false;
    }

    auto token_end_it = token_begin_it;
    std::advance(token_end_it, token_length);
    TC3_CHECK(token_end_it <= context_unicode.end());

    // Determine if the whole token is whitespace.
    bool is_whitespace = true;
    for (auto char_it = token_begin_it; char_it < token_end_it; ++char_it) {
      if (!unilib_->IsWhitespace(*char_it)) {
        is_whitespace = false;
        break;
      }
    }

    const std::string token =
        context_unicode.UTF8Substring(token_begin_it, token_end_it);

    if (!is_whitespace || icu_preserve_whitespace_tokens_) {
      result->push_back(Token(token, last_unicode_index, unicode_index,
                              /*is_padding=*/false, is_whitespace));
    }

    // Advance to the start of the next token.
    last_unicode_index = unicode_index;
    token_begin_it = token_end_it;
  }

  return true;
}
279 
// Letter/digit tokenization: splits `text_unicode` into runs of digits
// (NUMERICAL), letters (TERM) and whitespace (WHITESPACE); any other
// codepoint becomes a single-codepoint separator token. When
// `preserve_floating_numbers_` is set, a dot between two digits is kept
// inside the numeric token (so "3.14" stays one token). Always returns true.
bool Tokenizer::NumberTokenize(const UnicodeText& text_unicode,
                               std::vector<Token>* result) const {
  Token new_token("", 0, 0);
  NumberTokenType current_token_type = NOT_SET;
  int codepoint_index = 0;

  // Emits the in-progress token if it has any text.
  auto PushToken = [&new_token, result]() {
    if (!new_token.value.empty()) {
      result->push_back(new_token);
    }
  };

  // Appends the codepoint at `it` to the current token, first closing the
  // token and starting a new one if the run type changes.
  auto MaybeResetTokenAndAddChar =
      [&new_token, PushToken, &current_token_type](
          int codepoint_index, NumberTokenType token_type,
          UnicodeText::const_iterator it, bool is_whitespace = false) {
        if (current_token_type != token_type) {
          PushToken();
          new_token = Token("", codepoint_index, codepoint_index,
                            /*is_padding=*/false, is_whitespace);
        }
        new_token.end += 1;
        AppendCodepointToToken(it, &new_token);
        current_token_type = token_type;
      };

  // Closes the current token, emits the codepoint at `it` as a standalone
  // separator token, and resets the run state.
  auto FinishTokenAndAddSeparator =
      [&new_token, result, &current_token_type, PushToken](
          int codepoint_index, UnicodeText::const_iterator it) {
        PushToken();

        result->emplace_back("", codepoint_index, codepoint_index + 1);
        AppendCodepointToToken(it, &result->back());

        new_token = Token("", codepoint_index + 1, codepoint_index + 1);
        current_token_type = NOT_SET;
      };

  for (auto it = text_unicode.begin(); it != text_unicode.end();
       ++it, ++codepoint_index) {
    if (unilib_->IsDigit(*it)) {
      MaybeResetTokenAndAddChar(codepoint_index, NUMERICAL, it);
    } else if (unilib_->IsLetter(*it)) {
      MaybeResetTokenAndAddChar(codepoint_index, TERM, it);
    } else if (unilib_->IsWhitespace(*it)) {
      MaybeResetTokenAndAddChar(codepoint_index, WHITESPACE, it,
                                /*is_whitespace=*/true);
    } else if (unilib_->IsDot(*it) && preserve_floating_numbers_) {
      // A dot joins the numeric token only when sandwiched between digits.
      auto it_next = std::next(it);
      if (current_token_type == NUMERICAL && it_next != text_unicode.end() &&
          unilib_->IsDigit(*it_next)) {
        new_token.end += 1;
        AppendCodepointToToken(it, &new_token);
      } else {
        // If the current token is not a number or dot at the end or followed
        // by a non digit => separate token
        FinishTokenAndAddSeparator(codepoint_index, it);
      }
    } else {
      FinishTokenAndAddSeparator(codepoint_index, it);
    }
  }
  // Flush the final in-progress token.
  PushToken();

  return true;
}
346 
347 }  // namespace libtextclassifier3
348