• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #include "smartselect/tokenizer.h"
18 
19 #include "util/strings/utf8.h"
20 #include "util/utf8/unicodetext.h"
21 
22 namespace libtextclassifier {
23 
PrepareTokenizationCodepointRanges(const std::vector<TokenizationCodepointRange> & codepoint_range_configs)24 void Tokenizer::PrepareTokenizationCodepointRanges(
25     const std::vector<TokenizationCodepointRange>& codepoint_range_configs) {
26   codepoint_ranges_.clear();
27   codepoint_ranges_.reserve(codepoint_range_configs.size());
28   for (const TokenizationCodepointRange& range : codepoint_range_configs) {
29     codepoint_ranges_.push_back(
30         CodepointRange(range.start(), range.end(), range.role()));
31   }
32 
33   std::sort(codepoint_ranges_.begin(), codepoint_ranges_.end(),
34             [](const CodepointRange& a, const CodepointRange& b) {
35               return a.start < b.start;
36             });
37 }
38 
FindTokenizationRole(int codepoint) const39 TokenizationCodepointRange::Role Tokenizer::FindTokenizationRole(
40     int codepoint) const {
41   auto it = std::lower_bound(codepoint_ranges_.begin(), codepoint_ranges_.end(),
42                              codepoint,
43                              [](const CodepointRange& range, int codepoint) {
44                                // This function compares range with the
45                                // codepoint for the purpose of finding the first
46                                // greater or equal range. Because of the use of
47                                // std::lower_bound it needs to return true when
48                                // range < codepoint; the first time it will
49                                // return false the lower bound is found and
50                                // returned.
51                                //
52                                // It might seem weird that the condition is
53                                // range.end <= codepoint here but when codepoint
54                                // == range.end it means it's actually just
55                                // outside of the range, thus the range is less
56                                // than the codepoint.
57                                return range.end <= codepoint;
58                              });
59   if (it != codepoint_ranges_.end() && it->start <= codepoint &&
60       it->end > codepoint) {
61     return it->role;
62   } else {
63     return TokenizationCodepointRange::DEFAULT_ROLE;
64   }
65 }
66 
Tokenize(const std::string & utf8_text) const67 std::vector<Token> Tokenizer::Tokenize(const std::string& utf8_text) const {
68   UnicodeText context_unicode = UTF8ToUnicodeText(utf8_text, /*do_copy=*/false);
69 
70   std::vector<Token> result;
71   Token new_token("", 0, 0);
72   int codepoint_index = 0;
73   for (auto it = context_unicode.begin(); it != context_unicode.end();
74        ++it, ++codepoint_index) {
75     TokenizationCodepointRange::Role role = FindTokenizationRole(*it);
76     if (role & TokenizationCodepointRange::SPLIT_BEFORE) {
77       if (!new_token.value.empty()) {
78         result.push_back(new_token);
79       }
80       new_token = Token("", codepoint_index, codepoint_index);
81     }
82     if (!(role & TokenizationCodepointRange::DISCARD_CODEPOINT)) {
83       new_token.value += std::string(
84           it.utf8_data(),
85           it.utf8_data() + GetNumBytesForNonZeroUTF8Char(it.utf8_data()));
86       ++new_token.end;
87     }
88     if (role & TokenizationCodepointRange::SPLIT_AFTER) {
89       if (!new_token.value.empty()) {
90         result.push_back(new_token);
91       }
92       new_token = Token("", codepoint_index + 1, codepoint_index + 1);
93     }
94   }
95   if (!new_token.value.empty()) {
96     result.push_back(new_token);
97   }
98 
99   return result;
100 }
101 
102 }  // namespace libtextclassifier
103