1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 19 20 #include <string> 21 #include <vector> 22 23 #include "annotator/types.h" 24 #include "utils/base/integral_types.h" 25 #include "utils/codepoint-range.h" 26 #include "utils/tokenizer_generated.h" 27 #include "utils/utf8/unicodetext.h" 28 #include "utils/utf8/unilib.h" 29 30 namespace libtextclassifier3 { 31 32 const int kInvalidScript = -1; 33 const int kUnknownScript = -2; 34 35 // Tokenizer splits the input string into a sequence of tokens, according to 36 // the configuration. 37 class Tokenizer { 38 public: 39 // `codepoint_ranges`: Codepoint ranges that determine how different 40 // codepoints are tokenized. The ranges must not overlap. 41 // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which 42 // tokens should be re-tokenized with the internal tokenizer in the mixed 43 // tokenization mode. 44 // `split_on_script_change`: Whether to consider a change of codepoint script 45 // in a sequence of characters as a token boundary. If True, will treat 46 // script change as a token boundary. 47 // `icu_preserve_whitespace_tokens`: If true, will include empty tokens in the 48 // output (in the ICU tokenization mode). 49 // `preserve_floating_numbers`: If true (default), will keep dots between 50 // digits together, not making separate tokens (in the LETTER_DIGIT 51 // tokenization mode). 52 Tokenizer( 53 const TokenizationType type, const UniLib* unilib, 54 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 55 const std::vector<const CodepointRange*>& 56 internal_tokenizer_codepoint_ranges, 57 const bool split_on_script_change, 58 const bool icu_preserve_whitespace_tokens, 59 const bool preserve_floating_numbers); 60 Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)61 Tokenizer( 62 const TokenizationType type, const UniLib* unilib, 63 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 64 const std::vector<const CodepointRange*>& 65 internal_tokenizer_codepoint_ranges, 66 const bool split_on_script_change, 67 const bool icu_preserve_whitespace_tokens) 68 : Tokenizer(type, unilib, codepoint_ranges, 69 internal_tokenizer_codepoint_ranges, split_on_script_change, 70 icu_preserve_whitespace_tokens, 71 /*preserve_floating_numbers=*/true) {} 72 Tokenizer(const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const bool split_on_script_change)73 Tokenizer( 74 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 75 const bool split_on_script_change) 76 : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr, 77 codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{}, 78 split_on_script_change, 79 /*icu_preserve_whitespace_tokens=*/false, 80 /*preserve_floating_numbers=*/true) {} 81 82 // Describes the type of tokens used in the NumberTokenizer. 83 enum NumberTokenType { 84 INVALID_TOKEN_TYPE, 85 NUMERICAL, 86 TERM, 87 WHITESPACE, 88 SEPARATOR, 89 NOT_SET 90 }; 91 92 // Tokenizes the input string using the selected tokenization method. 93 std::vector<Token> Tokenize(const std::string& text) const; 94 95 // Same as above but takes UnicodeText. 96 std::vector<Token> Tokenize(const UnicodeText& text_unicode) const; 97 98 protected: 99 // Finds the tokenization codepoint range config for given codepoint. 100 // Internally uses binary search so should be O(log(# of codepoint_ranges)). 101 const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const; 102 103 // Finds the role and script for given codepoint. If not found, DEFAULT_ROLE 104 // and kUnknownScript are assigned. 105 void GetScriptAndRole(char32 codepoint, 106 TokenizationCodepointRange_::Role* role, 107 int* script) const; 108 109 // Tokenizes a substring of the unicode string, appending the resulting tokens 110 // to the output vector. The resulting tokens have bounds relative to the full 111 // string. Does nothing if the start of the span is negative. 112 void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span, 113 std::vector<Token>* result) const; 114 115 std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const; 116 117 // Takes the result of ICU tokenization and retokenizes stretches of tokens 118 // made of a specific subset of characters using the internal tokenizer. 119 void InternalRetokenize(const UnicodeText& unicode_text, 120 std::vector<Token>* tokens) const; 121 122 // Tokenizes the input text using ICU tokenizer. 123 bool ICUTokenize(const UnicodeText& context_unicode, 124 std::vector<Token>* result) const; 125 126 // Tokenizes the input in number, word and separator tokens. 127 bool NumberTokenize(const UnicodeText& text_unicode, 128 std::vector<Token>* result) const; 129 130 private: 131 const TokenizationType type_; 132 133 const UniLib* unilib_; 134 135 // Codepoint ranges that determine how different codepoints are tokenized. 136 // The ranges must not overlap. 137 std::vector<std::unique_ptr<const TokenizationCodepointRangeT>> 138 codepoint_ranges_; 139 140 // Codepoint ranges that define which tokens (consisting of which codepoints) 141 // should be re-tokenized with the internal tokenizer in the mixed 142 // tokenization mode. 143 // NOTE: Must be sorted. 144 std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_; 145 146 // If true, tokens will be additionally split when the codepoint's script_id 147 // changes. 148 const bool split_on_script_change_; 149 150 const bool icu_preserve_whitespace_tokens_; 151 const bool preserve_floating_numbers_; 152 }; 153 154 } // namespace libtextclassifier3 155 156 #endif // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 157