1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 19 20 #include <string> 21 #include <vector> 22 23 #include "annotator/types.h" 24 #include "utils/base/integral_types.h" 25 #include "utils/codepoint-range.h" 26 #include "utils/tokenizer_generated.h" 27 #include "utils/utf8/unicodetext.h" 28 #include "utils/utf8/unilib.h" 29 30 namespace libtextclassifier3 { 31 32 const int kInvalidScript = -1; 33 const int kUnknownScript = -2; 34 35 // Tokenizer splits the input string into a sequence of tokens, according to 36 // the configuration. 37 class Tokenizer { 38 public: 39 // `codepoint_ranges`: Codepoint ranges that determine how different 40 // codepoints are tokenized. The ranges must not overlap. 41 // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which 42 // tokens should be re-tokenized with the internal tokenizer in the mixed 43 // tokenization mode. 44 // `split_on_script_change`: Whether to consider a change of codepoint script 45 // in a sequence of characters as a token boundary. If True, will treat 46 // script change as a token boundary. 47 // `icu_preserve_whitespace_tokens`: If true, will include empty tokens in the 48 // output (in the ICU tokenization mode). 49 Tokenizer( 50 const TokenizationType type, const UniLib* unilib, 51 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 52 const std::vector<const CodepointRange*>& 53 internal_tokenizer_codepoint_ranges, 54 const bool split_on_script_change, 55 const bool icu_preserve_whitespace_tokens); 56 Tokenizer(const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const bool split_on_script_change)57 Tokenizer( 58 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 59 const bool split_on_script_change) 60 : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr, 61 codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{}, 62 split_on_script_change, 63 /*icu_preserve_whitespace_tokens=*/false) {} 64 65 // Tokenizes the input string using the selected tokenization method. 66 std::vector<Token> Tokenize(const std::string& text) const; 67 68 // Same as above but takes UnicodeText. 69 std::vector<Token> Tokenize(const UnicodeText& text_unicode) const; 70 71 protected: 72 // Finds the tokenization codepoint range config for given codepoint. 73 // Internally uses binary search so should be O(log(# of codepoint_ranges)). 74 const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const; 75 76 // Finds the role and script for given codepoint. If not found, DEFAULT_ROLE 77 // and kUnknownScript are assigned. 78 void GetScriptAndRole(char32 codepoint, 79 TokenizationCodepointRange_::Role* role, 80 int* script) const; 81 82 // Tokenizes a substring of the unicode string, appending the resulting tokens 83 // to the output vector. The resulting tokens have bounds relative to the full 84 // string. Does nothing if the start of the span is negative. 85 void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span, 86 std::vector<Token>* result) const; 87 88 std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const; 89 90 // Takes the result of ICU tokenization and retokenizes stretches of tokens 91 // made of a specific subset of characters using the internal tokenizer. 92 void InternalRetokenize(const UnicodeText& unicode_text, 93 std::vector<Token>* tokens) const; 94 95 // Tokenizes the input text using ICU tokenizer. 96 bool ICUTokenize(const UnicodeText& context_unicode, 97 std::vector<Token>* result) const; 98 99 private: 100 const TokenizationType type_; 101 102 const UniLib* unilib_; 103 104 // Codepoint ranges that determine how different codepoints are tokenized. 105 // The ranges must not overlap. 106 std::vector<std::unique_ptr<const TokenizationCodepointRangeT>> 107 codepoint_ranges_; 108 109 // Codepoint ranges that define which tokens (consisting of which codepoints) 110 // should be re-tokenized with the internal tokenizer in the mixed 111 // tokenization mode. 112 // NOTE: Must be sorted. 113 std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_; 114 115 // If true, tokens will be additionally split when the codepoint's script_id 116 // changes. 117 const bool split_on_script_change_; 118 119 const bool icu_preserve_whitespace_tokens_; 120 }; 121 122 } // namespace libtextclassifier3 123 124 #endif // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 125