/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
#define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_

#include <memory>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/base/integral_types.h"
#include "utils/codepoint-range.h"
#include "utils/tokenizer_generated.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"

namespace libtextclassifier3 {

const int kInvalidScript = -1;
const int kUnknownScript = -2;

// Tokenizer splits the input string into a sequence of tokens, according to
// the configuration.
class Tokenizer {
 public:
  // `codepoint_ranges`: Codepoint ranges that determine how different
  //      codepoints are tokenized. The ranges must not overlap.
  // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which
  //      tokens should be re-tokenized with the internal tokenizer in the
  //      mixed tokenization mode.
  // `split_on_script_change`: If true, a change of codepoint script within a
  //      sequence of characters is treated as a token boundary.
  // `icu_preserve_whitespace_tokens`: If true, whitespace tokens are included
  //      in the output (in the ICU tokenization mode).
  // `preserve_floating_numbers`: If true (the default), dots between digits
  //      are kept with the digits instead of forming separate tokens (in the
  //      LETTER_DIGIT tokenization mode).
  Tokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens,
      const bool preserve_floating_numbers);

  Tokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens)
      : Tokenizer(type, unilib, codepoint_ranges,
                  internal_tokenizer_codepoint_ranges, split_on_script_change,
                  icu_preserve_whitespace_tokens,
                  /*preserve_floating_numbers=*/true) {}

  Tokenizer(
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const bool split_on_script_change)
      : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr,
                  codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{},
                  split_on_script_change,
                  /*icu_preserve_whitespace_tokens=*/false,
                  /*preserve_floating_numbers=*/true) {}
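
  // For illustration, a minimal usage sketch with the internal tokenizer.
  // This is only a sketch: the codepoint range configuration normally comes
  // from the model flatbuffer, and `ranges` below is a hypothetical
  // placeholder for it.
  //
  //   std::vector<const TokenizationCodepointRange*> ranges = ...;
  //   Tokenizer tokenizer(ranges, /*split_on_script_change=*/false);
  //   std::vector<Token> tokens = tokenizer.Tokenize("Hello world");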

  // Describes the type of tokens used in the NumberTokenizer.
  enum NumberTokenType {
    INVALID_TOKEN_TYPE,
    NUMERICAL,
    TERM,
    WHITESPACE,
    SEPARATOR,
    NOT_SET
  };

  // Tokenizes the input string using the selected tokenization method.
  std::vector<Token> Tokenize(const std::string& text) const;

  // Same as above but takes UnicodeText.
  std::vector<Token> Tokenize(const UnicodeText& text_unicode) const;
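
  // Sketch of the UnicodeText overload, assuming the UTF8ToUnicodeText helper
  // from utils/utf8/unicodetext.h (included above) with a `do_copy` parameter,
  // and the `tokenizer` from the sketch further up:
  //
  //   const UnicodeText unicode_text =
  //       UTF8ToUnicodeText("Hello world", /*do_copy=*/false);
  //   const std::vector<Token> tokens = tokenizer.Tokenize(unicode_text);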

 protected:
  // Finds the tokenization codepoint range config for the given codepoint.
  // Internally uses binary search so should be O(log(# of codepoint_ranges)).
  const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const;

  // Finds the role and script for the given codepoint. If not found,
  // DEFAULT_ROLE and kUnknownScript are assigned.
  void GetScriptAndRole(char32 codepoint,
                        TokenizationCodepointRange_::Role* role,
                        int* script) const;

  // Tokenizes a substring of the unicode string, appending the resulting tokens
  // to the output vector. The resulting tokens have bounds relative to the full
  // string. Does nothing if the start of the span is negative.
  void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span,
                         std::vector<Token>* result) const;

  std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const;

  // Takes the result of ICU tokenization and retokenizes stretches of tokens
  // made of a specific subset of characters using the internal tokenizer.
  void InternalRetokenize(const UnicodeText& unicode_text,
                          std::vector<Token>* tokens) const;

  // Tokenizes the input text using the ICU tokenizer.
  bool ICUTokenize(const UnicodeText& context_unicode,
                   std::vector<Token>* result) const;

  // Tokenizes the input into number, word, and separator tokens.
  bool NumberTokenize(const UnicodeText& text_unicode,
                      std::vector<Token>* result) const;
 private:
  const TokenizationType type_;

  const UniLib* unilib_;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  std::vector<std::unique_ptr<const TokenizationCodepointRangeT>>
      codepoint_ranges_;

  // Codepoint ranges that define which tokens (consisting of which codepoints)
  // should be re-tokenized with the internal tokenizer in the mixed
  // tokenization mode.
  // NOTE: Must be sorted.
  std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_;

  // If true, tokens will be additionally split when the codepoint's script_id
  // changes.
  const bool split_on_script_change_;

  const bool icu_preserve_whitespace_tokens_;
  const bool preserve_floating_numbers_;
};

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_