/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
#define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_

#include <memory>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/base/integral_types.h"
#include "utils/codepoint-range.h"
#include "utils/tokenizer_generated.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"

namespace libtextclassifier3 {

const int kInvalidScript = -1;
const int kUnknownScript = -2;

// Tokenizer splits the input string into a sequence of tokens, according to
// the configuration.
class Tokenizer {
 public:
  // `codepoint_ranges`: Codepoint ranges that determine how different
  //      codepoints are tokenized. The ranges must not overlap.
  // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which
  //      tokens should be re-tokenized with the internal tokenizer in the
  //      mixed tokenization mode.
  // `split_on_script_change`: If true, a change of codepoint script within a
  //      sequence of characters is treated as a token boundary.
  // `icu_preserve_whitespace_tokens`: If true, empty (whitespace) tokens are
  //      included in the output (in the ICU tokenization mode).
  Tokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens);

  Tokenizer(
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const bool split_on_script_change)
      : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr,
                  codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{},
                  split_on_script_change,
                  /*icu_preserve_whitespace_tokens=*/false) {}

  // Tokenizes the input string using the selected tokenization method.
  std::vector<Token> Tokenize(const std::string& text) const;

  // Same as above but takes UnicodeText.
  std::vector<Token> Tokenize(const UnicodeText& text_unicode) const;
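  //
  // Example usage (a minimal sketch; assumes `codepoint_ranges` was obtained
  // from a deserialized tokenizer configuration):
  //
  //   Tokenizer tokenizer(codepoint_ranges, /*split_on_script_change=*/false);
  //   std::vector<Token> tokens = tokenizer.Tokenize("hello world");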

 protected:
  // Finds the tokenization codepoint range config for a given codepoint.
  // Internally uses binary search so should be O(log(# of codepoint_ranges)).
  const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const;

  // Finds the role and script for a given codepoint. If not found, DEFAULT_ROLE
  // and kUnknownScript are assigned.
  void GetScriptAndRole(char32 codepoint,
                        TokenizationCodepointRange_::Role* role,
                        int* script) const;

  // Tokenizes a substring of the unicode string, appending the resulting tokens
  // to the output vector. The resulting tokens have bounds relative to the full
  // string. Does nothing if the start of the span is negative.
  void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span,
                         std::vector<Token>* result) const;

  // Tokenizes the input text using the internal tokenizer (codepoint ranges).
  std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const;

  // Takes the result of ICU tokenization and re-tokenizes stretches of tokens
  // made of a specific subset of characters using the internal tokenizer.
  void InternalRetokenize(const UnicodeText& unicode_text,
                          std::vector<Token>* tokens) const;

  // Tokenizes the input text using the ICU tokenizer.
  bool ICUTokenize(const UnicodeText& context_unicode,
                   std::vector<Token>* result) const;

 private:
  const TokenizationType type_;

  const UniLib* unilib_;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  std::vector<std::unique_ptr<const TokenizationCodepointRangeT>>
      codepoint_ranges_;

  // Codepoint ranges that define which tokens (consisting of which codepoints)
  // should be re-tokenized with the internal tokenizer in the mixed
  // tokenization mode.
  // NOTE: Must be sorted.
  std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_;

  // If true, tokens will be additionally split when the codepoint's script_id
  // changes.
  const bool split_on_script_change_;

  const bool icu_preserve_whitespace_tokens_;
};

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_