1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TOKENIZATION_TOKENIZER_H_ 16 #define ICING_TOKENIZATION_TOKENIZER_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string_view> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/absl_ports/canonical_errors.h" 24 #include "icing/tokenization/token.h" 25 #include "icing/util/character-iterator.h" 26 27 namespace icing { 28 namespace lib { 29 30 // A virtual class that all other tokenizers should inherit. It provides 31 // interfaces that allow callers to tokenize text. The return value could be an 32 // iterator or a list of tokens. Example usage: 33 // 34 // std::unique_ptr<Tokenizer> tokenizer = GetTokenizer(); 35 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter, 36 // tokenizer->Tokenize(text)); 37 // ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens, 38 // tokenizer->TokenizeAll(text)); 39 class Tokenizer { 40 public: 41 virtual ~Tokenizer() = default; 42 43 enum Type { 44 // Index tokenizers 45 PLAIN, // Used to tokenize plain text input 46 47 // Query tokenizers 48 RAW_QUERY, // Used to tokenize raw queries 49 }; 50 51 // An iterator helping to get tokens. 52 // Example usage: 53 // 54 // while (iterator.Advance()) { 55 // const Token& token = iterator.GetToken(); 56 // // Do something 57 // } 58 class Iterator { 59 public: 60 virtual ~Iterator() = default; 61 62 // Advances to the next token. Returns false if it has reached the end. 63 virtual bool Advance() = 0; 64 65 // Returns the current token. It can be called only when Advance() returns 66 // true, otherwise an invalid token could be returned. 67 virtual Token GetToken() const = 0; 68 69 virtual libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()70 CalculateTokenStart() { 71 return absl_ports::UnimplementedError( 72 "CalculateTokenStart is not implemented!"); 73 } 74 75 virtual libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()76 CalculateTokenEndExclusive() { 77 return absl_ports::UnimplementedError( 78 "CalculateTokenEndExclusive is not implemented!"); 79 } 80 81 // Sets the tokenizer to point at the first token that *starts* *after* 82 // offset. Returns false if there are no valid tokens starting after 83 // offset. 84 // Ex. 85 // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); 86 // iterator.ResetToTokenAfter(4); 87 // // The first full token starting after position 4 (the 'b' in "bar") is 88 // // "baz". 89 // PrintToken(iterator.GetToken()); // prints "baz" ResetToTokenAfter(int32_t offset)90 virtual bool ResetToTokenAfter(int32_t offset) { return false; } 91 92 // Sets the tokenizer to point at the first token that *ends* *before* 93 // offset. Returns false if there are no valid tokens ending 94 // before offset. 95 // Ex. 96 // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); 97 // iterator.ResetToTokenBefore(4); 98 // // The first full token ending before position 4 (the 'b' in "bar") is 99 // // "foo". 100 // PrintToken(iterator.GetToken()); // prints "foo" ResetToTokenBefore(int32_t offset)101 virtual bool ResetToTokenBefore(int32_t offset) { return false; } 102 ResetToStart()103 virtual bool ResetToStart() { return false; } 104 }; 105 106 // Tokenizes the input text. The input text should outlive the returned 107 // iterator. 108 // 109 // Returns: 110 // A token iterator on success 111 // INVALID_ARGUMENT with error message if input text has a wrong syntax 112 // according to implementations of different tokenizer 113 // types. 114 // INTERNAL_ERROR if any other errors occur 115 virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize( 116 std::string_view text) const = 0; 117 118 // Tokenizes and returns all tokens in the input text. The input text should 119 // outlive the returned vector. 120 // 121 // Returns: 122 // A list of tokens on success 123 // INVALID_ARGUMENT with error message if input text has a wrong syntax 124 // according to implementations of different tokenizer 125 // types. 126 // INTERNAL_ERROR if any other errors occur 127 virtual libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( 128 std::string_view text) const = 0; 129 }; 130 131 } // namespace lib 132 } // namespace icing 133 134 #endif // ICING_TOKENIZATION_TOKENIZER_H_ 135