// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #ifndef ICING_TOKENIZATION_TOKENIZER_H_
16 #define ICING_TOKENIZATION_TOKENIZER_H_
17 
#include <cstdint>
#include <memory>
#include <string_view>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/tokenization/token.h"
#include "icing/util/character-iterator.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // A virtual class that all other tokenizers should inherit. It provides
31 // interfaces that allow callers to tokenize text. The return value could be an
32 // iterator or a list of tokens. Example usage:
33 //
34 // std::unique_ptr<Tokenizer> tokenizer = GetTokenizer();
35 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter,
36 //                  tokenizer->Tokenize(text));
37 // ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
38 // tokenizer->TokenizeAll(text));
39 class Tokenizer {
40  public:
41   virtual ~Tokenizer() = default;
42 
43   enum Type {
44     // Index tokenizers
45     PLAIN,  // Used to tokenize plain text input
46 
47     // Query tokenizers
48     RAW_QUERY,  // Used to tokenize raw queries
49   };
50 
51   // An iterator helping to get tokens.
52   // Example usage:
53   //
54   // while (iterator.Advance()) {
55   //   const Token& token = iterator.GetToken();
56   //   // Do something
57   // }
58   class Iterator {
59    public:
60     virtual ~Iterator() = default;
61 
62     // Advances to the next token. Returns false if it has reached the end.
63     virtual bool Advance() = 0;
64 
65     // Returns the current token. It can be called only when Advance() returns
66     // true, otherwise an invalid token could be returned.
67     virtual Token GetToken() const = 0;
68 
69     virtual libtextclassifier3::StatusOr<CharacterIterator>
CalculateTokenStart()70     CalculateTokenStart() {
71       return absl_ports::UnimplementedError(
72           "CalculateTokenStart is not implemented!");
73     }
74 
75     virtual libtextclassifier3::StatusOr<CharacterIterator>
CalculateTokenEndExclusive()76     CalculateTokenEndExclusive() {
77       return absl_ports::UnimplementedError(
78           "CalculateTokenEndExclusive is not implemented!");
79     }
80 
81     // Sets the tokenizer to point at the first token that *starts* *after*
82     // offset. Returns false if there are no valid tokens starting after
83     // offset.
84     // Ex.
85     // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
86     // iterator.ResetToTokenAfter(4);
87     // // The first full token starting after position 4 (the 'b' in "bar") is
88     // // "baz".
89     // PrintToken(iterator.GetToken());  // prints "baz"
ResetToTokenAfter(int32_t offset)90     virtual bool ResetToTokenAfter(int32_t offset) { return false; }
91 
92     // Sets the tokenizer to point at the first token that *ends* *before*
93     // offset. Returns false if there are no valid tokens ending
94     // before offset.
95     // Ex.
96     // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
97     // iterator.ResetToTokenBefore(4);
98     // // The first full token ending before position 4 (the 'b' in "bar") is
99     // // "foo".
100     // PrintToken(iterator.GetToken());  // prints "foo"
ResetToTokenBefore(int32_t offset)101     virtual bool ResetToTokenBefore(int32_t offset) { return false; }
102 
ResetToStart()103     virtual bool ResetToStart() { return false; }
104   };
105 
106   // Tokenizes the input text. The input text should outlive the returned
107   // iterator.
108   //
109   // Returns:
110   //   A token iterator on success
111   //   INVALID_ARGUMENT with error message if input text has a wrong syntax
112   //                    according to implementations of different tokenizer
113   //                    types.
114   //   INTERNAL_ERROR if any other errors occur
115   virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize(
116       std::string_view text) const = 0;
117 
118   // Tokenizes and returns all tokens in the input text. The input text should
119   // outlive the returned vector.
120   //
121   // Returns:
122   //   A list of tokens on success
123   //   INVALID_ARGUMENT with error message if input text has a wrong syntax
124   //                    according to implementations of different tokenizer
125   //                    types.
126   //   INTERNAL_ERROR if any other errors occur
127   virtual libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
128       std::string_view text) const = 0;
129 };
130 
131 }  // namespace lib
132 }  // namespace icing
133 
134 #endif  // ICING_TOKENIZATION_TOKENIZER_H_
135