• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_
16 #define ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string_view>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/util/character-iterator.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // A base class that all other LanguageSegmenters should inherit from. It
31 // provides interfaces that allow callers to segment text. The return value
32 // could be an iterator or a list of tokens. Example usage:
33 //
34 // std::unique_ptr<LanguageSegmenter> segmenter = GetSegmenter();
35 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iter,
36 //                  segmenter->Segment(text));
37 // ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> segments,
38 // segmenter->GetAllTerms(text));
39 class LanguageSegmenter {
40  public:
41   virtual ~LanguageSegmenter() = default;
42 
43   // An iterator helping to find terms in the input text.
44   // Example usage:
45   //
46   // while (iterator.Advance()) {
47   //   const std::string_view term = iterator.GetTerm();
48   //   // Do something
49   // }
50   class Iterator {
51    public:
52     virtual ~Iterator() = default;
53 
54     // Advances to the next term. Returns false if it has reached the end.
55     virtual bool Advance() = 0;
56 
57     // Returns the current term. It can be called only when Advance() returns
58     // true.
59     virtual std::string_view GetTerm() const = 0;
60 
61     // RETURNS:
62     //   On success, a CharacterIterator pointing to the beginning of the
63     //   current term.
64     //   ABORTED if an invalid unicode character is encountered while
65     //   calculating the term start.
66     virtual libtextclassifier3::StatusOr<CharacterIterator>
CalculateTermStart()67     CalculateTermStart() {
68       return absl_ports::UnimplementedError("");
69     }
70 
71     // RETURNS:
72     //   On success, a CharacterIterator pointing just past the end of the
73     //   current term.
74     //   ABORTED if an invalid unicode character is encountered while
75     //   calculating the term end.
76     virtual libtextclassifier3::StatusOr<CharacterIterator>
CalculateTermEndExclusive()77     CalculateTermEndExclusive() {
78       return absl_ports::UnimplementedError("");
79     }
80 
81     // Resets the iterator to point to the first term that starts after UTF-32
82     // offset.
83     // GetTerm will now return that term. For example:
84     //
85     //   language_segmenter = language_segmenter_factory::Create(type);
86     //   iterator = language_segmenter->Segment("foo bar baz");
87     //   iterator.ResetToTermStartingAfterUtf32(4);
88     //   iterator.GetTerm() // returns "baz";
89     //
90     // Return types of OK and NOT_FOUND indicate that the function call was
91     // valid and the state of the iterator has changed. Return type of
92     // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
93     // of ABORTED means that the iterator may be left in an undefined state and
94     // no longer be usable.
95     //
96     // Returns:
97     //   On success, the UTF-32 offset of the first term that starts after
98     //   offset.
99     //   NOT_FOUND if an error occurred or there are no terms that start after
100     //   offset.
101     //   INVALID_ARGUMENT if offset is beyond the end of the text.
102     //   ABORTED if an invalid unicode character is encountered while
103     //   traversing the text.
ResetToTermStartingAfterUtf32(int32_t offset)104     virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
105         int32_t offset) {
106       return absl_ports::UnimplementedError("");
107     }
108 
109     // Resets the iterator to point to the first term that ends before UTF-32
110     // offset.
111     // GetTerm will now return that term. For example:
112     //
113     //   language_segmenter = language_segmenter_factory::Create(type);
114     //   iterator = language_segmenter->Segment("foo bar baz");
115     //   iterator.ResetToTermEndingBeforeUtf32(7);
116     //   iterator.GetTerm() // returns "bar";
117     //
118     // Return types of OK and NOT_FOUND indicate that the function call was
119     // valid and the state of the iterator has changed. Return type of
120     // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
121     // of ABORTED means that the iterator may be left in an undefined state and
122     // no longer be usable.
123     //
124     // Returns:
125     //   On success, the UTF-32 offset of the first term that ends before
126     //   offset.
127     //   NOT_FOUND if an error occurred or there are no terms that ends before
128     //   offset.
129     //   INVALID_ARGUMENT if offset is negative
130     //   ABORTED if an invalid unicode character is encountered while
131     //   traversing the text.
ResetToTermEndingBeforeUtf32(int32_t offset)132     virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
133         int32_t offset) {
134       return absl_ports::UnimplementedError("");
135     }
136 
137     // Resets the iterator to point to the first term.
138     // GetTerm will now return that term. For example:
139     //
140     //   language_segmenter = language_segmenter_factory::Create(type);
141     //   iterator = language_segmenter->Segment("foo bar baz");
142     //   iterator.Advance();
143     //   iterator.ResetToStartUtf32();
144     //   iterator.GetTerm() // returns "foo";
145     //
146     // Return types of OK and NOT_FOUND indicate that the function call was
147     // valid and the state of the iterator has changed.
148     //
149     // Returns:
150     //   On success, the starting position of the first term.
151     //   NOT_FOUND if an error occurred or there are no valid terms in the text.
152     //   ABORTED if an invalid unicode character is encountered while
153     //   traversing the text.
154     virtual libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() = 0;
155   };
156 
157   // Segments the input text into terms.
158   //
159   // Returns:
160   //   An iterator of terms on success
161   //   INTERNAL_ERROR if any error occurs
162   //
163   // Note: The underlying char* data of the input string won't be copied but
164   // shared with the return strings, so please make sure the input string
165   // outlives the returned iterator.
166   virtual libtextclassifier3::StatusOr<
167       std::unique_ptr<LanguageSegmenter::Iterator>>
168   Segment(std::string_view text) const = 0;
169 
170   // Segments and returns all terms in the input text.
171   //
172   // Returns:
173   //   A list of terms on success
174   //   INTERNAL_ERROR if any error occurs
175   //
176   // Note: The underlying char* data of the input string won't be copied but
177   // shared with the return strings, so please make sure the input string
178   // outlives the returned terms.
179   virtual libtextclassifier3::StatusOr<std::vector<std::string_view>>
180   GetAllTerms(std::string_view text) const = 0;
181 };
182 
183 }  // namespace lib
184 }  // namespace icing
185 
186 #endif  // ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_
187