1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_ 16 #define ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string_view> 21 #include <vector> 22 23 #include "icing/text_classifier/lib3/utils/base/statusor.h" 24 #include "icing/absl_ports/canonical_errors.h" 25 #include "icing/util/character-iterator.h" 26 27 namespace icing { 28 namespace lib { 29 30 // A base class that all other LanguageSegmenters should inherit from. It 31 // provides interfaces that allow callers to segment text. The return value 32 // could be an iterator or a list of tokens. Example usage: 33 // 34 // std::unique_ptr<LanguageSegmenter> segmenter = GetSegmenter(); 35 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iter, 36 // segmenter->Segment(text)); 37 // ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> segments, 38 // segmenter->GetAllTerms(text)); 39 class LanguageSegmenter { 40 public: 41 virtual ~LanguageSegmenter() = default; 42 43 // An iterator helping to find terms in the input text. 44 // Example usage: 45 // 46 // while (iterator.Advance()) { 47 // const std::string_view term = iterator.GetTerm(); 48 // // Do something 49 // } 50 class Iterator { 51 public: 52 virtual ~Iterator() = default; 53 54 // Advances to the next term. Returns false if it has reached the end. 55 virtual bool Advance() = 0; 56 57 // Returns the current term. It can be called only when Advance() returns 58 // true. 59 virtual std::string_view GetTerm() const = 0; 60 61 // RETURNS: 62 // On success, a CharacterIterator pointing to the beginning of the 63 // current term. 64 // ABORTED if an invalid unicode character is encountered while 65 // calculating the term start. 66 virtual libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()67 CalculateTermStart() { 68 return absl_ports::UnimplementedError(""); 69 } 70 71 // RETURNS: 72 // On success, a CharacterIterator pointing just past the end of the 73 // current term. 74 // ABORTED if an invalid unicode character is encountered while 75 // calculating the term end. 76 virtual libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()77 CalculateTermEndExclusive() { 78 return absl_ports::UnimplementedError(""); 79 } 80 81 // Resets the iterator to point to the first term that starts after UTF-32 82 // offset. 83 // GetTerm will now return that term. For example: 84 // 85 // language_segmenter = language_segmenter_factory::Create(type); 86 // iterator = language_segmenter->Segment("foo bar baz"); 87 // iterator.ResetToTermStartingAfterUtf32(4); 88 // iterator.GetTerm() // returns "baz"; 89 // 90 // Return types of OK and NOT_FOUND indicate that the function call was 91 // valid and the state of the iterator has changed. Return type of 92 // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type 93 // of ABORTED means that the iterator may be left in an undefined state and 94 // no longer be usable. 95 // 96 // Returns: 97 // On success, the UTF-32 offset of the first term that starts after 98 // offset. 99 // NOT_FOUND if an error occurred or there are no terms that start after 100 // offset. 101 // INVALID_ARGUMENT if offset is beyond the end of the text. 102 // ABORTED if an invalid unicode character is encountered while 103 // traversing the text. ResetToTermStartingAfterUtf32(int32_t offset)104 virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32( 105 int32_t offset) { 106 return absl_ports::UnimplementedError(""); 107 } 108 109 // Resets the iterator to point to the first term that ends before UTF-32 110 // offset. 111 // GetTerm will now return that term. For example: 112 // 113 // language_segmenter = language_segmenter_factory::Create(type); 114 // iterator = language_segmenter->Segment("foo bar baz"); 115 // iterator.ResetToTermEndingBeforeUtf32(7); 116 // iterator.GetTerm() // returns "bar"; 117 // 118 // Return types of OK and NOT_FOUND indicate that the function call was 119 // valid and the state of the iterator has changed. Return type of 120 // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type 121 // of ABORTED means that the iterator may be left in an undefined state and 122 // no longer be usable. 123 // 124 // Returns: 125 // On success, the UTF-32 offset of the first term that ends before 126 // offset. 127 // NOT_FOUND if an error occurred or there are no terms that ends before 128 // offset. 129 // INVALID_ARGUMENT if offset is negative 130 // ABORTED if an invalid unicode character is encountered while 131 // traversing the text. ResetToTermEndingBeforeUtf32(int32_t offset)132 virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32( 133 int32_t offset) { 134 return absl_ports::UnimplementedError(""); 135 } 136 137 // Resets the iterator to point to the first term. 138 // GetTerm will now return that term. For example: 139 // 140 // language_segmenter = language_segmenter_factory::Create(type); 141 // iterator = language_segmenter->Segment("foo bar baz"); 142 // iterator.Advance(); 143 // iterator.ResetToStartUtf32(); 144 // iterator.GetTerm() // returns "foo"; 145 // 146 // Return types of OK and NOT_FOUND indicate that the function call was 147 // valid and the state of the iterator has changed. 148 // 149 // Returns: 150 // On success, the starting position of the first term. 151 // NOT_FOUND if an error occurred or there are no valid terms in the text. 152 // ABORTED if an invalid unicode character is encountered while 153 // traversing the text. 154 virtual libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() = 0; 155 }; 156 157 // Segments the input text into terms. 158 // 159 // Returns: 160 // An iterator of terms on success 161 // INTERNAL_ERROR if any error occurs 162 // 163 // Note: The underlying char* data of the input string won't be copied but 164 // shared with the return strings, so please make sure the input string 165 // outlives the returned iterator. 166 virtual libtextclassifier3::StatusOr< 167 std::unique_ptr<LanguageSegmenter::Iterator>> 168 Segment(std::string_view text) const = 0; 169 170 // Segments and returns all terms in the input text. 171 // 172 // Returns: 173 // A list of terms on success 174 // INTERNAL_ERROR if any error occurs 175 // 176 // Note: The underlying char* data of the input string won't be copied but 177 // shared with the return strings, so please make sure the input string 178 // outlives the returned terms. 179 virtual libtextclassifier3::StatusOr<std::vector<std::string_view>> 180 GetAllTerms(std::string_view text) const = 0; 181 }; 182 183 } // namespace lib 184 } // namespace icing 185 186 #endif // ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_ 187