1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/tokenization/plain-tokenizer.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <vector>
20
21 #include "icing/text_classifier/lib3/utils/base/statusor.h"
22 #include "icing/tokenization/language-segmenter.h"
23 #include "icing/util/character-iterator.h"
24 #include "icing/util/i18n-utils.h"
25 #include "icing/util/status-macros.h"
26
27 namespace icing {
28 namespace lib {
29
30 namespace {
31 // Helper function to validate a term.
32 // A term is valid if:
33 // 1. it's not empty
34 // 2. it's not a whitespace
35 // 3. it's not a punctuation mark
36 // 4. it's not a null terminator
37 // TODO(b/141007791): figure out how we'd like to support special characters
38 // like "+", "&", "@", "#" in indexing and query tokenizers.
IsValidTerm(std::string_view term)39 bool IsValidTerm(std::string_view term) {
40 if (term.empty()) {
41 return false;
42 }
43 // Gets the first unicode character. We can know what the whole term is by
44 // checking only the first character.
45 return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) &&
46 !i18n_utils::IsPunctuationAt(term, /*position=*/0) &&
47 !(term[0] == '\0');
48 }
49 } // namespace
50
51 // Plain tokenizer applies its rules to the results from language segmenter. It
52 // simply filters out invalid terms from language segmenter and returns
53 // everything else as tokens. Please refer to IsValidTerm() above for what terms
54 // are valid.
55 class PlainTokenIterator : public Tokenizer::Iterator {
56 public:
PlainTokenIterator(std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)57 explicit PlainTokenIterator(
58 std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)
59 : base_iterator_(std::move(base_iterator)) {}
60
Advance()61 bool Advance() override {
62 bool found_next_valid_term = false;
63 while (!found_next_valid_term && base_iterator_->Advance()) {
64 current_term_ = base_iterator_->GetTerm();
65 found_next_valid_term = IsValidTerm(current_term_);
66 }
67 return found_next_valid_term;
68 }
69
GetTokens() const70 std::vector<Token> GetTokens() const override {
71 std::vector<Token> result;
72 if (!current_term_.empty()) {
73 result.push_back(Token(Token::Type::REGULAR, current_term_));
74 }
75 return result;
76 }
77
CalculateTokenStart()78 libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
79 override {
80 return base_iterator_->CalculateTermStart();
81 }
82
CalculateTokenEndExclusive()83 libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
84 override {
85 return base_iterator_->CalculateTermEndExclusive();
86 }
87
ResetToTokenStartingAfter(int32_t utf32_offset)88 bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
89 if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) {
90 return false;
91 }
92 current_term_ = base_iterator_->GetTerm();
93 if (!IsValidTerm(current_term_)) {
94 // If the current value isn't valid, advance to the next valid value.
95 return Advance();
96 }
97 return true;
98 }
99
ResetToTokenEndingBefore(int32_t utf32_offset)100 bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
101 ICING_ASSIGN_OR_RETURN(
102 utf32_offset,
103 base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
104 current_term_ = base_iterator_->GetTerm();
105 while (!IsValidTerm(current_term_)) {
106 // Haven't found a valid term yet. Retrieve the term prior to this one
107 // from the segmenter.
108 ICING_ASSIGN_OR_RETURN(
109 utf32_offset,
110 base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
111 current_term_ = base_iterator_->GetTerm();
112 }
113 return true;
114 }
115
ResetToStart()116 bool ResetToStart() override {
117 if (!base_iterator_->ResetToStartUtf32().ok()) {
118 return false;
119 }
120 current_term_ = base_iterator_->GetTerm();
121 if (!IsValidTerm(current_term_)) {
122 // If the current value isn't valid, advance to the next valid value.
123 return Advance();
124 }
125 return true;
126 }
127
128 private:
129 std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_;
130 std::string_view current_term_;
131 };
132
133 libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
Tokenize(std::string_view text) const134 PlainTokenizer::Tokenize(std::string_view text) const {
135 ICING_ASSIGN_OR_RETURN(
136 std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
137 language_segmenter_.Segment(text));
138 return std::make_unique<PlainTokenIterator>(std::move(base_iterator));
139 }
140
TokenizeAll(std::string_view text) const141 libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll(
142 std::string_view text) const {
143 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
144 Tokenize(text));
145 std::vector<Token> tokens;
146 while (iterator->Advance()) {
147 std::vector<Token> batch_tokens = iterator->GetTokens();
148 tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
149 }
150 return tokens;
151 }
152
153 } // namespace lib
154 } // namespace icing
155