• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/tokenization/plain-tokenizer.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <vector>
20 
21 #include "icing/text_classifier/lib3/utils/base/statusor.h"
22 #include "icing/tokenization/language-segmenter.h"
23 #include "icing/util/character-iterator.h"
24 #include "icing/util/i18n-utils.h"
25 #include "icing/util/status-macros.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 namespace {
31 // Helper function to validate a term.
32 // A term is valid if:
33 //   1. it's not empty
34 //   2. it's not a whitespace
35 //   3. it's not a punctuation mark
36 //   4. it's not a null terminator
37 // TODO(b/141007791): figure out how we'd like to support special characters
38 // like "+", "&", "@", "#" in indexing and query tokenizers.
IsValidTerm(std::string_view term)39 bool IsValidTerm(std::string_view term) {
40   if (term.empty()) {
41     return false;
42   }
43   // Gets the first unicode character. We can know what the whole term is by
44   // checking only the first character.
45   return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) &&
46          !i18n_utils::IsPunctuationAt(term, /*position=*/0) &&
47          !(term[0] == '\0');
48 }
49 }  // namespace
50 
51 // Plain tokenizer applies its rules to the results from language segmenter. It
52 // simply filters out invalid terms from language segmenter and returns
53 // everything else as tokens. Please refer to IsValidTerm() above for what terms
54 // are valid.
55 class PlainTokenIterator : public Tokenizer::Iterator {
56  public:
PlainTokenIterator(std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)57   explicit PlainTokenIterator(
58       std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)
59       : base_iterator_(std::move(base_iterator)) {}
60 
Advance()61   bool Advance() override {
62     bool found_next_valid_term = false;
63     while (!found_next_valid_term && base_iterator_->Advance()) {
64       current_term_ = base_iterator_->GetTerm();
65       found_next_valid_term = IsValidTerm(current_term_);
66     }
67     return found_next_valid_term;
68   }
69 
GetTokens() const70   std::vector<Token> GetTokens() const override {
71     std::vector<Token> result;
72     if (!current_term_.empty()) {
73       result.push_back(Token(Token::Type::REGULAR, current_term_));
74     }
75     return result;
76   }
77 
CalculateTokenStart()78   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
79       override {
80     return base_iterator_->CalculateTermStart();
81   }
82 
CalculateTokenEndExclusive()83   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
84       override {
85     return base_iterator_->CalculateTermEndExclusive();
86   }
87 
ResetToTokenStartingAfter(int32_t utf32_offset)88   bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
89     if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) {
90       return false;
91     }
92     current_term_ = base_iterator_->GetTerm();
93     if (!IsValidTerm(current_term_)) {
94       // If the current value isn't valid, advance to the next valid value.
95       return Advance();
96     }
97     return true;
98   }
99 
ResetToTokenEndingBefore(int32_t utf32_offset)100   bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
101     ICING_ASSIGN_OR_RETURN(
102         utf32_offset,
103         base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
104     current_term_ = base_iterator_->GetTerm();
105     while (!IsValidTerm(current_term_)) {
106       // Haven't found a valid term yet. Retrieve the term prior to this one
107       // from the segmenter.
108       ICING_ASSIGN_OR_RETURN(
109           utf32_offset,
110           base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
111       current_term_ = base_iterator_->GetTerm();
112     }
113     return true;
114   }
115 
ResetToStart()116   bool ResetToStart() override {
117     if (!base_iterator_->ResetToStartUtf32().ok()) {
118       return false;
119     }
120     current_term_ = base_iterator_->GetTerm();
121     if (!IsValidTerm(current_term_)) {
122       // If the current value isn't valid, advance to the next valid value.
123       return Advance();
124     }
125     return true;
126   }
127 
128  private:
129   std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_;
130   std::string_view current_term_;
131 };
132 
133 libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
Tokenize(std::string_view text) const134 PlainTokenizer::Tokenize(std::string_view text) const {
135   ICING_ASSIGN_OR_RETURN(
136       std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
137       language_segmenter_.Segment(text));
138   return std::make_unique<PlainTokenIterator>(std::move(base_iterator));
139 }
140 
TokenizeAll(std::string_view text) const141 libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll(
142     std::string_view text) const {
143   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
144                          Tokenize(text));
145   std::vector<Token> tokens;
146   while (iterator->Advance()) {
147     std::vector<Token> batch_tokens = iterator->GetTokens();
148     tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
149   }
150   return tokens;
151 }
152 
153 }  // namespace lib
154 }  // namespace icing
155