1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/query/advanced_query_parser/query-visitor.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <iterator>
20 #include <limits>
21 #include <memory>
22 #include <set>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <utility>
27 #include <vector>
28
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/absl_ports/str_join.h"
34 #include "icing/index/embed/doc-hit-info-iterator-embedding.h"
35 #include "icing/index/embed/embedding-query-results.h"
36 #include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
37 #include "icing/index/iterator/doc-hit-info-iterator-and.h"
38 #include "icing/index/iterator/doc-hit-info-iterator-filter.h"
39 #include "icing/index/iterator/doc-hit-info-iterator-none.h"
40 #include "icing/index/iterator/doc-hit-info-iterator-not.h"
41 #include "icing/index/iterator/doc-hit-info-iterator-or.h"
42 #include "icing/index/iterator/doc-hit-info-iterator-property-in-document.h"
43 #include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h"
44 #include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
45 #include "icing/index/iterator/doc-hit-info-iterator.h"
46 #include "icing/index/iterator/section-restrict-data.h"
47 #include "icing/index/property-existence-indexing-handler.h"
48 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
49 #include "icing/query/advanced_query_parser/function.h"
50 #include "icing/query/advanced_query_parser/lexer.h"
51 #include "icing/query/advanced_query_parser/param.h"
52 #include "icing/query/advanced_query_parser/parser.h"
53 #include "icing/query/advanced_query_parser/pending-value.h"
54 #include "icing/query/advanced_query_parser/util/string-util.h"
55 #include "icing/query/query-features.h"
56 #include "icing/query/query-results.h"
57 #include "icing/schema/property-util.h"
58 #include "icing/schema/schema-store.h"
59 #include "icing/schema/section.h"
60 #include "icing/tokenization/token.h"
61 #include "icing/tokenization/tokenizer.h"
62 #include "icing/util/embedding-util.h"
63 #include "icing/util/status-macros.h"
64
65 namespace icing {
66 namespace lib {
67
68 namespace {
69
70 struct CreateList {
operator ()icing::lib::__anone73a13e90111::CreateList71 libtextclassifier3::StatusOr<PendingValue> operator()(
72 std::vector<PendingValue>&& args) const {
73 std::vector<std::string> values;
74 values.reserve(args.size());
75 for (PendingValue& arg : args) {
76 QueryTerm string_val = std::move(arg).string_val().ValueOrDie();
77 values.push_back(std::move(string_val.term));
78 }
79 return PendingValue(std::move(values));
80 }
81 };
82
IsNumericComparator(std::string_view operator_text)83 bool IsNumericComparator(std::string_view operator_text) {
84 if (operator_text.length() < 1 || operator_text.length() > 2) {
85 return false;
86 }
87 // TODO(tjbarron) decide how/if to support !=
88 return operator_text == "<" || operator_text == ">" ||
89 operator_text == "==" || operator_text == "<=" ||
90 operator_text == ">=";
91 }
92
IsSupportedNaryOperator(std::string_view operator_text)93 bool IsSupportedNaryOperator(std::string_view operator_text) {
94 return IsNumericComparator(operator_text) || operator_text == "AND" ||
95 operator_text == "OR" || operator_text == ":";
96 }
97
98 struct Int64Range {
99 int64_t low;
100 int64_t high;
101 };
102
GetInt64Range(std::string_view operator_text,int64_t int_value)103 libtextclassifier3::StatusOr<Int64Range> GetInt64Range(
104 std::string_view operator_text, int64_t int_value) {
105 Int64Range range = {std::numeric_limits<int64_t>::min(),
106 std::numeric_limits<int64_t>::max()};
107 if (operator_text == "<") {
108 if (int_value == std::numeric_limits<int64_t>::min()) {
109 return absl_ports::InvalidArgumentError(
110 "Cannot specify < INT64_MIN in query expression.");
111 }
112 range.high = int_value - 1;
113 } else if (operator_text == "<=") {
114 range.high = int_value;
115 } else if (operator_text == "==") {
116 range.high = int_value;
117 range.low = int_value;
118 } else if (operator_text == ">=") {
119 range.low = int_value;
120 } else if (operator_text == ">") {
121 if (int_value == std::numeric_limits<int64_t>::max()) {
122 return absl_ports::InvalidArgumentError(
123 "Cannot specify > INT64_MAX in query expression.");
124 }
125 range.low = int_value + 1;
126 }
127 return range;
128 }
129
130 } // namespace
131
AddValidRestricts(std::set<std::string> new_restricts)132 void QueryVisitor::PendingPropertyRestricts::AddValidRestricts(
133 std::set<std::string> new_restricts) {
134 if (!has_active_property_restricts()) {
135 pending_property_restricts_.push_back(std::move(new_restricts));
136 return;
137 }
138
139 // There is an active property restrict already in effect. To determine the
140 // updated active property restrict being applied at this level, we need to
141 // calculate the intersection of new_restricts and
142 // active_property_restricts.
143 const std::set<std::string>& active_restricts = active_property_restricts();
144 auto active_restricts_itr = active_restricts.begin();
145 for (auto new_restricts_itr = new_restricts.begin();
146 new_restricts_itr != new_restricts.end();) {
147 while (active_restricts_itr != active_restricts.end() &&
148 *active_restricts_itr < *new_restricts_itr) {
149 // new_restricts_itr is behind active_restricts_itr.
150 ++active_restricts_itr;
151 }
152 if (active_restricts_itr == active_restricts.end()) {
153 // There's nothing left in active restricts. Everything at
154 // new_restricts_itr and beyond should be removed
155 new_restricts_itr =
156 new_restricts.erase(new_restricts_itr, new_restricts.end());
157 } else if (*active_restricts_itr > *new_restricts_itr) {
158 // new_restricts_itr points to elements not present in
159 // active_restricts_itr
160 new_restricts_itr = new_restricts.erase(new_restricts_itr);
161 } else {
162 // the element that new_restricts_itr points to is present in
163 // active_restricts_itr.
164 ++new_restricts_itr;
165 }
166 }
167 pending_property_restricts_.push_back(std::move(new_restricts));
168 }
169
170 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
CreateTermIterator(const QueryTerm & query_term)171 QueryVisitor::CreateTermIterator(const QueryTerm& query_term) {
172 if (query_term.is_prefix_val) {
173 // '*' prefix operator was added in list filters
174 features_.insert(kListFilterQueryLanguageFeature);
175 }
176 TermMatchType::Code match_type = GetTermMatchType(query_term.is_prefix_val);
177 int unnormalized_term_start =
178 query_term.raw_term.data() - raw_query_text_.data();
179 if (!processing_not_) {
180 // 1. Add term to property_query_terms_map
181 if (pending_property_restricts_.has_active_property_restricts()) {
182 for (const std::string& property_restrict :
183 pending_property_restricts_.active_property_restricts()) {
184 property_query_terms_map_[property_restrict].insert(query_term.term);
185 }
186 } else {
187 property_query_terms_map_[""].insert(query_term.term);
188 }
189
190 // 2. If needed add term iterator to query_term_iterators_ map.
191 if (needs_term_frequency_info_) {
192 ICING_ASSIGN_OR_RETURN(
193 std::unique_ptr<DocHitInfoIterator> term_iterator,
194 index_.GetIterator(query_term.term, unnormalized_term_start,
195 query_term.raw_term.length(), kSectionIdMaskAll,
196 match_type_, needs_term_frequency_info_));
197 query_term_iterators_[query_term.term] =
198 std::make_unique<DocHitInfoIteratorFilter>(
199 std::move(term_iterator), &document_store_, &schema_store_,
200 filter_options_, current_time_ms_);
201 }
202 }
203
204 // 3. Add the term iterator.
205 return index_.GetIterator(query_term.term, unnormalized_term_start,
206 query_term.raw_term.length(), kSectionIdMaskAll,
207 match_type, needs_term_frequency_info_);
208 }
209
RegisterFunctions()210 void QueryVisitor::RegisterFunctions() {
211 // std::vector<std::string> createList(std::string...);
212 Function create_list_function_ =
213 Function::Create(DataType::kStringList, "createList",
214 {Param(DataType::kString, Cardinality::kRequired),
215 Param(DataType::kString, Cardinality::kVariable)},
216 CreateList())
217 .ValueOrDie();
218 registered_functions_.insert(
219 {create_list_function_.name(), std::move(create_list_function_)});
220
221 // DocHitInfoIterator search(std::string);
222 // DocHitInfoIterator search(std::string, std::vector<std::string>);
223 auto search_eval = [this](std::vector<PendingValue>&& args) {
224 return this->SearchFunction(std::move(args));
225 };
226 Function search_function =
227 Function::Create(DataType::kDocumentIterator, "search",
228 {Param(DataType::kString),
229 Param(DataType::kStringList, Cardinality::kOptional)},
230 std::move(search_eval))
231 .ValueOrDie();
232 registered_functions_.insert(
233 {search_function.name(), std::move(search_function)});
234
235 // DocHitInfoIterator propertyDefined(std::string);
236 auto property_defined = [this](std::vector<PendingValue>&& args) {
237 return this->PropertyDefinedFunction(std::move(args));
238 };
239 Function property_defined_function =
240 Function::Create(DataType::kDocumentIterator, "propertyDefined",
241 {Param(DataType::kString)}, std::move(property_defined))
242 .ValueOrDie();
243 registered_functions_.insert(
244 {property_defined_function.name(), std::move(property_defined_function)});
245
246 // DocHitInfoIterator hasProperty(std::string);
247 auto has_property = [this](std::vector<PendingValue>&& args) {
248 return this->HasPropertyFunction(std::move(args));
249 };
250 Function has_property_function =
251 Function::Create(DataType::kDocumentIterator, "hasProperty",
252 {Param(DataType::kString)}, std::move(has_property))
253 .ValueOrDie();
254 registered_functions_.insert(
255 {has_property_function.name(), std::move(has_property_function)});
256
257 // vector_index getSearchSpecEmbedding(long);
258 auto get_search_spec_embedding = [](std::vector<PendingValue>&& args) {
259 return PendingValue::CreateVectorIndexPendingValue(
260 args.at(0).long_val().ValueOrDie());
261 };
262 Function get_search_spec_embedding_function =
263 Function::Create(DataType::kVectorIndex, "getSearchSpecEmbedding",
264 {Param(DataType::kLong)},
265 std::move(get_search_spec_embedding))
266 .ValueOrDie();
267 registered_functions_.insert({get_search_spec_embedding_function.name(),
268 std::move(get_search_spec_embedding_function)});
269
270 // DocHitInfoIterator semanticSearch(vector_index, double, double, string);
271 auto semantic_search = [this](std::vector<PendingValue>&& args) {
272 return this->SemanticSearchFunction(std::move(args));
273 };
274 Function semantic_search_function =
275 Function::Create(DataType::kDocumentIterator, "semanticSearch",
276 {Param(DataType::kVectorIndex),
277 Param(DataType::kDouble, Cardinality::kOptional),
278 Param(DataType::kDouble, Cardinality::kOptional),
279 Param(DataType::kString, Cardinality::kOptional)},
280 std::move(semantic_search))
281 .ValueOrDie();
282 registered_functions_.insert(
283 {semantic_search_function.name(), std::move(semantic_search_function)});
284
285 // DocHitInfoIterator tokenize(std::string);
286 auto tokenize = [this](std::vector<PendingValue>&& args) {
287 return this->TokenizeFunction(std::move(args));
288 };
289 Function tokenize_function =
290 Function::Create(DataType::kDocumentIterator, "tokenize",
291 {Param(DataType::kString)}, std::move(tokenize))
292 .ValueOrDie();
293 registered_functions_.insert(
294 {tokenize_function.name(), std::move(tokenize_function)});
295 }
296
SearchFunction(std::vector<PendingValue> && args)297 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction(
298 std::vector<PendingValue>&& args) {
299 // The second arg (if present) is a list of sections to restrict to.
300 if (args.size() == 2) {
301 std::set<std::string> new_restricts;
302 std::vector<std::string> property_restricts =
303 std::move(args.at(1)).string_vals().ValueOrDie();
304 for (std::string& property_restrict : property_restricts) {
305 new_restricts.insert(std::move(property_restrict));
306 }
307 pending_property_restricts_.AddValidRestricts(std::move(new_restricts));
308 if (pending_property_restricts_.active_property_restricts().empty()) {
309 pending_property_restricts_.PopRestricts();
310 return PendingValue(std::make_unique<DocHitInfoIteratorNone>());
311 }
312 }
313
314 // The first arg is guaranteed to be a STRING at this point. It should be safe
315 // to call ValueOrDie.
316 const QueryTerm* query = args.at(0).string_val().ValueOrDie();
317 Lexer lexer(query->term, Lexer::Language::QUERY);
318 ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
319 lexer.ExtractTokens());
320
321 Parser parser = Parser::Create(std::move(lexer_tokens));
322 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> tree_root,
323 parser.ConsumeQuery());
324
325 std::unique_ptr<DocHitInfoIterator> iterator;
326 QueryResults query_result;
327 if (tree_root == nullptr) {
328 iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
329 document_store_.last_added_document_id());
330 } else {
331 QueryVisitor query_visitor(
332 &index_, &numeric_index_, &embedding_index_, &document_store_,
333 &schema_store_, &normalizer_, &tokenizer_, query->raw_term,
334 embedding_query_vectors_, filter_options_, match_type_,
335 embedding_query_metric_type_, needs_term_frequency_info_,
336 pending_property_restricts_, processing_not_, current_time_ms_);
337 tree_root->Accept(&query_visitor);
338 ICING_ASSIGN_OR_RETURN(query_result,
339 std::move(query_visitor).ConsumeResults());
340 iterator = std::move(query_result.root_iterator);
341 }
342
343 // Update members based on results of processing the query.
344 if (args.size() == 2 &&
345 pending_property_restricts_.has_active_property_restricts()) {
346 iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
347 std::move(iterator), &document_store_, &schema_store_,
348 pending_property_restricts_.active_property_restricts(),
349 current_time_ms_);
350 pending_property_restricts_.PopRestricts();
351 }
352 if (!processing_not_) {
353 std::move(
354 query_result.query_term_iterators.begin(),
355 query_result.query_term_iterators.end(),
356 std::inserter(query_term_iterators_, query_term_iterators_.end()));
357
358 std::move(query_result.query_terms.begin(), query_result.query_terms.end(),
359 std::inserter(property_query_terms_map_,
360 property_query_terms_map_.end()));
361 }
362 std::move(query_result.features_in_use.begin(),
363 query_result.features_in_use.end(),
364 std::inserter(features_, features_.end()));
365 return PendingValue(std::move(iterator));
366 }
367
368 libtextclassifier3::StatusOr<PendingValue>
PropertyDefinedFunction(std::vector<PendingValue> && args)369 QueryVisitor::PropertyDefinedFunction(std::vector<PendingValue>&& args) {
370 // The first arg is guaranteed to be a STRING at this point. It should be safe
371 // to call ValueOrDie.
372 const QueryTerm* member = args.at(0).string_val().ValueOrDie();
373
374 std::unique_ptr<DocHitInfoIterator> all_docs_iterator =
375 std::make_unique<DocHitInfoIteratorAllDocumentId>(
376 document_store_.last_added_document_id());
377
378 std::set<std::string> target_sections = {std::move(member->term)};
379 std::unique_ptr<DocHitInfoIterator> property_in_schema_iterator =
380 std::make_unique<DocHitInfoIteratorPropertyInSchema>(
381 std::move(all_docs_iterator), &document_store_, &schema_store_,
382 std::move(target_sections), current_time_ms_);
383
384 features_.insert(kListFilterQueryLanguageFeature);
385
386 return PendingValue(std::move(property_in_schema_iterator));
387 }
388
HasPropertyFunction(std::vector<PendingValue> && args)389 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::HasPropertyFunction(
390 std::vector<PendingValue>&& args) {
391 // The first arg is guaranteed to be a STRING at this point. It should be safe
392 // to call ValueOrDie.
393 const std::string& property_path = args.at(0).string_val().ValueOrDie()->term;
394
395 // Perform an exact search for the property existence metadata token.
396 ICING_ASSIGN_OR_RETURN(
397 std::unique_ptr<DocHitInfoIterator> meta_hit_iterator,
398 index_.GetIterator(
399 absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path),
400 /*term_start_index=*/0,
401 /*unnormalized_term_length=*/0, kSectionIdMaskAll,
402 TermMatchType::EXACT_ONLY,
403 /*need_hit_term_frequency=*/false));
404
405 std::unique_ptr<DocHitInfoIterator> property_in_document_iterator =
406 std::make_unique<DocHitInfoIteratorPropertyInDocument>(
407 std::move(meta_hit_iterator));
408
409 features_.insert(kHasPropertyFunctionFeature);
410
411 return PendingValue(std::move(property_in_document_iterator));
412 }
413
SemanticSearchFunction(std::vector<PendingValue> && args)414 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SemanticSearchFunction(
415 std::vector<PendingValue>&& args) {
416 features_.insert(kEmbeddingSearchFeature);
417
418 int64_t vector_index = args.at(0).vector_index_val().ValueOrDie();
419 if (embedding_query_vectors_ == nullptr || vector_index < 0 ||
420 vector_index >= embedding_query_vectors_->size()) {
421 return absl_ports::InvalidArgumentError("Got invalid vector search index!");
422 }
423
424 // Handle default values for the optional arguments.
425 double low = -std::numeric_limits<double>::infinity();
426 double high = std::numeric_limits<double>::infinity();
427 SearchSpecProto::EmbeddingQueryMetricType::Code metric_type =
428 embedding_query_metric_type_;
429 if (args.size() >= 2) {
430 low = args.at(1).double_val().ValueOrDie();
431 }
432 if (args.size() >= 3) {
433 high = args.at(2).double_val().ValueOrDie();
434 }
435 if (low > high) {
436 return absl_ports::InvalidArgumentError(
437 "The lower bound cannot be greater than the upper bound.");
438 }
439 if (args.size() >= 4) {
440 const std::string& metric = args.at(3).string_val().ValueOrDie()->term;
441 ICING_ASSIGN_OR_RETURN(
442 metric_type,
443 embedding_util::GetEmbeddingQueryMetricTypeFromName(metric));
444 }
445
446 // Create SectionRestrictData for section restriction.
447 std::unique_ptr<SectionRestrictData> section_restrict_data = nullptr;
448 if (pending_property_restricts_.has_active_property_restricts()) {
449 std::unordered_map<std::string, std::set<std::string>>
450 type_property_filters;
451 type_property_filters[std::string(SchemaStore::kSchemaTypeWildcard)] =
452 pending_property_restricts_.active_property_restricts();
453 section_restrict_data = std::make_unique<SectionRestrictData>(
454 &document_store_, &schema_store_, current_time_ms_,
455 type_property_filters);
456 }
457
458 // Create and return iterator.
459 EmbeddingQueryResults::EmbeddingQueryScoreMap* score_map =
460 &embedding_query_results_.result_scores[vector_index][metric_type];
461 ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
462 DocHitInfoIteratorEmbedding::Create(
463 &embedding_query_vectors_->at(vector_index),
464 std::move(section_restrict_data), metric_type, low,
465 high, score_map, &embedding_index_));
466 return PendingValue(std::move(iterator));
467 }
468
TokenizeFunction(std::vector<PendingValue> && args)469 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::TokenizeFunction(
470 std::vector<PendingValue>&& args) {
471 features_.insert(kTokenizeFeature);
472
473 QueryTerm text_value = std::move(args.at(0)).string_val().ValueOrDie();
474 text_value.is_prefix_val = false; // the prefix operator cannot be used here.
475 ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
476 ProduceTextTokenIterators(std::move(text_value)));
477 return PendingValue(std::move(iterator));
478 }
479
PopPendingIntValue()480 libtextclassifier3::StatusOr<int64_t> QueryVisitor::PopPendingIntValue() {
481 if (pending_values_.empty()) {
482 return absl_ports::InvalidArgumentError("Unable to retrieve int value.");
483 }
484 ICING_ASSIGN_OR_RETURN(int64_t int_value, pending_values_.top().long_val());
485 pending_values_.pop();
486 return int_value;
487 }
488
PopPendingStringValue()489 libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingStringValue() {
490 if (pending_values_.empty()) {
491 return absl_ports::InvalidArgumentError("Unable to retrieve string value.");
492 }
493 ICING_ASSIGN_OR_RETURN(QueryTerm string_value,
494 std::move(pending_values_.top()).string_val());
495 pending_values_.pop();
496 return string_value;
497 }
498
PopPendingTextValue()499 libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingTextValue() {
500 if (pending_values_.empty()) {
501 return absl_ports::InvalidArgumentError("Unable to retrieve text value.");
502 }
503 ICING_ASSIGN_OR_RETURN(QueryTerm text_value,
504 std::move(pending_values_.top()).text_val());
505 pending_values_.pop();
506 return text_value;
507 }
508
509 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
ProduceTextTokenIterators(QueryTerm text_value)510 QueryVisitor::ProduceTextTokenIterators(QueryTerm text_value) {
511 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr,
512 tokenizer_.Tokenize(text_value.term));
513 std::string normalized_term;
514 std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
515 // raw_text is the portion of text_value.raw_term that hasn't yet been
516 // matched to any of the tokens that we've processed. escaped_token will
517 // hold the portion of raw_text that corresponds to the current token that
518 // is being processed.
519 std::string_view raw_text = text_value.raw_term;
520 std::string_view raw_token;
521 bool reached_final_token = !token_itr->Advance();
522 // If the term is different then the raw_term, then there must have been some
523 // escaped characters that we will need to handle.
524 while (!reached_final_token) {
525 std::vector<Token> tokens = token_itr->GetTokens();
526 if (tokens.size() > 1) {
527 // The tokenizer iterator iterates between token groups. In practice,
528 // the tokenizer used with QueryVisitor (PlainTokenizer) will always
529 // only produce a single token per token group.
530 return absl_ports::InvalidArgumentError(
531 "Encountered unexpected token group with >1 tokens.");
532 }
533
534 reached_final_token = !token_itr->Advance();
535 const Token& token = tokens.at(0);
536 if (reached_final_token && token.text.length() == raw_text.length()) {
537 // Unescaped tokens are strictly smaller than their escaped counterparts
538 // This means that if we're at the final token and token.length equals
539 // raw_text, then all of raw_text must correspond to this token.
540 raw_token = raw_text;
541 } else {
542 ICING_ASSIGN_OR_RETURN(
543 raw_token, string_util::FindEscapedToken(raw_text, token.text));
544 }
545 normalized_term = normalizer_.NormalizeTerm(token.text);
546 QueryTerm term_value{std::move(normalized_term), raw_token,
547 reached_final_token && text_value.is_prefix_val};
548 ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
549 CreateTermIterator(std::move(term_value)));
550 iterators.push_back(std::move(iterator));
551
552 // Remove escaped_token from raw_text now that we've processed
553 // raw_text.
554 const char* escaped_token_end = raw_token.data() + raw_token.length();
555 raw_text = raw_text.substr(escaped_token_end - raw_text.data());
556 }
557 // Finally, create an And Iterator. If there's only a single term here, then
558 // it will just return that term iterator. Otherwise, segmented text is
559 // treated as a group of terms AND'd together.
560 return CreateAndIterator(std::move(iterators));
561 }
562
563 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
PopPendingIterator()564 QueryVisitor::PopPendingIterator() {
565 if (pending_values_.empty() || pending_values_.top().is_placeholder()) {
566 return absl_ports::InvalidArgumentError("Unable to retrieve iterator.");
567 }
568 if (pending_values_.top().data_type() == DataType::kDocumentIterator) {
569 std::unique_ptr<DocHitInfoIterator> iterator =
570 std::move(pending_values_.top()).iterator().ValueOrDie();
571 pending_values_.pop();
572 return iterator;
573 } else if (pending_values_.top().data_type() == DataType::kString) {
574 features_.insert(kVerbatimSearchFeature);
575 ICING_ASSIGN_OR_RETURN(QueryTerm string_value, PopPendingStringValue());
576 return CreateTermIterator(std::move(string_value));
577 } else {
578 ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
579 return ProduceTextTokenIterators(std::move(text_value));
580 }
581 }
582
583 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
PopAllPendingIterators()584 QueryVisitor::PopAllPendingIterators() {
585 std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
586 while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) {
587 ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> itr,
588 PopPendingIterator());
589 iterators.push_back(std::move(itr));
590 }
591 if (pending_values_.empty()) {
592 return absl_ports::InvalidArgumentError(
593 "Unable to retrieve expected iterators.");
594 }
595 // Iterators will be in reverse order because we retrieved them from the
596 // stack. Reverse them to get back to the original ordering.
597 std::reverse(iterators.begin(), iterators.end());
598 return iterators;
599 }
600
ProcessNumericComparator(const NaryOperatorNode * node)601 libtextclassifier3::Status QueryVisitor::ProcessNumericComparator(
602 const NaryOperatorNode* node) {
603 if (node->children().size() != 2) {
604 return absl_ports::InvalidArgumentError("Expected 2 children.");
605 }
606
607 // 1. Put in a placeholder PendingValue
608 pending_values_.push(PendingValue());
609
610 // 2. The first child is the property to restrict by.
611 node->children().at(0)->Accept(this);
612 if (has_pending_error()) {
613 return std::move(pending_error_);
614 }
615 ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
616
617 if (text_value.is_prefix_val) {
618 return absl_ports::InvalidArgumentError(
619 "Cannot use prefix operator '*' with a property name!");
620 }
621
622 // If there is an active property restrict and this property is not present in
623 // in the active restrict set, then it's not satisfiable.
624 if (pending_property_restricts_.has_active_property_restricts() &&
625 pending_property_restricts_.active_property_restricts().find(
626 text_value.term) ==
627 pending_property_restricts_.active_property_restricts().end()) {
628 // The property restrict can't be satisfiable. Pop the placeholder that was
629 // just added and push a FALSE iterator.
630 pending_property_restricts_.PopRestricts();
631 pending_values_.pop();
632 pending_values_.push(
633 PendingValue(std::make_unique<DocHitInfoIteratorNone>()));
634 return libtextclassifier3::Status::OK;
635 }
636
637 // 3. The second child should be parseable as an integer value.
638 expecting_numeric_arg_ = true;
639 node->children().at(1)->Accept(this);
640 expecting_numeric_arg_ = false;
641 ICING_ASSIGN_OR_RETURN(int64_t int_value, PopPendingIntValue());
642
643 // 4. Check for the placeholder.
644 if (!pending_values_.top().is_placeholder()) {
645 return absl_ports::InvalidArgumentError(
646 "Error processing arguments for node.");
647 }
648 pending_values_.pop();
649
650 // 5. Create the iterator and push it onto pending_values_.
651 ICING_ASSIGN_OR_RETURN(Int64Range range,
652 GetInt64Range(node->operator_text(), int_value));
653 ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
654 numeric_index_.GetIterator(
655 text_value.term, range.low, range.high,
656 document_store_, schema_store_, current_time_ms_));
657
658 features_.insert(kNumericSearchFeature);
659 pending_values_.push(PendingValue(std::move(iterator)));
660 return libtextclassifier3::Status::OK;
661 }
662
ProcessAndOperator(const NaryOperatorNode * node)663 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessAndOperator(
664 const NaryOperatorNode* node) {
665 ICING_ASSIGN_OR_RETURN(
666 std::vector<std::unique_ptr<DocHitInfoIterator>> iterators,
667 PopAllPendingIterators());
668 return PendingValue(CreateAndIterator(std::move(iterators)));
669 }
670
ProcessOrOperator(const NaryOperatorNode * node)671 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessOrOperator(
672 const NaryOperatorNode* node) {
673 ICING_ASSIGN_OR_RETURN(
674 std::vector<std::unique_ptr<DocHitInfoIterator>> iterators,
675 PopAllPendingIterators());
676 return PendingValue(CreateOrIterator(std::move(iterators)));
677 }
678
ProcessNegationOperator(const UnaryOperatorNode * node)679 libtextclassifier3::Status QueryVisitor::ProcessNegationOperator(
680 const UnaryOperatorNode* node) {
681 // 1. Put in a placeholder PendingValue
682 pending_values_.push(PendingValue());
683
684 // 2. Visit child
685 node->child()->Accept(this);
686 if (has_pending_error()) {
687 return std::move(pending_error_);
688 }
689
690 if (pending_values_.size() < 2) {
691 return absl_ports::InvalidArgumentError(
692 "Visit unary operator child didn't correctly add pending values.");
693 }
694
695 // 3. We want to preserve the original text of the numeric value, append our
696 // minus to the text. It will be parsed as either an int or a double later.
697 ICING_ASSIGN_OR_RETURN(QueryTerm numeric_text_val, PopPendingTextValue());
698 numeric_text_val.term = absl_ports::StrCat("-", numeric_text_val.term);
699 PendingValue pending_value =
700 PendingValue::CreateTextPendingValue(std::move(numeric_text_val));
701
702 // We've parsed our numeric value successfully. Pop our placeholder, push it
703 // on to the stack and return successfully.
704 if (!pending_values_.top().is_placeholder()) {
705 return absl_ports::InvalidArgumentError(
706 "Error processing arguments for node.");
707 }
708 pending_values_.pop();
709 pending_values_.push(std::move(pending_value));
710 return libtextclassifier3::Status::OK;
711 }
712
ProcessNotOperator(const UnaryOperatorNode * node)713 libtextclassifier3::Status QueryVisitor::ProcessNotOperator(
714 const UnaryOperatorNode* node) {
715 // TODO(b/265312785) Consider implementing query optimization when we run into
716 // nested NOTs. This would allow us to simplify a query like "NOT (-foo)" to
717 // just "foo". This would also require more complicate rewrites as we would
718 // need to do things like rewrite "NOT (-a OR b)" as "a AND -b" and
719 // "NOT (price < 5)" as "price >= 5".
720 // 1. Put in a placeholder PendingValue
721 pending_values_.push(PendingValue());
722 // Toggle whatever the current value of 'processing_not_' is before visiting
723 // the children.
724 processing_not_ = !processing_not_;
725
726 // 2. Visit child
727 node->child()->Accept(this);
728 if (has_pending_error()) {
729 return std::move(pending_error_);
730 }
731
732 if (pending_values_.size() < 2) {
733 return absl_ports::InvalidArgumentError(
734 "Visit unary operator child didn't correctly add pending values.");
735 }
736
737 // 3. Retrieve the delegate iterator
738 ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate,
739 PopPendingIterator());
740
741 // 4. Check for the placeholder.
742 if (!pending_values_.top().is_placeholder()) {
743 return absl_ports::InvalidArgumentError(
744 "Error processing arguments for node.");
745 }
746 pending_values_.pop();
747
748 pending_values_.push(PendingValue(std::make_unique<DocHitInfoIteratorNot>(
749 std::move(delegate), document_store_.last_added_document_id())));
750
751 // Untoggle whatever the current value of 'processing_not_' is now that we've
752 // finished processing this NOT.
753 processing_not_ = !processing_not_;
754 return libtextclassifier3::Status::OK;
755 }
756
ProcessHasOperator(const NaryOperatorNode * node)757 libtextclassifier3::Status QueryVisitor::ProcessHasOperator(
758 const NaryOperatorNode* node) {
759 if (node->children().size() != 2) {
760 return absl_ports::InvalidArgumentError("Expected 2 children.");
761 }
762
763 // 1. Put in a placeholder PendingValue
764 pending_values_.push(PendingValue());
765
766 // 2. Visit the first child - the property.
767 node->children().at(0)->Accept(this);
768 if (has_pending_error()) {
769 return pending_error_;
770 }
771 ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
772 if (text_value.is_prefix_val) {
773 return absl_ports::InvalidArgumentError(
774 "Cannot use prefix operator '*' with a property name!");
775 }
776 pending_property_restricts_.AddValidRestricts({text_value.term});
777
778 // Just added a restrict - if there are no active property restricts then that
779 // be because this restrict is unsatisfiable.
780 if (pending_property_restricts_.active_property_restricts().empty()) {
781 // The property restrict can't be satisfiable. Pop the placeholder that was
782 // just added and push a FALSE iterator.
783 pending_property_restricts_.PopRestricts();
784 pending_values_.pop();
785 pending_values_.push(
786 PendingValue(std::make_unique<DocHitInfoIteratorNone>()));
787 return libtextclassifier3::Status::OK;
788 }
789
790 // 3. Visit the second child - the argument.
791 node->children().at(1)->Accept(this);
792 if (has_pending_error()) {
793 return pending_error_;
794 }
795 ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate,
796 PopPendingIterator());
797
798 // 4. Check for the placeholder.
799 if (!pending_values_.top().is_placeholder()) {
800 return absl_ports::InvalidArgumentError(
801 "Error processing arguments for node.");
802 }
803 pending_values_.pop();
804 pending_property_restricts_.PopRestricts();
805
806 std::set<std::string> property_restricts = {std::move(text_value.term)};
807 pending_values_.push(
808 PendingValue(DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
809 std::move(delegate), &document_store_, &schema_store_,
810 std::move(property_restricts), current_time_ms_)));
811 return libtextclassifier3::Status::OK;
812 }
813
VisitFunctionName(const FunctionNameNode * node)814 void QueryVisitor::VisitFunctionName(const FunctionNameNode* node) {
815 pending_error_ = absl_ports::UnimplementedError(
816 "Function Name node visiting not implemented yet.");
817 }
818
VisitString(const StringNode * node)819 void QueryVisitor::VisitString(const StringNode* node) {
820 // A STRING node can only be a term. Create the iterator now.
821 auto unescaped_string_or = string_util::UnescapeStringValue(node->value());
822 if (!unescaped_string_or.ok()) {
823 pending_error_ = std::move(unescaped_string_or).status();
824 return;
825 }
826 std::string unescaped_string = std::move(unescaped_string_or).ValueOrDie();
827 QueryTerm val{std::move(unescaped_string), node->raw_value(),
828 node->is_prefix()};
829 pending_values_.push(PendingValue::CreateStringPendingValue(std::move(val)));
830 }
831
VisitText(const TextNode * node)832 void QueryVisitor::VisitText(const TextNode* node) {
833 // TEXT nodes could either be a term (and will become DocHitInfoIteratorTerm)
834 // or a property name. As such, we just push the TEXT value into pending
835 // values and determine which it is at a later point.
836 QueryTerm val{std::move(node->value()), node->raw_value(), node->is_prefix()};
837 pending_values_.push(PendingValue::CreateTextPendingValue(std::move(val)));
838 }
839
VisitMember(const MemberNode * node)840 void QueryVisitor::VisitMember(const MemberNode* node) {
841 if (node->children().empty()) {
842 pending_error_ =
843 absl_ports::InvalidArgumentError("Encountered malformed member node.");
844 return;
845 }
846
847 // 1. Put in a placeholder PendingValue
848 pending_values_.push(PendingValue());
849
850 // 2. Visit the children.
851 for (const std::unique_ptr<TextNode>& child : node->children()) {
852 child->Accept(this);
853 if (has_pending_error()) {
854 return;
855 }
856 }
857
858 // 3. Now process the results of the children and produce a single pending
859 // value representing this member.
860 PendingValue pending_value;
861 if (node->children().size() == 1) {
862 // 3a. This member only has a single child, then the pending value produced
863 // by that child is the final value produced by this member.
864 pending_value = std::move(pending_values_.top());
865 pending_values_.pop();
866 } else {
867 // 3b. Retrieve the values of all children and concatenate them into a
868 // single value.
869 libtextclassifier3::StatusOr<QueryTerm> member_or;
870 std::vector<std::string> members;
871 QueryTerm text_val;
872 const char* start = nullptr;
873 const char* end = nullptr;
874 while (!pending_values_.empty() &&
875 !pending_values_.top().is_placeholder()) {
876 member_or = PopPendingTextValue();
877 if (!member_or.ok()) {
878 pending_error_ = std::move(member_or).status();
879 return;
880 }
881 text_val = std::move(member_or).ValueOrDie();
882 if (text_val.is_prefix_val) {
883 pending_error_ = absl_ports::InvalidArgumentError(
884 "Cannot use prefix operator '*' within a property name!");
885 return;
886 }
887 if (start == nullptr) {
888 start = text_val.raw_term.data();
889 end = text_val.raw_term.data() + text_val.raw_term.length();
890 } else {
891 start = std::min(start, text_val.raw_term.data());
892 end = std::max(end,
893 text_val.raw_term.data() + text_val.raw_term.length());
894 }
895 members.push_back(std::move(text_val.term));
896 }
897 QueryTerm member;
898 member.term = absl_ports::StrJoin(members.rbegin(), members.rend(),
899 property_util::kPropertyPathSeparator);
900 member.raw_term = std::string_view(start, end - start);
901 member.is_prefix_val = false;
902 pending_value = PendingValue::CreateTextPendingValue(std::move(member));
903 }
904
905 // 4. If pending_values_ is empty somehow, then our placeholder disappeared
906 // somehow.
907 if (pending_values_.empty()) {
908 pending_error_ = absl_ports::InvalidArgumentError(
909 "Error processing arguments for member node.");
910 return;
911 }
912 pending_values_.pop();
913
914 pending_values_.push(std::move(pending_value));
915 }
916
VisitFunction(const FunctionNode * node)917 void QueryVisitor::VisitFunction(const FunctionNode* node) {
918 // 1. Get the associated function.
919 auto itr = registered_functions_.find(node->function_name()->value());
920 if (itr == registered_functions_.end()) {
921 pending_error_ = absl_ports::InvalidArgumentError(absl_ports::StrCat(
922 "Function ", node->function_name()->value(), " is not supported."));
923 return;
924 }
925 const Function& function = itr->second;
926
927 // 2. Put in a placeholder PendingValue
928 pending_values_.push(PendingValue());
929
930 // 3. Visit the children.
931 expecting_numeric_arg_ = true;
932 for (int i = 0; i < node->args().size(); ++i) {
933 const std::unique_ptr<Node>& arg = node->args()[i];
934 libtextclassifier3::StatusOr<DataType> arg_type_or =
935 function.get_param_type(i);
936 bool current_level_expecting_numeric_arg = expecting_numeric_arg_;
937 // If arg_type_or has an error, we should ignore it for now, since
938 // function.Eval should do the type check and return better error messages.
939 if (arg_type_or.ok() && (arg_type_or.ValueOrDie() == DataType::kLong ||
940 arg_type_or.ValueOrDie() == DataType::kDouble)) {
941 expecting_numeric_arg_ = true;
942 }
943 arg->Accept(this);
944 expecting_numeric_arg_ = current_level_expecting_numeric_arg;
945 if (has_pending_error()) {
946 return;
947 }
948 }
949
950 // 4. Collect the arguments and evaluate the function.
951 std::vector<PendingValue> args;
952 while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) {
953 args.push_back(std::move(pending_values_.top()));
954 pending_values_.pop();
955 }
956 std::reverse(args.begin(), args.end());
957 auto eval_result = function.Eval(std::move(args));
958 if (!eval_result.ok()) {
959 pending_error_ = std::move(eval_result).status();
960 return;
961 }
962
963 // 5. Pop placeholder in pending_values and add the result of our function.
964 pending_values_.pop();
965 pending_values_.push(std::move(eval_result).ValueOrDie());
966
967 // Support for custom functions was added in list filters.
968 features_.insert(kListFilterQueryLanguageFeature);
969 }
970
971 // TODO(b/265312785) Clarify handling of the interaction between HAS and NOT.
972 // Currently, `prop1:(NOT foo bar)` will not match any documents. Likewise,
973 // `search("NOT foo bar", createList("prop1"))` will not match any documents.
974 //
975 // We should either confirm that this is the desired behavior or consider
976 // rewriting these queries so that they're interpreted as
977 // `NOT prop1:foo AND prop1:bar` and
978 // `NOT search("foo", createList("prop1"))
979 // AND search("bar", createList("prop1"))`
VisitUnaryOperator(const UnaryOperatorNode * node)980 void QueryVisitor::VisitUnaryOperator(const UnaryOperatorNode* node) {
981 bool is_minus = node->operator_text() == "MINUS";
982 if (node->operator_text() != "NOT" && !is_minus) {
983 pending_error_ = absl_ports::UnimplementedError(
984 absl_ports::StrCat("Visiting for unary operator ",
985 node->operator_text(), " not implemented yet."));
986 return;
987 }
988
989 libtextclassifier3::Status status;
990 if (expecting_numeric_arg_ && is_minus) {
991 // If the operator is a MINUS ('-') and we're at the child of a numeric
992 // comparator, then this must be a negation ('-3')
993 status = ProcessNegationOperator(node);
994 } else {
995 status = ProcessNotOperator(node);
996 }
997
998 if (!status.ok()) {
999 pending_error_ = std::move(status);
1000 }
1001
1002 if (!is_minus ||
1003 pending_property_restricts_.has_active_property_restricts() ||
1004 processing_not_) {
1005 // 'NOT' operator was added in list filters.
1006 // Likewise, mixing property restricts and NOTs were made valid in list
1007 // filters.
1008 features_.insert(kListFilterQueryLanguageFeature);
1009 }
1010 }
1011
VisitNaryOperator(const NaryOperatorNode * node)1012 void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) {
1013 if (!IsSupportedNaryOperator(node->operator_text())) {
1014 pending_error_ = absl_ports::UnimplementedError(
1015 "No support for any non-numeric operators.");
1016 return;
1017 }
1018
1019 if (pending_property_restricts_.has_active_property_restricts() ||
1020 processing_not_) {
1021 // Likewise, mixing property restricts and NOT with compound statements was
1022 // added in list filters.
1023 features_.insert(kListFilterQueryLanguageFeature);
1024 }
1025
1026 if (node->operator_text() == ":") {
1027 libtextclassifier3::Status status = ProcessHasOperator(node);
1028 if (!status.ok()) {
1029 pending_error_ = std::move(status);
1030 }
1031 return;
1032 } else if (IsNumericComparator(node->operator_text())) {
1033 libtextclassifier3::Status status = ProcessNumericComparator(node);
1034 if (!status.ok()) {
1035 pending_error_ = std::move(status);
1036 }
1037 return;
1038 }
1039
1040 // 1. Put in a placeholder PendingValue
1041 pending_values_.push(PendingValue());
1042
1043 // 2. Visit the children.
1044 for (int i = 0; i < node->children().size(); ++i) {
1045 node->children().at(i)->Accept(this);
1046 if (has_pending_error()) {
1047 return;
1048 }
1049 }
1050
1051 // 3. Retrieve the pending value for this node.
1052 libtextclassifier3::StatusOr<PendingValue> pending_value_or;
1053 if (node->operator_text() == "AND") {
1054 pending_value_or = ProcessAndOperator(node);
1055 } else if (node->operator_text() == "OR") {
1056 pending_value_or = ProcessOrOperator(node);
1057 }
1058 if (!pending_value_or.ok()) {
1059 pending_error_ = std::move(pending_value_or).status();
1060 return;
1061 }
1062 PendingValue pending_value = std::move(pending_value_or).ValueOrDie();
1063
1064 // 4. Check for the placeholder.
1065 if (!pending_values_.top().is_placeholder()) {
1066 pending_error_ = absl_ports::InvalidArgumentError(
1067 "Error processing arguments for node.");
1068 return;
1069 }
1070 pending_values_.pop();
1071
1072 pending_values_.push(std::move(pending_value));
1073 }
1074
ConsumeResults()1075 libtextclassifier3::StatusOr<QueryResults> QueryVisitor::ConsumeResults() && {
1076 if (has_pending_error()) {
1077 return std::move(pending_error_);
1078 }
1079 if (pending_values_.size() != 1) {
1080 return absl_ports::InvalidArgumentError(
1081 "Visitor does not contain a single root iterator.");
1082 }
1083 auto iterator_or = PopPendingIterator();
1084 if (!iterator_or.ok()) {
1085 return std::move(iterator_or).status();
1086 }
1087
1088 QueryResults results;
1089 results.root_iterator = std::move(iterator_or).ValueOrDie();
1090 results.query_term_iterators = std::move(query_term_iterators_);
1091 results.query_terms = std::move(property_query_terms_map_);
1092 results.embedding_query_results = std::move(embedding_query_results_);
1093 results.features_in_use = std::move(features_);
1094 return results;
1095 }
1096
1097 } // namespace lib
1098 } // namespace icing
1099