• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/query/advanced_query_parser/query-visitor.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <iterator>
20 #include <limits>
21 #include <memory>
22 #include <set>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <utility>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/absl_ports/str_join.h"
34 #include "icing/index/embed/doc-hit-info-iterator-embedding.h"
35 #include "icing/index/embed/embedding-query-results.h"
36 #include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
37 #include "icing/index/iterator/doc-hit-info-iterator-and.h"
38 #include "icing/index/iterator/doc-hit-info-iterator-filter.h"
39 #include "icing/index/iterator/doc-hit-info-iterator-none.h"
40 #include "icing/index/iterator/doc-hit-info-iterator-not.h"
41 #include "icing/index/iterator/doc-hit-info-iterator-or.h"
42 #include "icing/index/iterator/doc-hit-info-iterator-property-in-document.h"
43 #include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h"
44 #include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
45 #include "icing/index/iterator/doc-hit-info-iterator.h"
46 #include "icing/index/iterator/section-restrict-data.h"
47 #include "icing/index/property-existence-indexing-handler.h"
48 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
49 #include "icing/query/advanced_query_parser/function.h"
50 #include "icing/query/advanced_query_parser/lexer.h"
51 #include "icing/query/advanced_query_parser/param.h"
52 #include "icing/query/advanced_query_parser/parser.h"
53 #include "icing/query/advanced_query_parser/pending-value.h"
54 #include "icing/query/advanced_query_parser/util/string-util.h"
55 #include "icing/query/query-features.h"
56 #include "icing/query/query-results.h"
57 #include "icing/schema/property-util.h"
58 #include "icing/schema/schema-store.h"
59 #include "icing/schema/section.h"
60 #include "icing/tokenization/token.h"
61 #include "icing/tokenization/tokenizer.h"
62 #include "icing/util/embedding-util.h"
63 #include "icing/util/status-macros.h"
64 
65 namespace icing {
66 namespace lib {
67 
68 namespace {
69 
70 struct CreateList {
operator ()icing::lib::__anone73a13e90111::CreateList71   libtextclassifier3::StatusOr<PendingValue> operator()(
72       std::vector<PendingValue>&& args) const {
73     std::vector<std::string> values;
74     values.reserve(args.size());
75     for (PendingValue& arg : args) {
76       QueryTerm string_val = std::move(arg).string_val().ValueOrDie();
77       values.push_back(std::move(string_val.term));
78     }
79     return PendingValue(std::move(values));
80   }
81 };
82 
IsNumericComparator(std::string_view operator_text)83 bool IsNumericComparator(std::string_view operator_text) {
84   if (operator_text.length() < 1 || operator_text.length() > 2) {
85     return false;
86   }
87   // TODO(tjbarron) decide how/if to support !=
88   return operator_text == "<" || operator_text == ">" ||
89          operator_text == "==" || operator_text == "<=" ||
90          operator_text == ">=";
91 }
92 
IsSupportedNaryOperator(std::string_view operator_text)93 bool IsSupportedNaryOperator(std::string_view operator_text) {
94   return IsNumericComparator(operator_text) || operator_text == "AND" ||
95          operator_text == "OR" || operator_text == ":";
96 }
97 
98 struct Int64Range {
99   int64_t low;
100   int64_t high;
101 };
102 
GetInt64Range(std::string_view operator_text,int64_t int_value)103 libtextclassifier3::StatusOr<Int64Range> GetInt64Range(
104     std::string_view operator_text, int64_t int_value) {
105   Int64Range range = {std::numeric_limits<int64_t>::min(),
106                       std::numeric_limits<int64_t>::max()};
107   if (operator_text == "<") {
108     if (int_value == std::numeric_limits<int64_t>::min()) {
109       return absl_ports::InvalidArgumentError(
110           "Cannot specify < INT64_MIN in query expression.");
111     }
112     range.high = int_value - 1;
113   } else if (operator_text == "<=") {
114     range.high = int_value;
115   } else if (operator_text == "==") {
116     range.high = int_value;
117     range.low = int_value;
118   } else if (operator_text == ">=") {
119     range.low = int_value;
120   } else if (operator_text == ">") {
121     if (int_value == std::numeric_limits<int64_t>::max()) {
122       return absl_ports::InvalidArgumentError(
123           "Cannot specify > INT64_MAX in query expression.");
124     }
125     range.low = int_value + 1;
126   }
127   return range;
128 }
129 
130 }  // namespace
131 
AddValidRestricts(std::set<std::string> new_restricts)132 void QueryVisitor::PendingPropertyRestricts::AddValidRestricts(
133     std::set<std::string> new_restricts) {
134   if (!has_active_property_restricts()) {
135     pending_property_restricts_.push_back(std::move(new_restricts));
136     return;
137   }
138 
139   // There is an active property restrict already in effect. To determine the
140   // updated active property restrict being applied at this level, we need to
141   // calculate the intersection of new_restricts and
142   // active_property_restricts.
143   const std::set<std::string>& active_restricts = active_property_restricts();
144   auto active_restricts_itr = active_restricts.begin();
145   for (auto new_restricts_itr = new_restricts.begin();
146        new_restricts_itr != new_restricts.end();) {
147     while (active_restricts_itr != active_restricts.end() &&
148            *active_restricts_itr < *new_restricts_itr) {
149       // new_restricts_itr is behind active_restricts_itr.
150       ++active_restricts_itr;
151     }
152     if (active_restricts_itr == active_restricts.end()) {
153       // There's nothing left in active restricts. Everything at
154       // new_restricts_itr and beyond should be removed
155       new_restricts_itr =
156           new_restricts.erase(new_restricts_itr, new_restricts.end());
157     } else if (*active_restricts_itr > *new_restricts_itr) {
158       // new_restricts_itr points to elements not present in
159       // active_restricts_itr
160       new_restricts_itr = new_restricts.erase(new_restricts_itr);
161     } else {
162       // the element that new_restricts_itr points to is present in
163       // active_restricts_itr.
164       ++new_restricts_itr;
165     }
166   }
167   pending_property_restricts_.push_back(std::move(new_restricts));
168 }
169 
170 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
CreateTermIterator(const QueryTerm & query_term)171 QueryVisitor::CreateTermIterator(const QueryTerm& query_term) {
172   if (query_term.is_prefix_val) {
173     // '*' prefix operator was added in list filters
174     features_.insert(kListFilterQueryLanguageFeature);
175   }
176   TermMatchType::Code match_type = GetTermMatchType(query_term.is_prefix_val);
177   int unnormalized_term_start =
178       query_term.raw_term.data() - raw_query_text_.data();
179   if (!processing_not_) {
180     // 1. Add term to property_query_terms_map
181     if (pending_property_restricts_.has_active_property_restricts()) {
182       for (const std::string& property_restrict :
183            pending_property_restricts_.active_property_restricts()) {
184         property_query_terms_map_[property_restrict].insert(query_term.term);
185       }
186     } else {
187       property_query_terms_map_[""].insert(query_term.term);
188     }
189 
190     // 2. If needed add term iterator to query_term_iterators_ map.
191     if (needs_term_frequency_info_) {
192       ICING_ASSIGN_OR_RETURN(
193           std::unique_ptr<DocHitInfoIterator> term_iterator,
194           index_.GetIterator(query_term.term, unnormalized_term_start,
195                              query_term.raw_term.length(), kSectionIdMaskAll,
196                              match_type_, needs_term_frequency_info_));
197       query_term_iterators_[query_term.term] =
198           std::make_unique<DocHitInfoIteratorFilter>(
199               std::move(term_iterator), &document_store_, &schema_store_,
200               filter_options_, current_time_ms_);
201     }
202   }
203 
204   // 3. Add the term iterator.
205   return index_.GetIterator(query_term.term, unnormalized_term_start,
206                             query_term.raw_term.length(), kSectionIdMaskAll,
207                             match_type, needs_term_frequency_info_);
208 }
209 
RegisterFunctions()210 void QueryVisitor::RegisterFunctions() {
211   // std::vector<std::string> createList(std::string...);
212   Function create_list_function_ =
213       Function::Create(DataType::kStringList, "createList",
214                        {Param(DataType::kString, Cardinality::kRequired),
215                         Param(DataType::kString, Cardinality::kVariable)},
216                        CreateList())
217           .ValueOrDie();
218   registered_functions_.insert(
219       {create_list_function_.name(), std::move(create_list_function_)});
220 
221   // DocHitInfoIterator search(std::string);
222   // DocHitInfoIterator search(std::string, std::vector<std::string>);
223   auto search_eval = [this](std::vector<PendingValue>&& args) {
224     return this->SearchFunction(std::move(args));
225   };
226   Function search_function =
227       Function::Create(DataType::kDocumentIterator, "search",
228                        {Param(DataType::kString),
229                         Param(DataType::kStringList, Cardinality::kOptional)},
230                        std::move(search_eval))
231           .ValueOrDie();
232   registered_functions_.insert(
233       {search_function.name(), std::move(search_function)});
234 
235   // DocHitInfoIterator propertyDefined(std::string);
236   auto property_defined = [this](std::vector<PendingValue>&& args) {
237     return this->PropertyDefinedFunction(std::move(args));
238   };
239   Function property_defined_function =
240       Function::Create(DataType::kDocumentIterator, "propertyDefined",
241                        {Param(DataType::kString)}, std::move(property_defined))
242           .ValueOrDie();
243   registered_functions_.insert(
244       {property_defined_function.name(), std::move(property_defined_function)});
245 
246   // DocHitInfoIterator hasProperty(std::string);
247   auto has_property = [this](std::vector<PendingValue>&& args) {
248     return this->HasPropertyFunction(std::move(args));
249   };
250   Function has_property_function =
251       Function::Create(DataType::kDocumentIterator, "hasProperty",
252                        {Param(DataType::kString)}, std::move(has_property))
253           .ValueOrDie();
254   registered_functions_.insert(
255       {has_property_function.name(), std::move(has_property_function)});
256 
257   // vector_index getSearchSpecEmbedding(long);
258   auto get_search_spec_embedding = [](std::vector<PendingValue>&& args) {
259     return PendingValue::CreateVectorIndexPendingValue(
260         args.at(0).long_val().ValueOrDie());
261   };
262   Function get_search_spec_embedding_function =
263       Function::Create(DataType::kVectorIndex, "getSearchSpecEmbedding",
264                        {Param(DataType::kLong)},
265                        std::move(get_search_spec_embedding))
266           .ValueOrDie();
267   registered_functions_.insert({get_search_spec_embedding_function.name(),
268                                 std::move(get_search_spec_embedding_function)});
269 
270   // DocHitInfoIterator semanticSearch(vector_index, double, double, string);
271   auto semantic_search = [this](std::vector<PendingValue>&& args) {
272     return this->SemanticSearchFunction(std::move(args));
273   };
274   Function semantic_search_function =
275       Function::Create(DataType::kDocumentIterator, "semanticSearch",
276                        {Param(DataType::kVectorIndex),
277                         Param(DataType::kDouble, Cardinality::kOptional),
278                         Param(DataType::kDouble, Cardinality::kOptional),
279                         Param(DataType::kString, Cardinality::kOptional)},
280                        std::move(semantic_search))
281           .ValueOrDie();
282   registered_functions_.insert(
283       {semantic_search_function.name(), std::move(semantic_search_function)});
284 
285   // DocHitInfoIterator tokenize(std::string);
286   auto tokenize = [this](std::vector<PendingValue>&& args) {
287     return this->TokenizeFunction(std::move(args));
288   };
289   Function tokenize_function =
290       Function::Create(DataType::kDocumentIterator, "tokenize",
291                        {Param(DataType::kString)}, std::move(tokenize))
292           .ValueOrDie();
293   registered_functions_.insert(
294       {tokenize_function.name(), std::move(tokenize_function)});
295 }
296 
SearchFunction(std::vector<PendingValue> && args)297 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction(
298     std::vector<PendingValue>&& args) {
299   // The second arg (if present) is a list of sections to restrict to.
300   if (args.size() == 2) {
301     std::set<std::string> new_restricts;
302     std::vector<std::string> property_restricts =
303         std::move(args.at(1)).string_vals().ValueOrDie();
304     for (std::string& property_restrict : property_restricts) {
305       new_restricts.insert(std::move(property_restrict));
306     }
307     pending_property_restricts_.AddValidRestricts(std::move(new_restricts));
308     if (pending_property_restricts_.active_property_restricts().empty()) {
309       pending_property_restricts_.PopRestricts();
310       return PendingValue(std::make_unique<DocHitInfoIteratorNone>());
311     }
312   }
313 
314   // The first arg is guaranteed to be a STRING at this point. It should be safe
315   // to call ValueOrDie.
316   const QueryTerm* query = args.at(0).string_val().ValueOrDie();
317   Lexer lexer(query->term, Lexer::Language::QUERY);
318   ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
319                          lexer.ExtractTokens());
320 
321   Parser parser = Parser::Create(std::move(lexer_tokens));
322   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> tree_root,
323                          parser.ConsumeQuery());
324 
325   std::unique_ptr<DocHitInfoIterator> iterator;
326   QueryResults query_result;
327   if (tree_root == nullptr) {
328     iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
329         document_store_.last_added_document_id());
330   } else {
331     QueryVisitor query_visitor(
332         &index_, &numeric_index_, &embedding_index_, &document_store_,
333         &schema_store_, &normalizer_, &tokenizer_, query->raw_term,
334         embedding_query_vectors_, filter_options_, match_type_,
335         embedding_query_metric_type_, needs_term_frequency_info_,
336         pending_property_restricts_, processing_not_, current_time_ms_);
337     tree_root->Accept(&query_visitor);
338     ICING_ASSIGN_OR_RETURN(query_result,
339                            std::move(query_visitor).ConsumeResults());
340     iterator = std::move(query_result.root_iterator);
341   }
342 
343   // Update members based on results of processing the query.
344   if (args.size() == 2 &&
345       pending_property_restricts_.has_active_property_restricts()) {
346     iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
347         std::move(iterator), &document_store_, &schema_store_,
348         pending_property_restricts_.active_property_restricts(),
349         current_time_ms_);
350     pending_property_restricts_.PopRestricts();
351   }
352   if (!processing_not_) {
353     std::move(
354         query_result.query_term_iterators.begin(),
355         query_result.query_term_iterators.end(),
356         std::inserter(query_term_iterators_, query_term_iterators_.end()));
357 
358     std::move(query_result.query_terms.begin(), query_result.query_terms.end(),
359               std::inserter(property_query_terms_map_,
360                             property_query_terms_map_.end()));
361   }
362   std::move(query_result.features_in_use.begin(),
363             query_result.features_in_use.end(),
364             std::inserter(features_, features_.end()));
365   return PendingValue(std::move(iterator));
366 }
367 
368 libtextclassifier3::StatusOr<PendingValue>
PropertyDefinedFunction(std::vector<PendingValue> && args)369 QueryVisitor::PropertyDefinedFunction(std::vector<PendingValue>&& args) {
370   // The first arg is guaranteed to be a STRING at this point. It should be safe
371   // to call ValueOrDie.
372   const QueryTerm* member = args.at(0).string_val().ValueOrDie();
373 
374   std::unique_ptr<DocHitInfoIterator> all_docs_iterator =
375       std::make_unique<DocHitInfoIteratorAllDocumentId>(
376           document_store_.last_added_document_id());
377 
378   std::set<std::string> target_sections = {std::move(member->term)};
379   std::unique_ptr<DocHitInfoIterator> property_in_schema_iterator =
380       std::make_unique<DocHitInfoIteratorPropertyInSchema>(
381           std::move(all_docs_iterator), &document_store_, &schema_store_,
382           std::move(target_sections), current_time_ms_);
383 
384   features_.insert(kListFilterQueryLanguageFeature);
385 
386   return PendingValue(std::move(property_in_schema_iterator));
387 }
388 
HasPropertyFunction(std::vector<PendingValue> && args)389 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::HasPropertyFunction(
390     std::vector<PendingValue>&& args) {
391   // The first arg is guaranteed to be a STRING at this point. It should be safe
392   // to call ValueOrDie.
393   const std::string& property_path = args.at(0).string_val().ValueOrDie()->term;
394 
395   // Perform an exact search for the property existence metadata token.
396   ICING_ASSIGN_OR_RETURN(
397       std::unique_ptr<DocHitInfoIterator> meta_hit_iterator,
398       index_.GetIterator(
399           absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path),
400           /*term_start_index=*/0,
401           /*unnormalized_term_length=*/0, kSectionIdMaskAll,
402           TermMatchType::EXACT_ONLY,
403           /*need_hit_term_frequency=*/false));
404 
405   std::unique_ptr<DocHitInfoIterator> property_in_document_iterator =
406       std::make_unique<DocHitInfoIteratorPropertyInDocument>(
407           std::move(meta_hit_iterator));
408 
409   features_.insert(kHasPropertyFunctionFeature);
410 
411   return PendingValue(std::move(property_in_document_iterator));
412 }
413 
SemanticSearchFunction(std::vector<PendingValue> && args)414 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SemanticSearchFunction(
415     std::vector<PendingValue>&& args) {
416   features_.insert(kEmbeddingSearchFeature);
417 
418   int64_t vector_index = args.at(0).vector_index_val().ValueOrDie();
419   if (embedding_query_vectors_ == nullptr || vector_index < 0 ||
420       vector_index >= embedding_query_vectors_->size()) {
421     return absl_ports::InvalidArgumentError("Got invalid vector search index!");
422   }
423 
424   // Handle default values for the optional arguments.
425   double low = -std::numeric_limits<double>::infinity();
426   double high = std::numeric_limits<double>::infinity();
427   SearchSpecProto::EmbeddingQueryMetricType::Code metric_type =
428       embedding_query_metric_type_;
429   if (args.size() >= 2) {
430     low = args.at(1).double_val().ValueOrDie();
431   }
432   if (args.size() >= 3) {
433     high = args.at(2).double_val().ValueOrDie();
434   }
435   if (low > high) {
436     return absl_ports::InvalidArgumentError(
437         "The lower bound cannot be greater than the upper bound.");
438   }
439   if (args.size() >= 4) {
440     const std::string& metric = args.at(3).string_val().ValueOrDie()->term;
441     ICING_ASSIGN_OR_RETURN(
442         metric_type,
443         embedding_util::GetEmbeddingQueryMetricTypeFromName(metric));
444   }
445 
446   // Create SectionRestrictData for section restriction.
447   std::unique_ptr<SectionRestrictData> section_restrict_data = nullptr;
448   if (pending_property_restricts_.has_active_property_restricts()) {
449     std::unordered_map<std::string, std::set<std::string>>
450         type_property_filters;
451     type_property_filters[std::string(SchemaStore::kSchemaTypeWildcard)] =
452         pending_property_restricts_.active_property_restricts();
453     section_restrict_data = std::make_unique<SectionRestrictData>(
454         &document_store_, &schema_store_, current_time_ms_,
455         type_property_filters);
456   }
457 
458   // Create and return iterator.
459   EmbeddingQueryResults::EmbeddingQueryScoreMap* score_map =
460       &embedding_query_results_.result_scores[vector_index][metric_type];
461   ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
462                          DocHitInfoIteratorEmbedding::Create(
463                              &embedding_query_vectors_->at(vector_index),
464                              std::move(section_restrict_data), metric_type, low,
465                              high, score_map, &embedding_index_));
466   return PendingValue(std::move(iterator));
467 }
468 
TokenizeFunction(std::vector<PendingValue> && args)469 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::TokenizeFunction(
470     std::vector<PendingValue>&& args) {
471   features_.insert(kTokenizeFeature);
472 
473   QueryTerm text_value = std::move(args.at(0)).string_val().ValueOrDie();
474   text_value.is_prefix_val = false;  // the prefix operator cannot be used here.
475   ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
476                          ProduceTextTokenIterators(std::move(text_value)));
477   return PendingValue(std::move(iterator));
478 }
479 
PopPendingIntValue()480 libtextclassifier3::StatusOr<int64_t> QueryVisitor::PopPendingIntValue() {
481   if (pending_values_.empty()) {
482     return absl_ports::InvalidArgumentError("Unable to retrieve int value.");
483   }
484   ICING_ASSIGN_OR_RETURN(int64_t int_value, pending_values_.top().long_val());
485   pending_values_.pop();
486   return int_value;
487 }
488 
PopPendingStringValue()489 libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingStringValue() {
490   if (pending_values_.empty()) {
491     return absl_ports::InvalidArgumentError("Unable to retrieve string value.");
492   }
493   ICING_ASSIGN_OR_RETURN(QueryTerm string_value,
494                          std::move(pending_values_.top()).string_val());
495   pending_values_.pop();
496   return string_value;
497 }
498 
PopPendingTextValue()499 libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingTextValue() {
500   if (pending_values_.empty()) {
501     return absl_ports::InvalidArgumentError("Unable to retrieve text value.");
502   }
503   ICING_ASSIGN_OR_RETURN(QueryTerm text_value,
504                          std::move(pending_values_.top()).text_val());
505   pending_values_.pop();
506   return text_value;
507 }
508 
509 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
ProduceTextTokenIterators(QueryTerm text_value)510 QueryVisitor::ProduceTextTokenIterators(QueryTerm text_value) {
511   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr,
512                          tokenizer_.Tokenize(text_value.term));
513   std::string normalized_term;
514   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
515   // raw_text is the portion of text_value.raw_term that hasn't yet been
516   // matched to any of the tokens that we've processed. escaped_token will
517   // hold the portion of raw_text that corresponds to the current token that
518   // is being processed.
519   std::string_view raw_text = text_value.raw_term;
520   std::string_view raw_token;
521   bool reached_final_token = !token_itr->Advance();
522   // If the term is different then the raw_term, then there must have been some
523   // escaped characters that we will need to handle.
524   while (!reached_final_token) {
525     std::vector<Token> tokens = token_itr->GetTokens();
526     if (tokens.size() > 1) {
527       // The tokenizer iterator iterates between token groups. In practice,
528       // the tokenizer used with QueryVisitor (PlainTokenizer) will always
529       // only produce a single token per token group.
530       return absl_ports::InvalidArgumentError(
531           "Encountered unexpected token group with >1 tokens.");
532     }
533 
534     reached_final_token = !token_itr->Advance();
535     const Token& token = tokens.at(0);
536     if (reached_final_token && token.text.length() == raw_text.length()) {
537       // Unescaped tokens are strictly smaller than their escaped counterparts
538       // This means that if we're at the final token and token.length equals
539       // raw_text, then all of raw_text must correspond to this token.
540       raw_token = raw_text;
541     } else {
542       ICING_ASSIGN_OR_RETURN(
543           raw_token, string_util::FindEscapedToken(raw_text, token.text));
544     }
545     normalized_term = normalizer_.NormalizeTerm(token.text);
546     QueryTerm term_value{std::move(normalized_term), raw_token,
547                          reached_final_token && text_value.is_prefix_val};
548     ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
549                            CreateTermIterator(std::move(term_value)));
550     iterators.push_back(std::move(iterator));
551 
552     // Remove escaped_token from raw_text now that we've processed
553     // raw_text.
554     const char* escaped_token_end = raw_token.data() + raw_token.length();
555     raw_text = raw_text.substr(escaped_token_end - raw_text.data());
556   }
557   // Finally, create an And Iterator. If there's only a single term here, then
558   // it will just return that term iterator. Otherwise, segmented text is
559   // treated as a group of terms AND'd together.
560   return CreateAndIterator(std::move(iterators));
561 }
562 
563 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
PopPendingIterator()564 QueryVisitor::PopPendingIterator() {
565   if (pending_values_.empty() || pending_values_.top().is_placeholder()) {
566     return absl_ports::InvalidArgumentError("Unable to retrieve iterator.");
567   }
568   if (pending_values_.top().data_type() == DataType::kDocumentIterator) {
569     std::unique_ptr<DocHitInfoIterator> iterator =
570         std::move(pending_values_.top()).iterator().ValueOrDie();
571     pending_values_.pop();
572     return iterator;
573   } else if (pending_values_.top().data_type() == DataType::kString) {
574     features_.insert(kVerbatimSearchFeature);
575     ICING_ASSIGN_OR_RETURN(QueryTerm string_value, PopPendingStringValue());
576     return CreateTermIterator(std::move(string_value));
577   } else {
578     ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
579     return ProduceTextTokenIterators(std::move(text_value));
580   }
581 }
582 
583 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
PopAllPendingIterators()584 QueryVisitor::PopAllPendingIterators() {
585   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
586   while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) {
587     ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> itr,
588                            PopPendingIterator());
589     iterators.push_back(std::move(itr));
590   }
591   if (pending_values_.empty()) {
592     return absl_ports::InvalidArgumentError(
593         "Unable to retrieve expected iterators.");
594   }
595   // Iterators will be in reverse order because we retrieved them from the
596   // stack. Reverse them to get back to the original ordering.
597   std::reverse(iterators.begin(), iterators.end());
598   return iterators;
599 }
600 
ProcessNumericComparator(const NaryOperatorNode * node)601 libtextclassifier3::Status QueryVisitor::ProcessNumericComparator(
602     const NaryOperatorNode* node) {
603   if (node->children().size() != 2) {
604     return absl_ports::InvalidArgumentError("Expected 2 children.");
605   }
606 
607   // 1. Put in a placeholder PendingValue
608   pending_values_.push(PendingValue());
609 
610   // 2. The first child is the property to restrict by.
611   node->children().at(0)->Accept(this);
612   if (has_pending_error()) {
613     return std::move(pending_error_);
614   }
615   ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
616 
617   if (text_value.is_prefix_val) {
618     return absl_ports::InvalidArgumentError(
619         "Cannot use prefix operator '*' with a property name!");
620   }
621 
622   // If there is an active property restrict and this property is not present in
623   // in the active restrict set, then it's not satisfiable.
624   if (pending_property_restricts_.has_active_property_restricts() &&
625       pending_property_restricts_.active_property_restricts().find(
626           text_value.term) ==
627           pending_property_restricts_.active_property_restricts().end()) {
628     // The property restrict can't be satisfiable. Pop the placeholder that was
629     // just added and push a FALSE iterator.
630     pending_property_restricts_.PopRestricts();
631     pending_values_.pop();
632     pending_values_.push(
633         PendingValue(std::make_unique<DocHitInfoIteratorNone>()));
634     return libtextclassifier3::Status::OK;
635   }
636 
637   // 3. The second child should be parseable as an integer value.
638   expecting_numeric_arg_ = true;
639   node->children().at(1)->Accept(this);
640   expecting_numeric_arg_ = false;
641   ICING_ASSIGN_OR_RETURN(int64_t int_value, PopPendingIntValue());
642 
643   // 4. Check for the placeholder.
644   if (!pending_values_.top().is_placeholder()) {
645     return absl_ports::InvalidArgumentError(
646         "Error processing arguments for node.");
647   }
648   pending_values_.pop();
649 
650   // 5. Create the iterator and push it onto pending_values_.
651   ICING_ASSIGN_OR_RETURN(Int64Range range,
652                          GetInt64Range(node->operator_text(), int_value));
653   ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
654                          numeric_index_.GetIterator(
655                              text_value.term, range.low, range.high,
656                              document_store_, schema_store_, current_time_ms_));
657 
658   features_.insert(kNumericSearchFeature);
659   pending_values_.push(PendingValue(std::move(iterator)));
660   return libtextclassifier3::Status::OK;
661 }
662 
ProcessAndOperator(const NaryOperatorNode * node)663 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessAndOperator(
664     const NaryOperatorNode* node) {
665   ICING_ASSIGN_OR_RETURN(
666       std::vector<std::unique_ptr<DocHitInfoIterator>> iterators,
667       PopAllPendingIterators());
668   return PendingValue(CreateAndIterator(std::move(iterators)));
669 }
670 
ProcessOrOperator(const NaryOperatorNode * node)671 libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessOrOperator(
672     const NaryOperatorNode* node) {
673   ICING_ASSIGN_OR_RETURN(
674       std::vector<std::unique_ptr<DocHitInfoIterator>> iterators,
675       PopAllPendingIterators());
676   return PendingValue(CreateOrIterator(std::move(iterators)));
677 }
678 
ProcessNegationOperator(const UnaryOperatorNode * node)679 libtextclassifier3::Status QueryVisitor::ProcessNegationOperator(
680     const UnaryOperatorNode* node) {
681   // 1. Put in a placeholder PendingValue
682   pending_values_.push(PendingValue());
683 
684   // 2. Visit child
685   node->child()->Accept(this);
686   if (has_pending_error()) {
687     return std::move(pending_error_);
688   }
689 
690   if (pending_values_.size() < 2) {
691     return absl_ports::InvalidArgumentError(
692         "Visit unary operator child didn't correctly add pending values.");
693   }
694 
695   // 3. We want to preserve the original text of the numeric value, append our
696   // minus to the text. It will be parsed as either an int or a double later.
697   ICING_ASSIGN_OR_RETURN(QueryTerm numeric_text_val, PopPendingTextValue());
698   numeric_text_val.term = absl_ports::StrCat("-", numeric_text_val.term);
699   PendingValue pending_value =
700       PendingValue::CreateTextPendingValue(std::move(numeric_text_val));
701 
702   // We've parsed our numeric value successfully. Pop our placeholder, push it
703   // on to the stack and return successfully.
704   if (!pending_values_.top().is_placeholder()) {
705     return absl_ports::InvalidArgumentError(
706         "Error processing arguments for node.");
707   }
708   pending_values_.pop();
709   pending_values_.push(std::move(pending_value));
710   return libtextclassifier3::Status::OK;
711 }
712 
ProcessNotOperator(const UnaryOperatorNode * node)713 libtextclassifier3::Status QueryVisitor::ProcessNotOperator(
714     const UnaryOperatorNode* node) {
715   // TODO(b/265312785) Consider implementing query optimization when we run into
716   // nested NOTs. This would allow us to simplify a query like "NOT (-foo)" to
717   // just "foo". This would also require more complicate rewrites as we would
718   // need to do things like rewrite "NOT (-a OR b)" as "a AND -b" and
719   // "NOT (price < 5)" as "price >= 5".
720   // 1. Put in a placeholder PendingValue
721   pending_values_.push(PendingValue());
722   // Toggle whatever the current value of 'processing_not_' is before visiting
723   // the children.
724   processing_not_ = !processing_not_;
725 
726   // 2. Visit child
727   node->child()->Accept(this);
728   if (has_pending_error()) {
729     return std::move(pending_error_);
730   }
731 
732   if (pending_values_.size() < 2) {
733     return absl_ports::InvalidArgumentError(
734         "Visit unary operator child didn't correctly add pending values.");
735   }
736 
737   // 3. Retrieve the delegate iterator
738   ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate,
739                          PopPendingIterator());
740 
741   // 4. Check for the placeholder.
742   if (!pending_values_.top().is_placeholder()) {
743     return absl_ports::InvalidArgumentError(
744         "Error processing arguments for node.");
745   }
746   pending_values_.pop();
747 
748   pending_values_.push(PendingValue(std::make_unique<DocHitInfoIteratorNot>(
749       std::move(delegate), document_store_.last_added_document_id())));
750 
751   // Untoggle whatever the current value of 'processing_not_' is now that we've
752   // finished processing this NOT.
753   processing_not_ = !processing_not_;
754   return libtextclassifier3::Status::OK;
755 }
756 
ProcessHasOperator(const NaryOperatorNode * node)757 libtextclassifier3::Status QueryVisitor::ProcessHasOperator(
758     const NaryOperatorNode* node) {
759   if (node->children().size() != 2) {
760     return absl_ports::InvalidArgumentError("Expected 2 children.");
761   }
762 
763   // 1. Put in a placeholder PendingValue
764   pending_values_.push(PendingValue());
765 
766   // 2. Visit the first child - the property.
767   node->children().at(0)->Accept(this);
768   if (has_pending_error()) {
769     return pending_error_;
770   }
771   ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
772   if (text_value.is_prefix_val) {
773     return absl_ports::InvalidArgumentError(
774         "Cannot use prefix operator '*' with a property name!");
775   }
776   pending_property_restricts_.AddValidRestricts({text_value.term});
777 
778   // Just added a restrict - if there are no active property restricts then that
779   // be because this restrict is unsatisfiable.
780   if (pending_property_restricts_.active_property_restricts().empty()) {
781     // The property restrict can't be satisfiable. Pop the placeholder that was
782     // just added and push a FALSE iterator.
783     pending_property_restricts_.PopRestricts();
784     pending_values_.pop();
785     pending_values_.push(
786         PendingValue(std::make_unique<DocHitInfoIteratorNone>()));
787     return libtextclassifier3::Status::OK;
788   }
789 
790   // 3. Visit the second child - the argument.
791   node->children().at(1)->Accept(this);
792   if (has_pending_error()) {
793     return pending_error_;
794   }
795   ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate,
796                          PopPendingIterator());
797 
798   // 4. Check for the placeholder.
799   if (!pending_values_.top().is_placeholder()) {
800     return absl_ports::InvalidArgumentError(
801         "Error processing arguments for node.");
802   }
803   pending_values_.pop();
804   pending_property_restricts_.PopRestricts();
805 
806   std::set<std::string> property_restricts = {std::move(text_value.term)};
807   pending_values_.push(
808       PendingValue(DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
809           std::move(delegate), &document_store_, &schema_store_,
810           std::move(property_restricts), current_time_ms_)));
811   return libtextclassifier3::Status::OK;
812 }
813 
VisitFunctionName(const FunctionNameNode * node)814 void QueryVisitor::VisitFunctionName(const FunctionNameNode* node) {
815   pending_error_ = absl_ports::UnimplementedError(
816       "Function Name node visiting not implemented yet.");
817 }
818 
VisitString(const StringNode * node)819 void QueryVisitor::VisitString(const StringNode* node) {
820   // A STRING node can only be a term. Create the iterator now.
821   auto unescaped_string_or = string_util::UnescapeStringValue(node->value());
822   if (!unescaped_string_or.ok()) {
823     pending_error_ = std::move(unescaped_string_or).status();
824     return;
825   }
826   std::string unescaped_string = std::move(unescaped_string_or).ValueOrDie();
827   QueryTerm val{std::move(unescaped_string), node->raw_value(),
828                 node->is_prefix()};
829   pending_values_.push(PendingValue::CreateStringPendingValue(std::move(val)));
830 }
831 
VisitText(const TextNode * node)832 void QueryVisitor::VisitText(const TextNode* node) {
833   // TEXT nodes could either be a term (and will become DocHitInfoIteratorTerm)
834   // or a property name. As such, we just push the TEXT value into pending
835   // values and determine which it is at a later point.
836   QueryTerm val{std::move(node->value()), node->raw_value(), node->is_prefix()};
837   pending_values_.push(PendingValue::CreateTextPendingValue(std::move(val)));
838 }
839 
VisitMember(const MemberNode * node)840 void QueryVisitor::VisitMember(const MemberNode* node) {
841   if (node->children().empty()) {
842     pending_error_ =
843         absl_ports::InvalidArgumentError("Encountered malformed member node.");
844     return;
845   }
846 
847   // 1. Put in a placeholder PendingValue
848   pending_values_.push(PendingValue());
849 
850   // 2. Visit the children.
851   for (const std::unique_ptr<TextNode>& child : node->children()) {
852     child->Accept(this);
853     if (has_pending_error()) {
854       return;
855     }
856   }
857 
858   // 3. Now process the results of the children and produce a single pending
859   //    value representing this member.
860   PendingValue pending_value;
861   if (node->children().size() == 1) {
862     // 3a. This member only has a single child, then the pending value produced
863     //    by that child is the final value produced by this member.
864     pending_value = std::move(pending_values_.top());
865     pending_values_.pop();
866   } else {
867     // 3b. Retrieve the values of all children and concatenate them into a
868     // single value.
869     libtextclassifier3::StatusOr<QueryTerm> member_or;
870     std::vector<std::string> members;
871     QueryTerm text_val;
872     const char* start = nullptr;
873     const char* end = nullptr;
874     while (!pending_values_.empty() &&
875            !pending_values_.top().is_placeholder()) {
876       member_or = PopPendingTextValue();
877       if (!member_or.ok()) {
878         pending_error_ = std::move(member_or).status();
879         return;
880       }
881       text_val = std::move(member_or).ValueOrDie();
882       if (text_val.is_prefix_val) {
883         pending_error_ = absl_ports::InvalidArgumentError(
884             "Cannot use prefix operator '*' within a property name!");
885         return;
886       }
887       if (start == nullptr) {
888         start = text_val.raw_term.data();
889         end = text_val.raw_term.data() + text_val.raw_term.length();
890       } else {
891         start = std::min(start, text_val.raw_term.data());
892         end = std::max(end,
893                        text_val.raw_term.data() + text_val.raw_term.length());
894       }
895       members.push_back(std::move(text_val.term));
896     }
897     QueryTerm member;
898     member.term = absl_ports::StrJoin(members.rbegin(), members.rend(),
899                                       property_util::kPropertyPathSeparator);
900     member.raw_term = std::string_view(start, end - start);
901     member.is_prefix_val = false;
902     pending_value = PendingValue::CreateTextPendingValue(std::move(member));
903   }
904 
905   // 4. If pending_values_ is empty somehow, then our placeholder disappeared
906   // somehow.
907   if (pending_values_.empty()) {
908     pending_error_ = absl_ports::InvalidArgumentError(
909         "Error processing arguments for member node.");
910     return;
911   }
912   pending_values_.pop();
913 
914   pending_values_.push(std::move(pending_value));
915 }
916 
VisitFunction(const FunctionNode * node)917 void QueryVisitor::VisitFunction(const FunctionNode* node) {
918   // 1. Get the associated function.
919   auto itr = registered_functions_.find(node->function_name()->value());
920   if (itr == registered_functions_.end()) {
921     pending_error_ = absl_ports::InvalidArgumentError(absl_ports::StrCat(
922         "Function ", node->function_name()->value(), " is not supported."));
923     return;
924   }
925   const Function& function = itr->second;
926 
927   // 2. Put in a placeholder PendingValue
928   pending_values_.push(PendingValue());
929 
930   // 3. Visit the children.
931   expecting_numeric_arg_ = true;
932   for (int i = 0; i < node->args().size(); ++i) {
933     const std::unique_ptr<Node>& arg = node->args()[i];
934     libtextclassifier3::StatusOr<DataType> arg_type_or =
935         function.get_param_type(i);
936     bool current_level_expecting_numeric_arg = expecting_numeric_arg_;
937     // If arg_type_or has an error, we should ignore it for now, since
938     // function.Eval should do the type check and return better error messages.
939     if (arg_type_or.ok() && (arg_type_or.ValueOrDie() == DataType::kLong ||
940                              arg_type_or.ValueOrDie() == DataType::kDouble)) {
941       expecting_numeric_arg_ = true;
942     }
943     arg->Accept(this);
944     expecting_numeric_arg_ = current_level_expecting_numeric_arg;
945     if (has_pending_error()) {
946       return;
947     }
948   }
949 
950   // 4. Collect the arguments and evaluate the function.
951   std::vector<PendingValue> args;
952   while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) {
953     args.push_back(std::move(pending_values_.top()));
954     pending_values_.pop();
955   }
956   std::reverse(args.begin(), args.end());
957   auto eval_result = function.Eval(std::move(args));
958   if (!eval_result.ok()) {
959     pending_error_ = std::move(eval_result).status();
960     return;
961   }
962 
963   // 5. Pop placeholder in pending_values and add the result of our function.
964   pending_values_.pop();
965   pending_values_.push(std::move(eval_result).ValueOrDie());
966 
967   // Support for custom functions was added in list filters.
968   features_.insert(kListFilterQueryLanguageFeature);
969 }
970 
971 // TODO(b/265312785) Clarify handling of the interaction between HAS and NOT.
972 // Currently, `prop1:(NOT foo bar)` will not match any documents. Likewise,
973 // `search("NOT foo bar", createList("prop1"))` will not match any documents.
974 //
975 // We should either confirm that this is the desired behavior or consider
976 // rewriting these queries so that they're interpreted as
977 // `NOT prop1:foo AND prop1:bar` and
978 // `NOT search("foo", createList("prop1"))
979 //  AND search("bar", createList("prop1"))`
VisitUnaryOperator(const UnaryOperatorNode * node)980 void QueryVisitor::VisitUnaryOperator(const UnaryOperatorNode* node) {
981   bool is_minus = node->operator_text() == "MINUS";
982   if (node->operator_text() != "NOT" && !is_minus) {
983     pending_error_ = absl_ports::UnimplementedError(
984         absl_ports::StrCat("Visiting for unary operator ",
985                            node->operator_text(), " not implemented yet."));
986     return;
987   }
988 
989   libtextclassifier3::Status status;
990   if (expecting_numeric_arg_ && is_minus) {
991     // If the operator is a MINUS ('-') and we're at the child of a numeric
992     // comparator, then this must be a negation ('-3')
993     status = ProcessNegationOperator(node);
994   } else {
995     status = ProcessNotOperator(node);
996   }
997 
998   if (!status.ok()) {
999     pending_error_ = std::move(status);
1000   }
1001 
1002   if (!is_minus ||
1003       pending_property_restricts_.has_active_property_restricts() ||
1004       processing_not_) {
1005     // 'NOT' operator was added in list filters.
1006     // Likewise, mixing property restricts and NOTs were made valid in list
1007     // filters.
1008     features_.insert(kListFilterQueryLanguageFeature);
1009   }
1010 }
1011 
VisitNaryOperator(const NaryOperatorNode * node)1012 void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) {
1013   if (!IsSupportedNaryOperator(node->operator_text())) {
1014     pending_error_ = absl_ports::UnimplementedError(
1015         "No support for any non-numeric operators.");
1016     return;
1017   }
1018 
1019   if (pending_property_restricts_.has_active_property_restricts() ||
1020       processing_not_) {
1021     // Likewise, mixing property restricts and NOT with compound statements was
1022     // added in list filters.
1023     features_.insert(kListFilterQueryLanguageFeature);
1024   }
1025 
1026   if (node->operator_text() == ":") {
1027     libtextclassifier3::Status status = ProcessHasOperator(node);
1028     if (!status.ok()) {
1029       pending_error_ = std::move(status);
1030     }
1031     return;
1032   } else if (IsNumericComparator(node->operator_text())) {
1033     libtextclassifier3::Status status = ProcessNumericComparator(node);
1034     if (!status.ok()) {
1035       pending_error_ = std::move(status);
1036     }
1037     return;
1038   }
1039 
1040   // 1. Put in a placeholder PendingValue
1041   pending_values_.push(PendingValue());
1042 
1043   // 2. Visit the children.
1044   for (int i = 0; i < node->children().size(); ++i) {
1045     node->children().at(i)->Accept(this);
1046     if (has_pending_error()) {
1047       return;
1048     }
1049   }
1050 
1051   // 3. Retrieve the pending value for this node.
1052   libtextclassifier3::StatusOr<PendingValue> pending_value_or;
1053   if (node->operator_text() == "AND") {
1054     pending_value_or = ProcessAndOperator(node);
1055   } else if (node->operator_text() == "OR") {
1056     pending_value_or = ProcessOrOperator(node);
1057   }
1058   if (!pending_value_or.ok()) {
1059     pending_error_ = std::move(pending_value_or).status();
1060     return;
1061   }
1062   PendingValue pending_value = std::move(pending_value_or).ValueOrDie();
1063 
1064   // 4. Check for the placeholder.
1065   if (!pending_values_.top().is_placeholder()) {
1066     pending_error_ = absl_ports::InvalidArgumentError(
1067         "Error processing arguments for node.");
1068     return;
1069   }
1070   pending_values_.pop();
1071 
1072   pending_values_.push(std::move(pending_value));
1073 }
1074 
ConsumeResults()1075 libtextclassifier3::StatusOr<QueryResults> QueryVisitor::ConsumeResults() && {
1076   if (has_pending_error()) {
1077     return std::move(pending_error_);
1078   }
1079   if (pending_values_.size() != 1) {
1080     return absl_ports::InvalidArgumentError(
1081         "Visitor does not contain a single root iterator.");
1082   }
1083   auto iterator_or = PopPendingIterator();
1084   if (!iterator_or.ok()) {
1085     return std::move(iterator_or).status();
1086   }
1087 
1088   QueryResults results;
1089   results.root_iterator = std::move(iterator_or).ValueOrDie();
1090   results.query_term_iterators = std::move(query_term_iterators_);
1091   results.query_terms = std::move(property_query_terms_map_);
1092   results.embedding_query_results = std::move(embedding_query_results_);
1093   results.features_in_use = std::move(features_);
1094   return results;
1095 }
1096 
1097 }  // namespace lib
1098 }  // namespace icing
1099