• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
16 #define ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <set>
21 #include <stack>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/index/embed/embedding-index.h"
32 #include "icing/index/embed/embedding-query-results.h"
33 #include "icing/index/index.h"
34 #include "icing/index/iterator/doc-hit-info-iterator-filter.h"
35 #include "icing/index/iterator/doc-hit-info-iterator.h"
36 #include "icing/index/numeric/numeric-index.h"
37 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
38 #include "icing/query/advanced_query_parser/function.h"
39 #include "icing/query/advanced_query_parser/pending-value.h"
40 #include "icing/query/query-features.h"
41 #include "icing/query/query-results.h"
42 #include "icing/query/query-terms.h"
43 #include "icing/schema/schema-store.h"
44 #include "icing/store/document-store.h"
45 #include "icing/tokenization/tokenizer.h"
46 #include "icing/transform/normalizer.h"
47 #include <google/protobuf/repeated_field.h>
48 
49 namespace icing {
50 namespace lib {
51 
52 // The Visitor used to create the DocHitInfoIterator tree from the AST output by
53 // the parser.
54 class QueryVisitor : public AbstractSyntaxTreeVisitor {
55  public:
QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,std::string_view raw_query_text,const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto> * embedding_query_vectors,DocHitInfoIteratorFilter::Options filter_options,TermMatchType::Code match_type,SearchSpecProto::EmbeddingQueryMetricType::Code embedding_query_metric_type,bool needs_term_frequency_info,int64_t current_time_ms)56   explicit QueryVisitor(
57       Index* index, const NumericIndex<int64_t>* numeric_index,
58       const EmbeddingIndex* embedding_index,
59       const DocumentStore* document_store, const SchemaStore* schema_store,
60       const Normalizer* normalizer, const Tokenizer* tokenizer,
61       std::string_view raw_query_text,
62       const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>*
63           embedding_query_vectors,
64       DocHitInfoIteratorFilter::Options filter_options,
65       TermMatchType::Code match_type,
66       SearchSpecProto::EmbeddingQueryMetricType::Code
67           embedding_query_metric_type,
68       bool needs_term_frequency_info, int64_t current_time_ms)
69       : QueryVisitor(index, numeric_index, embedding_index, document_store,
70                      schema_store, normalizer, tokenizer, raw_query_text,
71                      embedding_query_vectors, filter_options, match_type,
72                      embedding_query_metric_type, needs_term_frequency_info,
73                      PendingPropertyRestricts(),
74                      /*processing_not=*/false, current_time_ms) {}
75 
76   void VisitFunctionName(const FunctionNameNode* node) override;
77   void VisitString(const StringNode* node) override;
78   void VisitText(const TextNode* node) override;
79   void VisitMember(const MemberNode* node) override;
80   void VisitFunction(const FunctionNode* node) override;
81   void VisitUnaryOperator(const UnaryOperatorNode* node) override;
82   void VisitNaryOperator(const NaryOperatorNode* node) override;
83 
84   // RETURNS:
85   //   - the QueryResults reflecting the AST that was visited
86   //   - INVALID_ARGUMENT if the AST does not conform to supported expressions
87   //   - NOT_FOUND if the AST refers to a property that does not exist
88   libtextclassifier3::StatusOr<QueryResults> ConsumeResults() &&;
89 
90  private:
91   // An internal class to help manage property restricts being applied at
92   // different levels.
93   class PendingPropertyRestricts {
94    public:
95     // Add another set of property restricts. Elements of new_restricts that are
96     // not present in active_property_rest
97     void AddValidRestricts(std::set<std::string> new_restricts);
98 
99     // Pops the most recently added set of property restricts.
PopRestricts()100     void PopRestricts() {
101       if (has_active_property_restricts()) {
102         pending_property_restricts_.pop_back();
103       }
104     }
105 
has_active_property_restricts()106     bool has_active_property_restricts() const {
107       return !pending_property_restricts_.empty();
108     }
109 
110     // The set of all property restrictions that are currently being applied.
active_property_restricts()111     const std::set<std::string>& active_property_restricts() const {
112       return pending_property_restricts_.back();
113     }
114 
115    private:
116     std::vector<std::set<std::string>> pending_property_restricts_;
117   };
118 
QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,std::string_view raw_query_text,const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto> * embedding_query_vectors,DocHitInfoIteratorFilter::Options filter_options,TermMatchType::Code match_type,SearchSpecProto::EmbeddingQueryMetricType::Code embedding_query_metric_type,bool needs_term_frequency_info,PendingPropertyRestricts pending_property_restricts,bool processing_not,int64_t current_time_ms)119   explicit QueryVisitor(
120       Index* index, const NumericIndex<int64_t>* numeric_index,
121       const EmbeddingIndex* embedding_index,
122       const DocumentStore* document_store, const SchemaStore* schema_store,
123       const Normalizer* normalizer, const Tokenizer* tokenizer,
124       std::string_view raw_query_text,
125       const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>*
126           embedding_query_vectors,
127       DocHitInfoIteratorFilter::Options filter_options,
128       TermMatchType::Code match_type,
129       SearchSpecProto::EmbeddingQueryMetricType::Code
130           embedding_query_metric_type,
131       bool needs_term_frequency_info,
132       PendingPropertyRestricts pending_property_restricts, bool processing_not,
133       int64_t current_time_ms)
134       : index_(*index),
135         numeric_index_(*numeric_index),
136         embedding_index_(*embedding_index),
137         document_store_(*document_store),
138         schema_store_(*schema_store),
139         normalizer_(*normalizer),
140         tokenizer_(*tokenizer),
141         raw_query_text_(raw_query_text),
142         embedding_query_vectors_(embedding_query_vectors),
143         filter_options_(std::move(filter_options)),
144         match_type_(match_type),
145         embedding_query_metric_type_(embedding_query_metric_type),
146         needs_term_frequency_info_(needs_term_frequency_info),
147         pending_property_restricts_(std::move(pending_property_restricts)),
148         processing_not_(processing_not),
149         expecting_numeric_arg_(false),
150         current_time_ms_(current_time_ms) {
151     RegisterFunctions();
152   }
153 
has_pending_error()154   bool has_pending_error() const { return !pending_error_.ok(); }
155 
156   // Creates a DocHitInfoIterator reflecting the provided term and whether the
157   // prefix operator has been applied to this term. Also populates,
158   // property_query_terms_map_ and query_term_iterators_ as appropriate.
159   // Returns:
160   //   - On success, a DocHitInfoIterator for the provided term
161   //   - INVALID_ARGUMENT if unable to create an iterator for the term.
162   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
163   CreateTermIterator(const QueryTerm& term);
164 
165   // Processes the PendingValue at the top of pending_values_, parses it into a
166   // int64_t and pops the top.
167   // Returns:
168   //   - On success, the int value stored in the text at the top
169   //   - INVALID_ARGUMENT if pending_values_ is empty, doesn't hold a text or
170   //     can't be parsed as an int.
171   libtextclassifier3::StatusOr<int64_t> PopPendingIntValue();
172 
173   // Processes the PendingValue at the top of pending_values_ and pops the top.
174   // Returns:
175   //   - On success, the string value stored in the text at the top and a bool
176   //     indicating whether or not the string value has a prefix operator.
177   //   - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string.
178   libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue();
179 
180   // Processes the PendingValue at the top of pending_values_ and pops the top.
181   // Returns:
182   //   - On success, the string value stored in the text at the top
183   //     indicating whether or not the string value has a prefix operator.
184   //   - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text.
185   libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue();
186 
187   // Processes the PendingValue at the top of pending_values_ and pops the top.
188   // Returns:
189   //   - On success, a DocHitInfoIterator representing for the term at the top
190   //   - INVALID_ARGUMENT if pending_values_ is empty or if unable to create an
191   //       iterator for the term.
192   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
193   PopPendingIterator();
194 
195   // Processes all PendingValues at the top of pending_values_ until the first
196   // placeholder is encounter.
197   // Returns:
198   //   - On success, a vector containing all DocHitInfoIterators representing
199   //     the values at the top of pending_values_
200   //   - INVALID_ARGUMENT if pending_values_is empty or if unable to create an
201   //       iterator for any of the terms at the top of pending_values_
202   libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
203   PopAllPendingIterators();
204 
205   // Processes the TEXT segment within text_value. Processing includes
206   // tokenizing the text, normalizing it and outputting a DocHitIterator that
207   // ANDs all the produced tokens together.
208   // Returns:
209   //   - On success, a DocHitInfoIterator representing the tokenized text from
210   //     text_value
211   //   - INVALID_ARGUMENT if the tokenizer produces more than one token within
212   //       a token group.
213   //   - Any errors that could be produced by Tokenizer::Tokenize.
214   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
215   ProduceTextTokenIterators(QueryTerm text_value);
216 
217   // Processes the unary operator node as a NOT operator. A NOT can have an
218   // operator type of "NOT" or "MINUS"
219   //
220   // RETURNS:
221   //   - OK on success
222   //   - INVALID_ARGUMENT if any errors are encountered while processing
223   //     node->child
224   libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node);
225 
226   // Processes the unary operator node as a negation operator. A negation
227   // operator should have an operator of type "MINUS" and it's children must
228   // resolve to a numeric value.
229   //
230   // RETURNS:
231   //   - OK on success
232   //   - INVALID_ARGUMENT if the node->child can't be resolved to a numeric
233   //     value.
234   libtextclassifier3::Status ProcessNegationOperator(
235       const UnaryOperatorNode* node);
236 
237   // Processes the NumericComparator represented by node. This must be called
238   // *after* this node's children have been visited. The PendingValues added by
239   // this node's children will be consumed by this function and the PendingValue
240   // for this node will be returned.
241   // Returns:
242   //   - On success, OK
243   //   - INVALID_ARGUMENT if unable to retrieve string value or int value
244   //   - NOT_FOUND if there is no entry in the numeric index for the property
245   libtextclassifier3::Status ProcessNumericComparator(
246       const NaryOperatorNode* node);
247 
248   // Processes the AND and OR operators represented by the node. This must be
249   // called *after* this node's children have been visited. The PendingValues
250   // added by this node's children will be consumed by this function and the
251   // PendingValue for this node will be returned.
252   // Returns:
253   //   - On success, then PendingValue representing this node and it's children.
254   //   - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
255   //       children.
256   libtextclassifier3::StatusOr<PendingValue> ProcessAndOperator(
257       const NaryOperatorNode* node);
258 
259   // Processes the OR operator represented by the node. This must be called
260   // *after* this node's children have been visited. The PendingValues added by
261   // this node's children will be consumed by this function and the PendingValue
262   // for this node will be returned.
263   // Returns:
264   //   - On success, then PendingValue representing this node and it's children.
265   //   - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
266   //       children.
267   libtextclassifier3::StatusOr<PendingValue> ProcessOrOperator(
268       const NaryOperatorNode* node);
269 
270   // Populates registered_functions with the currently supported set of
271   // functions.
272   void RegisterFunctions();
273 
274   // Implementation of `search` custom function in the query language.
275   // Returns:
276   //   - a PendingValue holding the DocHitInfoIterator reflecting the query
277   //     provided to SearchFunction
278   //   - any errors returned by Lexer::ExtractTokens, Parser::ConsumeQuery or
279   //     QueryVisitor::ConsumeResults.
280   libtextclassifier3::StatusOr<PendingValue> SearchFunction(
281       std::vector<PendingValue>&& args);
282 
283   // Implementation of the propertyDefined(property_path) custom function.
284   // Returns:
285   //   - a Pending Value holding a DocHitIterator that returns hits for all
286   //     documents whose schema types have defined the property specified by
287   //     property_path.
288   //   - any errors returned by Lexer::ExtractTokens
289   libtextclassifier3::StatusOr<PendingValue> PropertyDefinedFunction(
290       std::vector<PendingValue>&& args);
291 
292   // Implementation of the hasProperty(property_path) custom function.
293   // Returns:
294   //   - a Pending Value holding a DocHitIterator that returns hits for all
295   //     documents that have the property specified by property_path.
296   //   - any errors returned by Lexer::ExtractTokens
297   libtextclassifier3::StatusOr<PendingValue> HasPropertyFunction(
298       std::vector<PendingValue>&& args);
299 
300   // Implementation of the semanticSearch(vector, low, high, metric) custom
301   // function. This function is used for supporting vector search with a
302   // syntax like `semanticSearch(getSearchSpecEmbedding(0), 0.5, 1, "COSINE")`.
303   //
304   // low, high, metric are optional parameters:
305   //   - low is default to negative infinity
306   //   - high is default to positive infinity
307   //   - metric is default to the metric specified in SearchSpec
308   //
309   // Returns:
310   //   - a Pending Value of type DocHitIterator that returns all documents with
311   //     an embedding vector that has a score within [low, high].
312   //   - any errors returned by Lexer::ExtractTokens
313   libtextclassifier3::StatusOr<PendingValue> SemanticSearchFunction(
314       std::vector<PendingValue>&& args);
315 
316   // Implementation of the tokenize(string) custom function.
317   // Returns:
318   //   - a Pending Value holding a DocHitIterator that returns hits for all
319   //     documents containing the normalized tokens present in the string.
320   //   - any errors returned by ProduceTextTokenIterators
321   libtextclassifier3::StatusOr<PendingValue> TokenizeFunction(
322       std::vector<PendingValue>&& args);
323 
324   // Handles a NaryOperatorNode where the operator is HAS (':') and pushes an
325   // iterator with the proper section filter applied. If the current property
326   // restriction represented by pending_property_restricts and the first child
327   // of this node is unsatisfiable (ex. `prop1:(prop2:foo)`), then a NONE
328   // iterator is returned immediately and subtree represented by the second
329   // child is not traversed.
330   //
331   // Returns:
332   //  - OK on success
333   //  - INVALID_ARGUMENT node does not have exactly two children or the two
334   //    children cannot be resolved to a MEMBER or an iterator respectively.
335   libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node);
336 
337   // Returns the correct match type to apply based on both the match type and
338   // whether the prefix operator is currently present.
GetTermMatchType(bool is_prefix)339   TermMatchType::Code GetTermMatchType(bool is_prefix) const {
340     return (is_prefix) ? TermMatchType::PREFIX : match_type_;
341   }
342 
343   std::stack<PendingValue> pending_values_;
344   libtextclassifier3::Status pending_error_;
345 
346   // A map from function name to Function instance.
347   std::unordered_map<std::string, Function> registered_functions_;
348 
349   SectionRestrictQueryTermsMap property_query_terms_map_;
350 
351   QueryTermIteratorsMap query_term_iterators_;
352 
353   EmbeddingQueryResults embedding_query_results_;
354 
355   // Set of features invoked in the query.
356   std::unordered_set<Feature> features_;
357 
358   Index& index_;                                // Does not own!
359   const NumericIndex<int64_t>& numeric_index_;  // Does not own!
360   const EmbeddingIndex& embedding_index_;       // Does not own!
361   const DocumentStore& document_store_;         // Does not own!
362   const SchemaStore& schema_store_;             // Does not own!
363   const Normalizer& normalizer_;                // Does not own!
364   const Tokenizer& tokenizer_;                  // Does not own!
365 
366   std::string_view raw_query_text_;
367   const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>*
368       embedding_query_vectors_;  // Nullable, does not own!
369   DocHitInfoIteratorFilter::Options filter_options_;
370   TermMatchType::Code match_type_;
371   SearchSpecProto::EmbeddingQueryMetricType::Code embedding_query_metric_type_;
372   // Whether or not term_frequency information is needed. This affects:
373   //  - how DocHitInfoIteratorTerms are constructed
374   //  - whether the QueryTermIteratorsMap is populated in the QueryResults.
375   bool needs_term_frequency_info_;
376 
377   // The stack of property restricts currently being processed by the visitor.
378   PendingPropertyRestricts pending_property_restricts_;
379   bool processing_not_;
380 
381   // Whether we are in the midst of processing a subtree that is expected to
382   // resolve to a numeric argument.
383   bool expecting_numeric_arg_;
384 
385   int64_t current_time_ms_;
386 };
387 
388 }  // namespace lib
389 }  // namespace icing
390 
391 #endif  // ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
392