1 // Copyright (C) 2022 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_ 16 #define ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <set> 21 #include <stack> 22 #include <string> 23 #include <string_view> 24 #include <unordered_map> 25 #include <unordered_set> 26 #include <utility> 27 #include <vector> 28 29 #include "icing/text_classifier/lib3/utils/base/status.h" 30 #include "icing/text_classifier/lib3/utils/base/statusor.h" 31 #include "icing/index/embed/embedding-index.h" 32 #include "icing/index/embed/embedding-query-results.h" 33 #include "icing/index/index.h" 34 #include "icing/index/iterator/doc-hit-info-iterator-filter.h" 35 #include "icing/index/iterator/doc-hit-info-iterator.h" 36 #include "icing/index/numeric/numeric-index.h" 37 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h" 38 #include "icing/query/advanced_query_parser/function.h" 39 #include "icing/query/advanced_query_parser/pending-value.h" 40 #include "icing/query/query-features.h" 41 #include "icing/query/query-results.h" 42 #include "icing/query/query-terms.h" 43 #include "icing/schema/schema-store.h" 44 #include "icing/store/document-store.h" 45 #include "icing/tokenization/tokenizer.h" 46 #include "icing/transform/normalizer.h" 47 #include <google/protobuf/repeated_field.h> 48 49 namespace icing { 50 namespace lib { 51 52 // The Visitor used to create the DocHitInfoIterator tree from the AST output by 53 // the parser. 54 class QueryVisitor : public AbstractSyntaxTreeVisitor { 55 public: QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,std::string_view raw_query_text,const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto> * embedding_query_vectors,DocHitInfoIteratorFilter::Options filter_options,TermMatchType::Code match_type,SearchSpecProto::EmbeddingQueryMetricType::Code embedding_query_metric_type,bool needs_term_frequency_info,int64_t current_time_ms)56 explicit QueryVisitor( 57 Index* index, const NumericIndex<int64_t>* numeric_index, 58 const EmbeddingIndex* embedding_index, 59 const DocumentStore* document_store, const SchemaStore* schema_store, 60 const Normalizer* normalizer, const Tokenizer* tokenizer, 61 std::string_view raw_query_text, 62 const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>* 63 embedding_query_vectors, 64 DocHitInfoIteratorFilter::Options filter_options, 65 TermMatchType::Code match_type, 66 SearchSpecProto::EmbeddingQueryMetricType::Code 67 embedding_query_metric_type, 68 bool needs_term_frequency_info, int64_t current_time_ms) 69 : QueryVisitor(index, numeric_index, embedding_index, document_store, 70 schema_store, normalizer, tokenizer, raw_query_text, 71 embedding_query_vectors, filter_options, match_type, 72 embedding_query_metric_type, needs_term_frequency_info, 73 PendingPropertyRestricts(), 74 /*processing_not=*/false, current_time_ms) {} 75 76 void VisitFunctionName(const FunctionNameNode* node) override; 77 void VisitString(const StringNode* node) override; 78 void VisitText(const TextNode* node) override; 79 void VisitMember(const MemberNode* node) override; 80 void VisitFunction(const FunctionNode* node) override; 81 void VisitUnaryOperator(const UnaryOperatorNode* node) override; 82 void VisitNaryOperator(const NaryOperatorNode* node) override; 83 84 // RETURNS: 85 // - the QueryResults reflecting the AST that was visited 86 // - INVALID_ARGUMENT if the AST does not conform to supported expressions 87 // - NOT_FOUND if the AST refers to a property that does not exist 88 libtextclassifier3::StatusOr<QueryResults> ConsumeResults() &&; 89 90 private: 91 // An internal class to help manage property restricts being applied at 92 // different levels. 93 class PendingPropertyRestricts { 94 public: 95 // Add another set of property restricts. Elements of new_restricts that are 96 // not present in active_property_rest 97 void AddValidRestricts(std::set<std::string> new_restricts); 98 99 // Pops the most recently added set of property restricts. PopRestricts()100 void PopRestricts() { 101 if (has_active_property_restricts()) { 102 pending_property_restricts_.pop_back(); 103 } 104 } 105 has_active_property_restricts()106 bool has_active_property_restricts() const { 107 return !pending_property_restricts_.empty(); 108 } 109 110 // The set of all property restrictions that are currently being applied. active_property_restricts()111 const std::set<std::string>& active_property_restricts() const { 112 return pending_property_restricts_.back(); 113 } 114 115 private: 116 std::vector<std::set<std::string>> pending_property_restricts_; 117 }; 118 QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,std::string_view raw_query_text,const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto> * embedding_query_vectors,DocHitInfoIteratorFilter::Options filter_options,TermMatchType::Code match_type,SearchSpecProto::EmbeddingQueryMetricType::Code embedding_query_metric_type,bool needs_term_frequency_info,PendingPropertyRestricts pending_property_restricts,bool processing_not,int64_t current_time_ms)119 explicit QueryVisitor( 120 Index* index, const NumericIndex<int64_t>* numeric_index, 121 const EmbeddingIndex* embedding_index, 122 const DocumentStore* document_store, const SchemaStore* schema_store, 123 const Normalizer* normalizer, const Tokenizer* tokenizer, 124 std::string_view raw_query_text, 125 const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>* 126 embedding_query_vectors, 127 DocHitInfoIteratorFilter::Options filter_options, 128 TermMatchType::Code match_type, 129 SearchSpecProto::EmbeddingQueryMetricType::Code 130 embedding_query_metric_type, 131 bool needs_term_frequency_info, 132 PendingPropertyRestricts pending_property_restricts, bool processing_not, 133 int64_t current_time_ms) 134 : index_(*index), 135 numeric_index_(*numeric_index), 136 embedding_index_(*embedding_index), 137 document_store_(*document_store), 138 schema_store_(*schema_store), 139 normalizer_(*normalizer), 140 tokenizer_(*tokenizer), 141 raw_query_text_(raw_query_text), 142 embedding_query_vectors_(embedding_query_vectors), 143 filter_options_(std::move(filter_options)), 144 match_type_(match_type), 145 embedding_query_metric_type_(embedding_query_metric_type), 146 needs_term_frequency_info_(needs_term_frequency_info), 147 pending_property_restricts_(std::move(pending_property_restricts)), 148 processing_not_(processing_not), 149 expecting_numeric_arg_(false), 150 current_time_ms_(current_time_ms) { 151 RegisterFunctions(); 152 } 153 has_pending_error()154 bool has_pending_error() const { return !pending_error_.ok(); } 155 156 // Creates a DocHitInfoIterator reflecting the provided term and whether the 157 // prefix operator has been applied to this term. Also populates, 158 // property_query_terms_map_ and query_term_iterators_ as appropriate. 159 // Returns: 160 // - On success, a DocHitInfoIterator for the provided term 161 // - INVALID_ARGUMENT if unable to create an iterator for the term. 162 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> 163 CreateTermIterator(const QueryTerm& term); 164 165 // Processes the PendingValue at the top of pending_values_, parses it into a 166 // int64_t and pops the top. 167 // Returns: 168 // - On success, the int value stored in the text at the top 169 // - INVALID_ARGUMENT if pending_values_ is empty, doesn't hold a text or 170 // can't be parsed as an int. 171 libtextclassifier3::StatusOr<int64_t> PopPendingIntValue(); 172 173 // Processes the PendingValue at the top of pending_values_ and pops the top. 174 // Returns: 175 // - On success, the string value stored in the text at the top and a bool 176 // indicating whether or not the string value has a prefix operator. 177 // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string. 178 libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue(); 179 180 // Processes the PendingValue at the top of pending_values_ and pops the top. 181 // Returns: 182 // - On success, the string value stored in the text at the top 183 // indicating whether or not the string value has a prefix operator. 184 // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text. 185 libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue(); 186 187 // Processes the PendingValue at the top of pending_values_ and pops the top. 188 // Returns: 189 // - On success, a DocHitInfoIterator representing for the term at the top 190 // - INVALID_ARGUMENT if pending_values_ is empty or if unable to create an 191 // iterator for the term. 192 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> 193 PopPendingIterator(); 194 195 // Processes all PendingValues at the top of pending_values_ until the first 196 // placeholder is encounter. 197 // Returns: 198 // - On success, a vector containing all DocHitInfoIterators representing 199 // the values at the top of pending_values_ 200 // - INVALID_ARGUMENT if pending_values_is empty or if unable to create an 201 // iterator for any of the terms at the top of pending_values_ 202 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>> 203 PopAllPendingIterators(); 204 205 // Processes the TEXT segment within text_value. Processing includes 206 // tokenizing the text, normalizing it and outputting a DocHitIterator that 207 // ANDs all the produced tokens together. 208 // Returns: 209 // - On success, a DocHitInfoIterator representing the tokenized text from 210 // text_value 211 // - INVALID_ARGUMENT if the tokenizer produces more than one token within 212 // a token group. 213 // - Any errors that could be produced by Tokenizer::Tokenize. 214 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> 215 ProduceTextTokenIterators(QueryTerm text_value); 216 217 // Processes the unary operator node as a NOT operator. A NOT can have an 218 // operator type of "NOT" or "MINUS" 219 // 220 // RETURNS: 221 // - OK on success 222 // - INVALID_ARGUMENT if any errors are encountered while processing 223 // node->child 224 libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node); 225 226 // Processes the unary operator node as a negation operator. A negation 227 // operator should have an operator of type "MINUS" and it's children must 228 // resolve to a numeric value. 229 // 230 // RETURNS: 231 // - OK on success 232 // - INVALID_ARGUMENT if the node->child can't be resolved to a numeric 233 // value. 234 libtextclassifier3::Status ProcessNegationOperator( 235 const UnaryOperatorNode* node); 236 237 // Processes the NumericComparator represented by node. This must be called 238 // *after* this node's children have been visited. The PendingValues added by 239 // this node's children will be consumed by this function and the PendingValue 240 // for this node will be returned. 241 // Returns: 242 // - On success, OK 243 // - INVALID_ARGUMENT if unable to retrieve string value or int value 244 // - NOT_FOUND if there is no entry in the numeric index for the property 245 libtextclassifier3::Status ProcessNumericComparator( 246 const NaryOperatorNode* node); 247 248 // Processes the AND and OR operators represented by the node. This must be 249 // called *after* this node's children have been visited. The PendingValues 250 // added by this node's children will be consumed by this function and the 251 // PendingValue for this node will be returned. 252 // Returns: 253 // - On success, then PendingValue representing this node and it's children. 254 // - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's 255 // children. 256 libtextclassifier3::StatusOr<PendingValue> ProcessAndOperator( 257 const NaryOperatorNode* node); 258 259 // Processes the OR operator represented by the node. This must be called 260 // *after* this node's children have been visited. The PendingValues added by 261 // this node's children will be consumed by this function and the PendingValue 262 // for this node will be returned. 263 // Returns: 264 // - On success, then PendingValue representing this node and it's children. 265 // - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's 266 // children. 267 libtextclassifier3::StatusOr<PendingValue> ProcessOrOperator( 268 const NaryOperatorNode* node); 269 270 // Populates registered_functions with the currently supported set of 271 // functions. 272 void RegisterFunctions(); 273 274 // Implementation of `search` custom function in the query language. 275 // Returns: 276 // - a PendingValue holding the DocHitInfoIterator reflecting the query 277 // provided to SearchFunction 278 // - any errors returned by Lexer::ExtractTokens, Parser::ConsumeQuery or 279 // QueryVisitor::ConsumeResults. 280 libtextclassifier3::StatusOr<PendingValue> SearchFunction( 281 std::vector<PendingValue>&& args); 282 283 // Implementation of the propertyDefined(property_path) custom function. 284 // Returns: 285 // - a Pending Value holding a DocHitIterator that returns hits for all 286 // documents whose schema types have defined the property specified by 287 // property_path. 288 // - any errors returned by Lexer::ExtractTokens 289 libtextclassifier3::StatusOr<PendingValue> PropertyDefinedFunction( 290 std::vector<PendingValue>&& args); 291 292 // Implementation of the hasProperty(property_path) custom function. 293 // Returns: 294 // - a Pending Value holding a DocHitIterator that returns hits for all 295 // documents that have the property specified by property_path. 296 // - any errors returned by Lexer::ExtractTokens 297 libtextclassifier3::StatusOr<PendingValue> HasPropertyFunction( 298 std::vector<PendingValue>&& args); 299 300 // Implementation of the semanticSearch(vector, low, high, metric) custom 301 // function. This function is used for supporting vector search with a 302 // syntax like `semanticSearch(getSearchSpecEmbedding(0), 0.5, 1, "COSINE")`. 303 // 304 // low, high, metric are optional parameters: 305 // - low is default to negative infinity 306 // - high is default to positive infinity 307 // - metric is default to the metric specified in SearchSpec 308 // 309 // Returns: 310 // - a Pending Value of type DocHitIterator that returns all documents with 311 // an embedding vector that has a score within [low, high]. 312 // - any errors returned by Lexer::ExtractTokens 313 libtextclassifier3::StatusOr<PendingValue> SemanticSearchFunction( 314 std::vector<PendingValue>&& args); 315 316 // Implementation of the tokenize(string) custom function. 317 // Returns: 318 // - a Pending Value holding a DocHitIterator that returns hits for all 319 // documents containing the normalized tokens present in the string. 320 // - any errors returned by ProduceTextTokenIterators 321 libtextclassifier3::StatusOr<PendingValue> TokenizeFunction( 322 std::vector<PendingValue>&& args); 323 324 // Handles a NaryOperatorNode where the operator is HAS (':') and pushes an 325 // iterator with the proper section filter applied. If the current property 326 // restriction represented by pending_property_restricts and the first child 327 // of this node is unsatisfiable (ex. `prop1:(prop2:foo)`), then a NONE 328 // iterator is returned immediately and subtree represented by the second 329 // child is not traversed. 330 // 331 // Returns: 332 // - OK on success 333 // - INVALID_ARGUMENT node does not have exactly two children or the two 334 // children cannot be resolved to a MEMBER or an iterator respectively. 335 libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node); 336 337 // Returns the correct match type to apply based on both the match type and 338 // whether the prefix operator is currently present. GetTermMatchType(bool is_prefix)339 TermMatchType::Code GetTermMatchType(bool is_prefix) const { 340 return (is_prefix) ? TermMatchType::PREFIX : match_type_; 341 } 342 343 std::stack<PendingValue> pending_values_; 344 libtextclassifier3::Status pending_error_; 345 346 // A map from function name to Function instance. 347 std::unordered_map<std::string, Function> registered_functions_; 348 349 SectionRestrictQueryTermsMap property_query_terms_map_; 350 351 QueryTermIteratorsMap query_term_iterators_; 352 353 EmbeddingQueryResults embedding_query_results_; 354 355 // Set of features invoked in the query. 356 std::unordered_set<Feature> features_; 357 358 Index& index_; // Does not own! 359 const NumericIndex<int64_t>& numeric_index_; // Does not own! 360 const EmbeddingIndex& embedding_index_; // Does not own! 361 const DocumentStore& document_store_; // Does not own! 362 const SchemaStore& schema_store_; // Does not own! 363 const Normalizer& normalizer_; // Does not own! 364 const Tokenizer& tokenizer_; // Does not own! 365 366 std::string_view raw_query_text_; 367 const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>* 368 embedding_query_vectors_; // Nullable, does not own! 369 DocHitInfoIteratorFilter::Options filter_options_; 370 TermMatchType::Code match_type_; 371 SearchSpecProto::EmbeddingQueryMetricType::Code embedding_query_metric_type_; 372 // Whether or not term_frequency information is needed. This affects: 373 // - how DocHitInfoIteratorTerms are constructed 374 // - whether the QueryTermIteratorsMap is populated in the QueryResults. 375 bool needs_term_frequency_info_; 376 377 // The stack of property restricts currently being processed by the visitor. 378 PendingPropertyRestricts pending_property_restricts_; 379 bool processing_not_; 380 381 // Whether we are in the midst of processing a subtree that is expected to 382 // resolve to a numeric argument. 383 bool expecting_numeric_arg_; 384 385 int64_t current_time_ms_; 386 }; 387 388 } // namespace lib 389 } // namespace icing 390 391 #endif // ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_ 392