• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/query/suggestion-processor.h"
16 
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <unordered_map>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/index/embed/embedding-index.h"
30 #include "icing/index/index.h"
31 #include "icing/index/iterator/doc-hit-info-iterator.h"
32 #include "icing/index/numeric/numeric-index.h"
33 #include "icing/index/term-metadata.h"
34 #include "icing/proto/search.pb.h"
35 #include "icing/query/query-processor.h"
36 #include "icing/query/query-results.h"
37 #include "icing/schema/schema-store.h"
38 #include "icing/schema/section.h"
39 #include "icing/store/document-filter-data.h"
40 #include "icing/store/document-id.h"
41 #include "icing/store/document-store.h"
42 #include "icing/store/namespace-id.h"
43 #include "icing/store/suggestion-result-checker-impl.h"
44 #include "icing/tokenization/language-segmenter.h"
45 #include "icing/transform/normalizer.h"
46 #include "icing/util/clock.h"
47 #include "icing/util/status-macros.h"
48 
49 namespace icing {
50 namespace lib {
51 
52 libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
Create(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock)53 SuggestionProcessor::Create(Index* index,
54                             const NumericIndex<int64_t>* numeric_index,
55                             const EmbeddingIndex* embedding_index,
56                             const LanguageSegmenter* language_segmenter,
57                             const Normalizer* normalizer,
58                             const DocumentStore* document_store,
59                             const SchemaStore* schema_store,
60                             const Clock* clock) {
61   ICING_RETURN_ERROR_IF_NULL(index);
62   ICING_RETURN_ERROR_IF_NULL(numeric_index);
63   ICING_RETURN_ERROR_IF_NULL(embedding_index);
64   ICING_RETURN_ERROR_IF_NULL(language_segmenter);
65   ICING_RETURN_ERROR_IF_NULL(normalizer);
66   ICING_RETURN_ERROR_IF_NULL(document_store);
67   ICING_RETURN_ERROR_IF_NULL(schema_store);
68   ICING_RETURN_ERROR_IF_NULL(clock);
69 
70   return std::unique_ptr<SuggestionProcessor>(new SuggestionProcessor(
71       index, numeric_index, embedding_index, language_segmenter, normalizer,
72       document_store, schema_store, clock));
73 }
74 
75 libtextclassifier3::StatusOr<
76     std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>>
PopulateDocumentIdFilters(const DocumentStore * document_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<NamespaceId> & namespace_ids)77 PopulateDocumentIdFilters(
78     const DocumentStore* document_store,
79     const icing::lib::SuggestionSpecProto& suggestion_spec,
80     const std::unordered_set<NamespaceId>& namespace_ids) {
81   std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
82       document_id_filter_map;
83   document_id_filter_map.reserve(suggestion_spec.document_uri_filters_size());
84   for (const NamespaceDocumentUriGroup& namespace_document_uri_group :
85        suggestion_spec.document_uri_filters()) {
86     auto namespace_id_or = document_store->GetNamespaceId(
87         namespace_document_uri_group.namespace_());
88     if (!namespace_id_or.ok()) {
89       // The current namespace doesn't exist.
90       continue;
91     }
92     NamespaceId namespace_id = namespace_id_or.ValueOrDie();
93     if (!namespace_ids.empty() &&
94         namespace_ids.find(namespace_id) == namespace_ids.end()) {
95       // The current namespace doesn't appear in the namespace filter.
96       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
97           "The namespace : ", namespace_document_uri_group.namespace_(),
98           " appears in the document uri filter, but doesn't appear in the "
99           "namespace filter."));
100     }
101 
102     if (namespace_document_uri_group.document_uris().empty()) {
103       // Client should use namespace filter to filter out all document under
104       // a namespace.
105       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
106           "The namespace : ", namespace_document_uri_group.namespace_(),
107           " has empty document uri in the document uri filter. Please use the "
108           "namespace filter to exclude a namespace instead of the document uri "
109           "filter."));
110     }
111 
112     // Translate namespace document Uris into document_ids
113     std::unordered_set<DocumentId> target_document_ids;
114     target_document_ids.reserve(
115         namespace_document_uri_group.document_uris_size());
116     for (std::string_view document_uri :
117          namespace_document_uri_group.document_uris()) {
118       auto document_id_or = document_store->GetDocumentId(
119           namespace_document_uri_group.namespace_(), document_uri);
120       if (!document_id_or.ok()) {
121         continue;
122       }
123       target_document_ids.insert(document_id_or.ValueOrDie());
124     }
125     document_id_filter_map.insert({namespace_id, target_document_ids});
126   }
127   return document_id_filter_map;
128 }
129 
130 libtextclassifier3::StatusOr<std::unordered_map<SchemaTypeId, SectionIdMask>>
PopulatePropertyFilters(const SchemaStore * schema_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<SchemaTypeId> & schema_type_ids)131 PopulatePropertyFilters(
132     const SchemaStore* schema_store,
133     const icing::lib::SuggestionSpecProto& suggestion_spec,
134     const std::unordered_set<SchemaTypeId>& schema_type_ids) {
135   std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map;
136   property_filter_map.reserve(suggestion_spec.type_property_filters_size());
137   for (const TypePropertyMask& type_field_mask :
138        suggestion_spec.type_property_filters()) {
139     auto schema_type_id_or =
140         schema_store->GetSchemaTypeId(type_field_mask.schema_type());
141     if (!schema_type_id_or.ok()) {
142       // The current schema doesn't exist
143       continue;
144     }
145     SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
146 
147     if (!schema_type_ids.empty() &&
148         schema_type_ids.find(schema_type_id) == schema_type_ids.end()) {
149       // The current schema type doesn't appear in the schema type filter.
150       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
151           "The schema : ", type_field_mask.schema_type(),
152           " appears in the property filter, but doesn't appear in the schema"
153           " type filter."));
154     }
155 
156     if (type_field_mask.paths().empty()) {
157       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
158           "The schema type : ", type_field_mask.schema_type(),
159           " has empty path in the property filter. Please use the schema type"
160           " filter to exclude a schema type instead of the property filter."));
161     }
162 
163     // Translate property paths into section id mask
164     SectionIdMask section_mask = kSectionIdMaskNone;
165     auto section_metadata_list_or =
166         schema_store->GetSectionMetadata(type_field_mask.schema_type());
167     if (!section_metadata_list_or.ok()) {
168       // The current schema doesn't has section metadata.
169       continue;
170     }
171     std::unordered_set<std::string> target_property_paths;
172     target_property_paths.reserve(type_field_mask.paths_size());
173     for (const std::string& target_property_path : type_field_mask.paths()) {
174       target_property_paths.insert(target_property_path);
175     }
176     const std::vector<SectionMetadata>* section_metadata_list =
177         section_metadata_list_or.ValueOrDie();
178     for (const SectionMetadata& section_metadata : *section_metadata_list) {
179       if (target_property_paths.find(section_metadata.path) !=
180           target_property_paths.end()) {
181         section_mask |= UINT64_C(1) << section_metadata.id;
182       }
183     }
184     property_filter_map.insert({schema_type_id, section_mask});
185   }
186   return property_filter_map;
187 }
188 
189 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
QuerySuggestions(const icing::lib::SuggestionSpecProto & suggestion_spec,int64_t current_time_ms)190 SuggestionProcessor::QuerySuggestions(
191     const icing::lib::SuggestionSpecProto& suggestion_spec,
192     int64_t current_time_ms) {
193   // We use query tokenizer to tokenize the give prefix, and we only use the
194   // last token to be the suggestion prefix.
195 
196   // Populate target namespace filter.
197   std::unordered_set<NamespaceId> namespace_ids;
198   namespace_ids.reserve(suggestion_spec.namespace_filters_size());
199   for (std::string_view name_space : suggestion_spec.namespace_filters()) {
200     auto namespace_id_or = document_store_.GetNamespaceId(name_space);
201     if (!namespace_id_or.ok()) {
202       // The current namespace doesn't exist.
203       continue;
204     }
205     namespace_ids.insert(namespace_id_or.ValueOrDie());
206   }
207   if (namespace_ids.empty() && !suggestion_spec.namespace_filters().empty()) {
208     // None of desired namespace exists, we should return directly.
209     return std::vector<TermMetadata>();
210   }
211 
212   // Populate target document id filter.
213   auto document_id_filter_map_or = PopulateDocumentIdFilters(
214       &document_store_, suggestion_spec, namespace_ids);
215   if (!document_id_filter_map_or.ok()) {
216     return std::move(document_id_filter_map_or).status();
217   }
218 
219   std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
220       document_id_filter_map = document_id_filter_map_or.ValueOrDie();
221   if (document_id_filter_map.empty() &&
222       !suggestion_spec.document_uri_filters().empty()) {
223     // None of desired DocumentId exists, we should return directly.
224     return std::vector<TermMetadata>();
225   }
226 
227   // Populate target schema type filter.
228   std::unordered_set<SchemaTypeId> schema_type_ids;
229   schema_type_ids.reserve(suggestion_spec.schema_type_filters_size());
230   for (std::string_view schema_type : suggestion_spec.schema_type_filters()) {
231     auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type);
232     if (!schema_type_id_or.ok()) {
233       continue;
234     }
235     schema_type_ids.insert(schema_type_id_or.ValueOrDie());
236   }
237   if (schema_type_ids.empty() &&
238       !suggestion_spec.schema_type_filters().empty()) {
239     // None of desired schema type exists, we should return directly.
240     return std::vector<TermMetadata>();
241   }
242 
243   // Populate target properties filter.
244   auto property_filter_map_or =
245       PopulatePropertyFilters(&schema_store_, suggestion_spec, schema_type_ids);
246   if (!property_filter_map_or.ok()) {
247     return std::move(property_filter_map_or).status();
248   }
249   std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map =
250       property_filter_map_or.ValueOrDie();
251 
252   ICING_ASSIGN_OR_RETURN(
253       std::unique_ptr<QueryProcessor> query_processor,
254       QueryProcessor::Create(&index_, &numeric_index_, &embedding_index_,
255                              &language_segmenter_, &normalizer_,
256                              &document_store_, &schema_store_, &clock_));
257 
258   SearchSpecProto search_spec;
259   search_spec.set_query(suggestion_spec.prefix());
260   search_spec.set_term_match_type(
261       suggestion_spec.scoring_spec().scoring_match_type());
262   ICING_ASSIGN_OR_RETURN(
263       QueryResults query_results,
264       query_processor->ParseSearch(search_spec,
265                                    ScoringSpecProto::RankingStrategy::NONE,
266                                    current_time_ms));
267 
268   ICING_ASSIGN_OR_RETURN(
269       DocHitInfoIterator::TrimmedNode trimmed_node,
270       std::move(*query_results.root_iterator).TrimRightMostNode());
271 
272   // If the position of the last token is not the end of the prefix, it means
273   // there should be some operator tokens after it and are ignored by the
274   // tokenizer.
275   bool is_last_token =
276       trimmed_node.term_start_index_ + trimmed_node.unnormalized_term_length_ >=
277       suggestion_spec.prefix().length();
278 
279   if (!is_last_token || trimmed_node.term_.empty()) {
280     // We don't have a valid last token, return early.
281     return std::vector<TermMetadata>();
282   }
283 
284   // Populate the search base in document ids.
285   // Suggestions are only generated for the very last term,
286   // trimmed_node.iterator_ tracks search results for all previous terms. If it
287   // is null means there is no pervious term and we are generating suggetion for
288   // a single term.
289   std::unordered_set<DocumentId> search_base;
290   if (trimmed_node.iterator_ != nullptr) {
291     while (trimmed_node.iterator_->Advance().ok()) {
292       search_base.insert(trimmed_node.iterator_->doc_hit_info().document_id());
293     }
294     if (search_base.empty()) {
295       // Nothing matches the previous terms in the query. There are no valid
296       // suggestions to make, we should return directly.
297       return std::vector<TermMetadata>();
298     }
299   }
300 
301   // Create result checker based on given filters.
302   SuggestionResultCheckerImpl suggestion_result_checker_impl(
303       &document_store_, &schema_store_, std::move(namespace_ids),
304       std::move(document_id_filter_map), std::move(schema_type_ids),
305       std::move(property_filter_map), std::move(trimmed_node.target_section_),
306       std::move(search_base), current_time_ms);
307   // TODO(b/228240987) support generate suggestion and append suffix for advance
308   // query and function call.
309   std::string query_prefix =
310       suggestion_spec.prefix().substr(0, trimmed_node.term_start_index_);
311   // Run suggestion based on given SuggestionSpec.
312   // Normalize token text to lowercase since all tokens in the lexicon are
313   // lowercase.
314   ICING_ASSIGN_OR_RETURN(
315       std::vector<TermMetadata> terms,
316       index_.FindTermsByPrefix(
317           trimmed_node.term_, suggestion_spec.num_to_return(),
318           suggestion_spec.scoring_spec().scoring_match_type(),
319           suggestion_spec.scoring_spec().rank_by(),
320           &suggestion_result_checker_impl));
321   for (TermMetadata& term : terms) {
322     term.content = query_prefix + term.content;
323   }
324   return terms;
325 }
326 
SuggestionProcessor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock)327 SuggestionProcessor::SuggestionProcessor(
328     Index* index, const NumericIndex<int64_t>* numeric_index,
329     const EmbeddingIndex* embedding_index,
330     const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
331     const DocumentStore* document_store, const SchemaStore* schema_store,
332     const Clock* clock)
333     : index_(*index),
334       numeric_index_(*numeric_index),
335       embedding_index_(*embedding_index),
336       language_segmenter_(*language_segmenter),
337       normalizer_(*normalizer),
338       document_store_(*document_store),
339       schema_store_(*schema_store),
340       clock_(*clock) {}
341 
342 }  // namespace lib
343 }  // namespace icing
344