1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/query/suggestion-processor.h"
16
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <unordered_map>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/index/embed/embedding-index.h"
30 #include "icing/index/index.h"
31 #include "icing/index/iterator/doc-hit-info-iterator.h"
32 #include "icing/index/numeric/numeric-index.h"
33 #include "icing/index/term-metadata.h"
34 #include "icing/proto/search.pb.h"
35 #include "icing/query/query-processor.h"
36 #include "icing/query/query-results.h"
37 #include "icing/schema/schema-store.h"
38 #include "icing/schema/section.h"
39 #include "icing/store/document-filter-data.h"
40 #include "icing/store/document-id.h"
41 #include "icing/store/document-store.h"
42 #include "icing/store/namespace-id.h"
43 #include "icing/store/suggestion-result-checker-impl.h"
44 #include "icing/tokenization/language-segmenter.h"
45 #include "icing/transform/normalizer.h"
46 #include "icing/util/clock.h"
47 #include "icing/util/status-macros.h"
48
49 namespace icing {
50 namespace lib {
51
52 libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
Create(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock)53 SuggestionProcessor::Create(Index* index,
54 const NumericIndex<int64_t>* numeric_index,
55 const EmbeddingIndex* embedding_index,
56 const LanguageSegmenter* language_segmenter,
57 const Normalizer* normalizer,
58 const DocumentStore* document_store,
59 const SchemaStore* schema_store,
60 const Clock* clock) {
61 ICING_RETURN_ERROR_IF_NULL(index);
62 ICING_RETURN_ERROR_IF_NULL(numeric_index);
63 ICING_RETURN_ERROR_IF_NULL(embedding_index);
64 ICING_RETURN_ERROR_IF_NULL(language_segmenter);
65 ICING_RETURN_ERROR_IF_NULL(normalizer);
66 ICING_RETURN_ERROR_IF_NULL(document_store);
67 ICING_RETURN_ERROR_IF_NULL(schema_store);
68 ICING_RETURN_ERROR_IF_NULL(clock);
69
70 return std::unique_ptr<SuggestionProcessor>(new SuggestionProcessor(
71 index, numeric_index, embedding_index, language_segmenter, normalizer,
72 document_store, schema_store, clock));
73 }
74
75 libtextclassifier3::StatusOr<
76 std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>>
PopulateDocumentIdFilters(const DocumentStore * document_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<NamespaceId> & namespace_ids)77 PopulateDocumentIdFilters(
78 const DocumentStore* document_store,
79 const icing::lib::SuggestionSpecProto& suggestion_spec,
80 const std::unordered_set<NamespaceId>& namespace_ids) {
81 std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
82 document_id_filter_map;
83 document_id_filter_map.reserve(suggestion_spec.document_uri_filters_size());
84 for (const NamespaceDocumentUriGroup& namespace_document_uri_group :
85 suggestion_spec.document_uri_filters()) {
86 auto namespace_id_or = document_store->GetNamespaceId(
87 namespace_document_uri_group.namespace_());
88 if (!namespace_id_or.ok()) {
89 // The current namespace doesn't exist.
90 continue;
91 }
92 NamespaceId namespace_id = namespace_id_or.ValueOrDie();
93 if (!namespace_ids.empty() &&
94 namespace_ids.find(namespace_id) == namespace_ids.end()) {
95 // The current namespace doesn't appear in the namespace filter.
96 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
97 "The namespace : ", namespace_document_uri_group.namespace_(),
98 " appears in the document uri filter, but doesn't appear in the "
99 "namespace filter."));
100 }
101
102 if (namespace_document_uri_group.document_uris().empty()) {
103 // Client should use namespace filter to filter out all document under
104 // a namespace.
105 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
106 "The namespace : ", namespace_document_uri_group.namespace_(),
107 " has empty document uri in the document uri filter. Please use the "
108 "namespace filter to exclude a namespace instead of the document uri "
109 "filter."));
110 }
111
112 // Translate namespace document Uris into document_ids
113 std::unordered_set<DocumentId> target_document_ids;
114 target_document_ids.reserve(
115 namespace_document_uri_group.document_uris_size());
116 for (std::string_view document_uri :
117 namespace_document_uri_group.document_uris()) {
118 auto document_id_or = document_store->GetDocumentId(
119 namespace_document_uri_group.namespace_(), document_uri);
120 if (!document_id_or.ok()) {
121 continue;
122 }
123 target_document_ids.insert(document_id_or.ValueOrDie());
124 }
125 document_id_filter_map.insert({namespace_id, target_document_ids});
126 }
127 return document_id_filter_map;
128 }
129
130 libtextclassifier3::StatusOr<std::unordered_map<SchemaTypeId, SectionIdMask>>
PopulatePropertyFilters(const SchemaStore * schema_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<SchemaTypeId> & schema_type_ids)131 PopulatePropertyFilters(
132 const SchemaStore* schema_store,
133 const icing::lib::SuggestionSpecProto& suggestion_spec,
134 const std::unordered_set<SchemaTypeId>& schema_type_ids) {
135 std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map;
136 property_filter_map.reserve(suggestion_spec.type_property_filters_size());
137 for (const TypePropertyMask& type_field_mask :
138 suggestion_spec.type_property_filters()) {
139 auto schema_type_id_or =
140 schema_store->GetSchemaTypeId(type_field_mask.schema_type());
141 if (!schema_type_id_or.ok()) {
142 // The current schema doesn't exist
143 continue;
144 }
145 SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
146
147 if (!schema_type_ids.empty() &&
148 schema_type_ids.find(schema_type_id) == schema_type_ids.end()) {
149 // The current schema type doesn't appear in the schema type filter.
150 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
151 "The schema : ", type_field_mask.schema_type(),
152 " appears in the property filter, but doesn't appear in the schema"
153 " type filter."));
154 }
155
156 if (type_field_mask.paths().empty()) {
157 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
158 "The schema type : ", type_field_mask.schema_type(),
159 " has empty path in the property filter. Please use the schema type"
160 " filter to exclude a schema type instead of the property filter."));
161 }
162
163 // Translate property paths into section id mask
164 SectionIdMask section_mask = kSectionIdMaskNone;
165 auto section_metadata_list_or =
166 schema_store->GetSectionMetadata(type_field_mask.schema_type());
167 if (!section_metadata_list_or.ok()) {
168 // The current schema doesn't has section metadata.
169 continue;
170 }
171 std::unordered_set<std::string> target_property_paths;
172 target_property_paths.reserve(type_field_mask.paths_size());
173 for (const std::string& target_property_path : type_field_mask.paths()) {
174 target_property_paths.insert(target_property_path);
175 }
176 const std::vector<SectionMetadata>* section_metadata_list =
177 section_metadata_list_or.ValueOrDie();
178 for (const SectionMetadata& section_metadata : *section_metadata_list) {
179 if (target_property_paths.find(section_metadata.path) !=
180 target_property_paths.end()) {
181 section_mask |= UINT64_C(1) << section_metadata.id;
182 }
183 }
184 property_filter_map.insert({schema_type_id, section_mask});
185 }
186 return property_filter_map;
187 }
188
189 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
QuerySuggestions(const icing::lib::SuggestionSpecProto & suggestion_spec,int64_t current_time_ms)190 SuggestionProcessor::QuerySuggestions(
191 const icing::lib::SuggestionSpecProto& suggestion_spec,
192 int64_t current_time_ms) {
193 // We use query tokenizer to tokenize the give prefix, and we only use the
194 // last token to be the suggestion prefix.
195
196 // Populate target namespace filter.
197 std::unordered_set<NamespaceId> namespace_ids;
198 namespace_ids.reserve(suggestion_spec.namespace_filters_size());
199 for (std::string_view name_space : suggestion_spec.namespace_filters()) {
200 auto namespace_id_or = document_store_.GetNamespaceId(name_space);
201 if (!namespace_id_or.ok()) {
202 // The current namespace doesn't exist.
203 continue;
204 }
205 namespace_ids.insert(namespace_id_or.ValueOrDie());
206 }
207 if (namespace_ids.empty() && !suggestion_spec.namespace_filters().empty()) {
208 // None of desired namespace exists, we should return directly.
209 return std::vector<TermMetadata>();
210 }
211
212 // Populate target document id filter.
213 auto document_id_filter_map_or = PopulateDocumentIdFilters(
214 &document_store_, suggestion_spec, namespace_ids);
215 if (!document_id_filter_map_or.ok()) {
216 return std::move(document_id_filter_map_or).status();
217 }
218
219 std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
220 document_id_filter_map = document_id_filter_map_or.ValueOrDie();
221 if (document_id_filter_map.empty() &&
222 !suggestion_spec.document_uri_filters().empty()) {
223 // None of desired DocumentId exists, we should return directly.
224 return std::vector<TermMetadata>();
225 }
226
227 // Populate target schema type filter.
228 std::unordered_set<SchemaTypeId> schema_type_ids;
229 schema_type_ids.reserve(suggestion_spec.schema_type_filters_size());
230 for (std::string_view schema_type : suggestion_spec.schema_type_filters()) {
231 auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type);
232 if (!schema_type_id_or.ok()) {
233 continue;
234 }
235 schema_type_ids.insert(schema_type_id_or.ValueOrDie());
236 }
237 if (schema_type_ids.empty() &&
238 !suggestion_spec.schema_type_filters().empty()) {
239 // None of desired schema type exists, we should return directly.
240 return std::vector<TermMetadata>();
241 }
242
243 // Populate target properties filter.
244 auto property_filter_map_or =
245 PopulatePropertyFilters(&schema_store_, suggestion_spec, schema_type_ids);
246 if (!property_filter_map_or.ok()) {
247 return std::move(property_filter_map_or).status();
248 }
249 std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map =
250 property_filter_map_or.ValueOrDie();
251
252 ICING_ASSIGN_OR_RETURN(
253 std::unique_ptr<QueryProcessor> query_processor,
254 QueryProcessor::Create(&index_, &numeric_index_, &embedding_index_,
255 &language_segmenter_, &normalizer_,
256 &document_store_, &schema_store_, &clock_));
257
258 SearchSpecProto search_spec;
259 search_spec.set_query(suggestion_spec.prefix());
260 search_spec.set_term_match_type(
261 suggestion_spec.scoring_spec().scoring_match_type());
262 ICING_ASSIGN_OR_RETURN(
263 QueryResults query_results,
264 query_processor->ParseSearch(search_spec,
265 ScoringSpecProto::RankingStrategy::NONE,
266 current_time_ms));
267
268 ICING_ASSIGN_OR_RETURN(
269 DocHitInfoIterator::TrimmedNode trimmed_node,
270 std::move(*query_results.root_iterator).TrimRightMostNode());
271
272 // If the position of the last token is not the end of the prefix, it means
273 // there should be some operator tokens after it and are ignored by the
274 // tokenizer.
275 bool is_last_token =
276 trimmed_node.term_start_index_ + trimmed_node.unnormalized_term_length_ >=
277 suggestion_spec.prefix().length();
278
279 if (!is_last_token || trimmed_node.term_.empty()) {
280 // We don't have a valid last token, return early.
281 return std::vector<TermMetadata>();
282 }
283
284 // Populate the search base in document ids.
285 // Suggestions are only generated for the very last term,
286 // trimmed_node.iterator_ tracks search results for all previous terms. If it
287 // is null means there is no pervious term and we are generating suggetion for
288 // a single term.
289 std::unordered_set<DocumentId> search_base;
290 if (trimmed_node.iterator_ != nullptr) {
291 while (trimmed_node.iterator_->Advance().ok()) {
292 search_base.insert(trimmed_node.iterator_->doc_hit_info().document_id());
293 }
294 if (search_base.empty()) {
295 // Nothing matches the previous terms in the query. There are no valid
296 // suggestions to make, we should return directly.
297 return std::vector<TermMetadata>();
298 }
299 }
300
301 // Create result checker based on given filters.
302 SuggestionResultCheckerImpl suggestion_result_checker_impl(
303 &document_store_, &schema_store_, std::move(namespace_ids),
304 std::move(document_id_filter_map), std::move(schema_type_ids),
305 std::move(property_filter_map), std::move(trimmed_node.target_section_),
306 std::move(search_base), current_time_ms);
307 // TODO(b/228240987) support generate suggestion and append suffix for advance
308 // query and function call.
309 std::string query_prefix =
310 suggestion_spec.prefix().substr(0, trimmed_node.term_start_index_);
311 // Run suggestion based on given SuggestionSpec.
312 // Normalize token text to lowercase since all tokens in the lexicon are
313 // lowercase.
314 ICING_ASSIGN_OR_RETURN(
315 std::vector<TermMetadata> terms,
316 index_.FindTermsByPrefix(
317 trimmed_node.term_, suggestion_spec.num_to_return(),
318 suggestion_spec.scoring_spec().scoring_match_type(),
319 suggestion_spec.scoring_spec().rank_by(),
320 &suggestion_result_checker_impl));
321 for (TermMetadata& term : terms) {
322 term.content = query_prefix + term.content;
323 }
324 return terms;
325 }
326
SuggestionProcessor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock)327 SuggestionProcessor::SuggestionProcessor(
328 Index* index, const NumericIndex<int64_t>* numeric_index,
329 const EmbeddingIndex* embedding_index,
330 const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
331 const DocumentStore* document_store, const SchemaStore* schema_store,
332 const Clock* clock)
333 : index_(*index),
334 numeric_index_(*numeric_index),
335 embedding_index_(*embedding_index),
336 language_segmenter_(*language_segmenter),
337 normalizer_(*normalizer),
338 document_store_(*document_store),
339 schema_store_(*schema_store),
340 clock_(*clock) {}
341
342 } // namespace lib
343 } // namespace icing
344