1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/icing-search-engine.h"
16
17 #include <algorithm>
18 #include <cstddef>
19 #include <cstdint>
20 #include <functional>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/annotate.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/mutex.h"
34 #include "icing/absl_ports/str_cat.h"
35 #include "icing/file/destructible-file.h"
36 #include "icing/file/file-backed-proto.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/file/version-util.h"
39 #include "icing/index/data-indexing-handler.h"
40 #include "icing/index/embed/embedding-index.h"
41 #include "icing/index/embedding-indexing-handler.h"
42 #include "icing/index/hit/doc-hit-info.h"
43 #include "icing/index/index-processor.h"
44 #include "icing/index/index.h"
45 #include "icing/index/integer-section-indexing-handler.h"
46 #include "icing/index/iterator/doc-hit-info-iterator.h"
47 #include "icing/index/numeric/integer-index.h"
48 #include "icing/index/term-indexing-handler.h"
49 #include "icing/index/term-metadata.h"
50 #include "icing/jni/jni-cache.h"
51 #include "icing/join/join-children-fetcher.h"
52 #include "icing/join/join-processor.h"
53 #include "icing/join/qualified-id-join-index-impl-v1.h"
54 #include "icing/join/qualified-id-join-index-impl-v2.h"
55 #include "icing/join/qualified-id-join-index.h"
56 #include "icing/join/qualified-id-join-indexing-handler.h"
57 #include "icing/legacy/index/icing-filesystem.h"
58 #include "icing/performance-configuration.h"
59 #include "icing/portable/endian.h"
60 #include "icing/proto/debug.pb.h"
61 #include "icing/proto/document.pb.h"
62 #include "icing/proto/initialize.pb.h"
63 #include "icing/proto/internal/optimize.pb.h"
64 #include "icing/proto/logging.pb.h"
65 #include "icing/proto/optimize.pb.h"
66 #include "icing/proto/persist.pb.h"
67 #include "icing/proto/reset.pb.h"
68 #include "icing/proto/schema.pb.h"
69 #include "icing/proto/scoring.pb.h"
70 #include "icing/proto/search.pb.h"
71 #include "icing/proto/status.pb.h"
72 #include "icing/proto/storage.pb.h"
73 #include "icing/proto/term.pb.h"
74 #include "icing/proto/usage.pb.h"
75 #include "icing/query/advanced_query_parser/lexer.h"
76 #include "icing/query/query-features.h"
77 #include "icing/query/query-processor.h"
78 #include "icing/query/query-results.h"
79 #include "icing/query/suggestion-processor.h"
80 #include "icing/result/page-result.h"
81 #include "icing/result/projection-tree.h"
82 #include "icing/result/projector.h"
83 #include "icing/result/result-adjustment-info.h"
84 #include "icing/result/result-retriever-v2.h"
85 #include "icing/result/result-state-manager.h"
86 #include "icing/schema/schema-store.h"
87 #include "icing/scoring/advanced_scoring/score-expression.h"
88 #include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
89 #include "icing/scoring/scored-document-hit.h"
90 #include "icing/scoring/scored-document-hits-ranker.h"
91 #include "icing/scoring/scoring-processor.h"
92 #include "icing/store/document-id.h"
93 #include "icing/store/document-store.h"
94 #include "icing/tokenization/language-segmenter-factory.h"
95 #include "icing/transform/normalizer-factory.h"
96 #include "icing/util/clock.h"
97 #include "icing/util/data-loss.h"
98 #include "icing/util/logging.h"
99 #include "icing/util/status-macros.h"
100 #include "icing/util/tokenized-document.h"
101 #include "unicode/uloc.h"
102
103 namespace icing {
104 namespace lib {
105
106 namespace {
107
// Names of the subfolders and files that this translation unit places
// directly under the engine's base directory.
constexpr std::string_view kDocumentSubfolderName{"document_dir"};
constexpr std::string_view kIndexSubfolderName{"index_dir"};
constexpr std::string_view kIntegerIndexSubfolderName{"integer_index_dir"};
constexpr std::string_view kQualifiedIdJoinIndexSubfolderName{
    "qualified_id_join_index_dir"};
constexpr std::string_view kEmbeddingIndexSubfolderName{"embedding_index_dir"};
constexpr std::string_view kSchemaSubfolderName{"schema_dir"};
constexpr std::string_view kSetSchemaMarkerFilename{"set_schema_marker"};
constexpr std::string_view kInitMarkerFilename{"init_marker"};
constexpr std::string_view kOptimizeStatusFilename{"optimize_status"};

// Upper bound on the number of failed initialization attempts, counted from
// the current on-disk state, that is tolerated. Once the recorded count
// exceeds this value all data is deleted and initialization starts from a
// fresh state.
constexpr int kMaxUnsuccessfulInitAttempts{5};
123
// A pair that holds namespace and type. Equality is member-wise, which makes
// the struct usable as an unordered container key together with
// NamespaceTypePairHasher.
struct NamespaceTypePair {
  std::string namespace_;
  std::string type;

  bool operator==(const NamespaceTypePair& other) const {
    return namespace_ == other.namespace_ && type == other.type;
  }
};

// Hash functor for NamespaceTypePair.
//
// The two field hashes are mixed with an order-sensitive combine step rather
// than a plain XOR. A plain XOR is symmetric, so {a, b} and {b, a} would
// collide, and every pair whose namespace equals its type would hash to 0.
struct NamespaceTypePairHasher {
  std::size_t operator()(const NamespaceTypePair& pair) const {
    std::size_t seed = std::hash<std::string>()(pair.namespace_);
    const std::size_t type_hash = std::hash<std::string>()(pair.type);
    // Same mixing formula as the widely used boost::hash_combine.
    seed ^= type_hash + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    return seed;
  }
};
140
ValidateResultSpec(const DocumentStore * document_store,const ResultSpecProto & result_spec)141 libtextclassifier3::Status ValidateResultSpec(
142 const DocumentStore* document_store, const ResultSpecProto& result_spec) {
143 if (result_spec.num_per_page() < 0) {
144 return absl_ports::InvalidArgumentError(
145 "ResultSpecProto.num_per_page cannot be negative.");
146 }
147 if (result_spec.num_total_bytes_per_page_threshold() <= 0) {
148 return absl_ports::InvalidArgumentError(
149 "ResultSpecProto.num_total_bytes_per_page_threshold cannot be "
150 "non-positive.");
151 }
152 if (result_spec.max_joined_children_per_parent_to_return() < 0) {
153 return absl_ports::InvalidArgumentError(
154 "ResultSpecProto.max_joined_children_per_parent_to_return cannot be "
155 "negative.");
156 }
157 if (result_spec.num_to_score() <= 0) {
158 return absl_ports::InvalidArgumentError(
159 "ResultSpecProto.num_to_score cannot be non-positive.");
160 }
161 // Validate ResultGroupings.
162 std::unordered_set<int32_t> unique_entry_ids;
163 ResultSpecProto::ResultGroupingType result_grouping_type =
164 result_spec.result_group_type();
165 for (const ResultSpecProto::ResultGrouping& result_grouping :
166 result_spec.result_groupings()) {
167 if (result_grouping.max_results() <= 0) {
168 return absl_ports::InvalidArgumentError(
169 "Cannot specify a result grouping with max results <= 0.");
170 }
171 for (const ResultSpecProto::ResultGrouping::Entry& entry :
172 result_grouping.entry_groupings()) {
173 const std::string& name_space = entry.namespace_();
174 const std::string& schema = entry.schema();
175 auto entry_id_or = document_store->GetResultGroupingEntryId(
176 result_grouping_type, name_space, schema);
177 if (!entry_id_or.ok()) {
178 continue;
179 }
180 int32_t entry_id = entry_id_or.ValueOrDie();
181 if (unique_entry_ids.find(entry_id) != unique_entry_ids.end()) {
182 return absl_ports::InvalidArgumentError(
183 "Entry Ids must be unique across result groups.");
184 }
185 unique_entry_ids.insert(entry_id);
186 }
187 }
188 return libtextclassifier3::Status::OK;
189 }
190
ValidateSearchSpec(const SearchSpecProto & search_spec,const PerformanceConfiguration & configuration)191 libtextclassifier3::Status ValidateSearchSpec(
192 const SearchSpecProto& search_spec,
193 const PerformanceConfiguration& configuration) {
194 if (search_spec.query().size() > configuration.max_query_length) {
195 return absl_ports::InvalidArgumentError(
196 absl_ports::StrCat("SearchSpecProto.query is longer than the maximum "
197 "allowed query length: ",
198 std::to_string(configuration.max_query_length)));
199 }
200 // Check that no unknown features have been enabled in the search spec.
201 std::unordered_set<Feature> query_features_set = GetQueryFeaturesSet();
202 for (const Feature feature : search_spec.enabled_features()) {
203 if (query_features_set.find(feature) == query_features_set.end()) {
204 return absl_ports::InvalidArgumentError(
205 absl_ports::StrCat("Unknown feature in "
206 "SearchSpecProto.enabled_features: ",
207 feature));
208 }
209 }
210 return libtextclassifier3::Status::OK;
211 }
212
ValidateSuggestionSpec(const SuggestionSpecProto & suggestion_spec,const PerformanceConfiguration & configuration)213 libtextclassifier3::Status ValidateSuggestionSpec(
214 const SuggestionSpecProto& suggestion_spec,
215 const PerformanceConfiguration& configuration) {
216 if (suggestion_spec.prefix().empty()) {
217 return absl_ports::InvalidArgumentError(
218 absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
219 }
220 if (suggestion_spec.scoring_spec().scoring_match_type() ==
221 TermMatchType::UNKNOWN) {
222 return absl_ports::InvalidArgumentError(
223 absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
224 }
225 if (suggestion_spec.num_to_return() <= 0) {
226 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
227 "SuggestionSpecProto.num_to_return must be positive."));
228 }
229 if (suggestion_spec.prefix().size() > configuration.max_query_length) {
230 return absl_ports::InvalidArgumentError(
231 absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the "
232 "maximum allowed prefix length: ",
233 std::to_string(configuration.max_query_length)));
234 }
235 return libtextclassifier3::Status::OK;
236 }
237
// Reports whether the V2 qualified id join index implementation is used.
// The answer is currently always true; `options` is not consulted.
bool IsV2QualifiedIdJoinIndexEnabled(
    [[maybe_unused]] const IcingSearchEngineOptions& options) {
  constexpr bool kV2Enabled = true;
  return kV2Enabled;
}
241
242 libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
CreateQualifiedIdJoinIndex(const Filesystem & filesystem,std::string qualified_id_join_index_dir,const IcingSearchEngineOptions & options)243 CreateQualifiedIdJoinIndex(const Filesystem& filesystem,
244 std::string qualified_id_join_index_dir,
245 const IcingSearchEngineOptions& options) {
246 if (IsV2QualifiedIdJoinIndexEnabled(options)) {
247 // V2
248 return QualifiedIdJoinIndexImplV2::Create(
249 filesystem, std::move(qualified_id_join_index_dir),
250 options.pre_mapping_fbv());
251 } else {
252 // V1
253 // TODO(b/275121148): deprecate this part after rollout v2.
254 return QualifiedIdJoinIndexImplV1::Create(
255 filesystem, std::move(qualified_id_join_index_dir),
256 options.pre_mapping_fbv(), options.use_persistent_hash_map());
257 }
258 }
259
260 // Document store files are in a standalone subfolder for easier file
261 // management. We can delete and recreate the subfolder and not touch/affect
262 // anything else.
MakeDocumentDirectoryPath(const std::string & base_dir)263 std::string MakeDocumentDirectoryPath(const std::string& base_dir) {
264 return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName);
265 }
266
267 // Makes a temporary folder path for the document store which will be used
268 // during full optimization.
MakeDocumentTemporaryDirectoryPath(const std::string & base_dir)269 std::string MakeDocumentTemporaryDirectoryPath(const std::string& base_dir) {
270 return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName,
271 "_optimize_tmp");
272 }
273
274 // Index files are in a standalone subfolder because for easier file management.
275 // We can delete and recreate the subfolder and not touch/affect anything
276 // else.
MakeIndexDirectoryPath(const std::string & base_dir)277 std::string MakeIndexDirectoryPath(const std::string& base_dir) {
278 return absl_ports::StrCat(base_dir, "/", kIndexSubfolderName);
279 }
280
281 // Working path for integer index. Integer index is derived from
282 // PersistentStorage and it will take full ownership of this working path,
283 // including creation/deletion. See PersistentStorage for more details about
284 // working path.
MakeIntegerIndexWorkingPath(const std::string & base_dir)285 std::string MakeIntegerIndexWorkingPath(const std::string& base_dir) {
286 return absl_ports::StrCat(base_dir, "/", kIntegerIndexSubfolderName);
287 }
288
289 // Working path for qualified id join index. It is derived from
290 // PersistentStorage and it will take full ownership of this working path,
291 // including creation/deletion. See PersistentStorage for more details about
292 // working path.
MakeQualifiedIdJoinIndexWorkingPath(const std::string & base_dir)293 std::string MakeQualifiedIdJoinIndexWorkingPath(const std::string& base_dir) {
294 return absl_ports::StrCat(base_dir, "/", kQualifiedIdJoinIndexSubfolderName);
295 }
296
297 // Working path for embedding index.
MakeEmbeddingIndexWorkingPath(const std::string & base_dir)298 std::string MakeEmbeddingIndexWorkingPath(const std::string& base_dir) {
299 return absl_ports::StrCat(base_dir, "/", kEmbeddingIndexSubfolderName);
300 }
301
302 // SchemaStore files are in a standalone subfolder for easier file management.
303 // We can delete and recreate the subfolder and not touch/affect anything
304 // else.
MakeSchemaDirectoryPath(const std::string & base_dir)305 std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
306 return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
307 }
308
MakeSetSchemaMarkerFilePath(const std::string & base_dir)309 std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
310 return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
311 }
312
MakeInitMarkerFilePath(const std::string & base_dir)313 std::string MakeInitMarkerFilePath(const std::string& base_dir) {
314 return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename);
315 }
316
TransformStatus(const libtextclassifier3::Status & internal_status,StatusProto * status_proto)317 void TransformStatus(const libtextclassifier3::Status& internal_status,
318 StatusProto* status_proto) {
319 StatusProto::Code code;
320 if (!internal_status.ok()) {
321 ICING_LOG(WARNING) << "Error: " << internal_status.error_code()
322 << ", Message: " << internal_status.error_message();
323 }
324 switch (internal_status.CanonicalCode()) {
325 case libtextclassifier3::StatusCode::OK:
326 code = StatusProto::OK;
327 break;
328 case libtextclassifier3::StatusCode::DATA_LOSS:
329 code = StatusProto::WARNING_DATA_LOSS;
330 break;
331 case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
332 code = StatusProto::INVALID_ARGUMENT;
333 break;
334 case libtextclassifier3::StatusCode::NOT_FOUND:
335 code = StatusProto::NOT_FOUND;
336 break;
337 case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
338 code = StatusProto::FAILED_PRECONDITION;
339 break;
340 case libtextclassifier3::StatusCode::ABORTED:
341 code = StatusProto::ABORTED;
342 break;
343 case libtextclassifier3::StatusCode::INTERNAL:
344 // TODO(b/147699081): Cleanup our internal use of INTERNAL since it
345 // doesn't match with what it *should* indicate as described in
346 // go/icing-library-apis.
347 code = StatusProto::INTERNAL;
348 break;
349 case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
350 // TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
351 // (e.g. if the document log is full). And we use RESOURCE_EXHAUSTED
352 // internally to indicate other resources are exhausted (e.g.
353 // DocHitInfos) - although none of these are exposed through the API.
354 // Consider separating the two cases out more clearly.
355 code = StatusProto::OUT_OF_SPACE;
356 break;
357 case libtextclassifier3::StatusCode::ALREADY_EXISTS:
358 code = StatusProto::ALREADY_EXISTS;
359 break;
360 case libtextclassifier3::StatusCode::CANCELLED:
361 [[fallthrough]];
362 case libtextclassifier3::StatusCode::UNKNOWN:
363 [[fallthrough]];
364 case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
365 [[fallthrough]];
366 case libtextclassifier3::StatusCode::PERMISSION_DENIED:
367 [[fallthrough]];
368 case libtextclassifier3::StatusCode::OUT_OF_RANGE:
369 [[fallthrough]];
370 case libtextclassifier3::StatusCode::UNIMPLEMENTED:
371 [[fallthrough]];
372 case libtextclassifier3::StatusCode::UNAVAILABLE:
373 [[fallthrough]];
374 case libtextclassifier3::StatusCode::UNAUTHENTICATED:
375 // Other internal status codes aren't supported externally yet. If it
376 // should be supported, add another switch-case above.
377 ICING_LOG(ERROR) << "Internal status code "
378 << internal_status.error_code()
379 << " not supported in the external API";
380 code = StatusProto::UNKNOWN;
381 break;
382 }
383 status_proto->set_code(code);
384 status_proto->set_message(internal_status.error_message());
385 }
386
RetrieveAndAddDocumentInfo(const DocumentStore * document_store,DeleteByQueryResultProto & result_proto,std::unordered_map<NamespaceTypePair,DeleteByQueryResultProto::DocumentGroupInfo *,NamespaceTypePairHasher> & info_map,DocumentId document_id)387 libtextclassifier3::Status RetrieveAndAddDocumentInfo(
388 const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
389 std::unordered_map<NamespaceTypePair,
390 DeleteByQueryResultProto::DocumentGroupInfo*,
391 NamespaceTypePairHasher>& info_map,
392 DocumentId document_id) {
393 ICING_ASSIGN_OR_RETURN(DocumentProto document,
394 document_store->Get(document_id));
395 NamespaceTypePair key = {document.namespace_(), document.schema()};
396 auto iter = info_map.find(key);
397 if (iter == info_map.end()) {
398 auto entry = result_proto.add_deleted_documents();
399 entry->set_namespace_(std::move(document.namespace_()));
400 entry->set_schema(std::move(document.schema()));
401 entry->add_uris(std::move(document.uri()));
402 info_map[key] = entry;
403 } else {
404 iter->second->add_uris(std::move(document.uri()));
405 }
406 return libtextclassifier3::Status::OK;
407 }
408
ShouldRebuildIndex(const OptimizeStatsProto & optimize_stats,float optimize_rebuild_index_threshold)409 bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats,
410 float optimize_rebuild_index_threshold) {
411 int num_invalid_documents = optimize_stats.num_deleted_documents() +
412 optimize_stats.num_expired_documents();
413 return num_invalid_documents >= optimize_stats.num_original_documents() *
414 optimize_rebuild_index_threshold;
415 }
416
ScoringExpressionHasRelevanceScoreFunction(std::string_view scoring_expression)417 libtextclassifier3::StatusOr<bool> ScoringExpressionHasRelevanceScoreFunction(
418 std::string_view scoring_expression) {
419 // TODO(b/261474063) The Lexer will be called again when creating the
420 // AdvancedScorer instance. Consider refactoring the code to allow the Lexer
421 // to be called only once.
422 Lexer lexer(scoring_expression, Lexer::Language::SCORING);
423 ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
424 lexer.ExtractTokens());
425 for (const Lexer::LexerToken& token : lexer_tokens) {
426 if (token.type == Lexer::TokenType::FUNCTION_NAME &&
427 token.text == RelevanceScoreFunctionScoreExpression::kFunctionName) {
428 return true;
429 }
430 }
431 return false;
432 }
433
434 // Useful method to get RankingStrategy if advanced scoring is enabled. When the
435 // "RelevanceScore" function is used in the advanced scoring expression,
436 // RankingStrategy will be treated as RELEVANCE_SCORE in order to prepare the
437 // necessary information needed for calculating relevance score.
438 libtextclassifier3::StatusOr<ScoringSpecProto::RankingStrategy::Code>
GetRankingStrategyFromScoringSpec(const ScoringSpecProto & scoring_spec)439 GetRankingStrategyFromScoringSpec(const ScoringSpecProto& scoring_spec) {
440 if (scoring_spec.advanced_scoring_expression().empty() &&
441 scoring_spec.additional_advanced_scoring_expressions().empty()) {
442 return scoring_spec.rank_by();
443 }
444
445 ICING_ASSIGN_OR_RETURN(bool has_relevance_score_function,
446 ScoringExpressionHasRelevanceScoreFunction(
447 scoring_spec.advanced_scoring_expression()));
448 if (has_relevance_score_function) {
449 return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
450 }
451 for (std::string_view additional_scoring_expression :
452 scoring_spec.additional_advanced_scoring_expressions()) {
453 ICING_ASSIGN_OR_RETURN(has_relevance_score_function,
454 ScoringExpressionHasRelevanceScoreFunction(
455 additional_scoring_expression));
456 if (has_relevance_score_function) {
457 return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
458 }
459 }
460 return ScoringSpecProto::RankingStrategy::NONE;
461 }
462
463 } // namespace
464
IcingSearchEngine(const IcingSearchEngineOptions & options,std::unique_ptr<const JniCache> jni_cache)465 IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
466 std::unique_ptr<const JniCache> jni_cache)
467 : IcingSearchEngine(options, std::make_unique<Filesystem>(),
468 std::make_unique<IcingFilesystem>(),
469 std::make_unique<Clock>(), std::move(jni_cache)) {}
470
IcingSearchEngine(IcingSearchEngineOptions options,std::unique_ptr<const Filesystem> filesystem,std::unique_ptr<const IcingFilesystem> icing_filesystem,std::unique_ptr<Clock> clock,std::unique_ptr<const JniCache> jni_cache)471 IcingSearchEngine::IcingSearchEngine(
472 IcingSearchEngineOptions options,
473 std::unique_ptr<const Filesystem> filesystem,
474 std::unique_ptr<const IcingFilesystem> icing_filesystem,
475 std::unique_ptr<Clock> clock, std::unique_ptr<const JniCache> jni_cache)
476 : options_(std::move(options)),
477 filesystem_(std::move(filesystem)),
478 icing_filesystem_(std::move(icing_filesystem)),
479 clock_(std::move(clock)),
480 jni_cache_(std::move(jni_cache)) {
481 ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
482 }
483
~IcingSearchEngine()484 IcingSearchEngine::~IcingSearchEngine() {
485 if (initialized_) {
486 if (PersistToDisk(PersistType::FULL).status().code() != StatusProto::OK) {
487 ICING_LOG(ERROR)
488 << "Error persisting to disk in IcingSearchEngine destructor";
489 }
490 }
491 }
492
Initialize()493 InitializeResultProto IcingSearchEngine::Initialize() {
494 // This method does both read and write so we need a writer lock. Using two
495 // locks (reader and writer) has the chance to be interrupted during
496 // switching.
497 absl_ports::unique_lock l(&mutex_);
498 return InternalInitialize();
499 }
500
// Releases every component owned by the engine by calling reset() on each
// member below.
// NOTE(review): the order of the calls is kept exactly as written; whether
// any component depends on another one still being alive is not visible
// here - confirm before reordering.
void IcingSearchEngine::ResetMembers() {
  schema_store_.reset();
  document_store_.reset();
  language_segmenter_.reset();
  normalizer_.reset();
  index_.reset();
  integer_index_.reset();
  qualified_id_join_index_.reset();
  embedding_index_.reset();
}
511
CheckInitMarkerFile(InitializeStatsProto * initialize_stats)512 libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile(
513 InitializeStatsProto* initialize_stats) {
514 // Check to see if the marker file exists and if we've already passed our max
515 // number of init attempts.
516 std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
517 bool file_exists = filesystem_->FileExists(marker_filepath.c_str());
518 int network_init_attempts = 0;
519 int host_init_attempts = 0;
520
521 // Read the number of previous failed init attempts from the file. If it
522 // fails, then just assume the value is zero (the most likely reason for
523 // failure would be non-existence because the last init was successful
524 // anyways).
525 std::unique_ptr<ScopedFd> marker_file_fd = std::make_unique<ScopedFd>(
526 filesystem_->OpenForWrite(marker_filepath.c_str()));
527 libtextclassifier3::Status status;
528 if (file_exists &&
529 filesystem_->PRead(marker_file_fd->get(), &network_init_attempts,
530 sizeof(network_init_attempts), /*offset=*/0)) {
531 host_init_attempts = GNetworkToHostL(network_init_attempts);
532 if (host_init_attempts > kMaxUnsuccessfulInitAttempts) {
533 // We're tried and failed to init too many times. We need to throw
534 // everything out and start from scratch.
535 ResetMembers();
536 marker_file_fd.reset();
537
538 // Delete the entire base directory.
539 if (!filesystem_->DeleteDirectoryRecursively(
540 options_.base_dir().c_str())) {
541 return absl_ports::InternalError("Failed to delete icing base dir!");
542 }
543
544 // Create the base directory again and reopen marker file.
545 if (!filesystem_->CreateDirectoryRecursively(
546 options_.base_dir().c_str())) {
547 return absl_ports::InternalError("Failed to create icing base dir!");
548 }
549
550 marker_file_fd = std::make_unique<ScopedFd>(
551 filesystem_->OpenForWrite(marker_filepath.c_str()));
552
553 status = absl_ports::DataLossError(
554 "Encountered failed initialization limit. Cleared all data.");
555 host_init_attempts = 0;
556 }
557 }
558
559 // Use network_init_attempts here because we might have set host_init_attempts
560 // to 0 if it exceeded the max threshold.
561 initialize_stats->set_num_previous_init_failures(
562 GNetworkToHostL(network_init_attempts));
563
564 ++host_init_attempts;
565 network_init_attempts = GHostToNetworkL(host_init_attempts);
566 // Write the updated number of attempts before we get started.
567 if (!filesystem_->PWrite(marker_file_fd->get(), /*offset=*/0,
568 &network_init_attempts,
569 sizeof(network_init_attempts)) ||
570 !filesystem_->DataSync(marker_file_fd->get())) {
571 return absl_ports::InternalError(
572 "Failed to write and sync init marker file");
573 }
574
575 return status;
576 }
577
InternalInitialize()578 InitializeResultProto IcingSearchEngine::InternalInitialize() {
579 ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
580 << options_.base_dir();
581
582 // Measure the latency of the initialization process.
583 std::unique_ptr<Timer> initialize_timer = clock_->GetNewTimer();
584
585 InitializeResultProto result_proto;
586 StatusProto* result_status = result_proto.mutable_status();
587 InitializeStatsProto* initialize_stats =
588 result_proto.mutable_initialize_stats();
589 if (initialized_) {
590 // Already initialized.
591 result_status->set_code(StatusProto::OK);
592 initialize_stats->set_latency_ms(
593 initialize_timer->GetElapsedMilliseconds());
594 initialize_stats->set_num_documents(document_store_->num_documents());
595 return result_proto;
596 }
597
598 // Now go ahead and try to initialize.
599 libtextclassifier3::Status status = InitializeMembers(initialize_stats);
600 if (status.ok() || absl_ports::IsDataLoss(status)) {
601 // We successfully initialized. We should delete the init marker file to
602 // indicate a successful init.
603 std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
604 if (!filesystem_->DeleteFile(marker_filepath.c_str())) {
605 status = absl_ports::InternalError("Failed to delete init marker file!");
606 } else {
607 initialized_ = true;
608 }
609 }
610 TransformStatus(status, result_status);
611 initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds());
612 return result_proto;
613 }
614
InitializeMembers(InitializeStatsProto * initialize_stats)615 libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
616 InitializeStatsProto* initialize_stats) {
617 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
618
619 // Make sure the base directory exists
620 if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
621 return absl_ports::InternalError(absl_ports::StrCat(
622 "Could not create directory: ", options_.base_dir()));
623 }
624
625 // Check to see if the marker file exists and if we've already passed our max
626 // number of init attempts.
627 libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats);
628 if (!status.ok() && !absl_ports::IsDataLoss(status)) {
629 return status;
630 }
631
632 // Do version and flags compatibility check
633 // Read version file, determine the state change and rebuild derived files if
634 // needed.
635 const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
636 ICING_ASSIGN_OR_RETURN(
637 IcingSearchEngineVersionProto stored_version_proto,
638 version_util::ReadVersion(
639 *filesystem_, /*version_file_dir=*/options_.base_dir(), index_dir));
640 version_util::VersionInfo stored_version_info =
641 version_util::GetVersionInfoFromProto(stored_version_proto);
642 version_util::StateChange version_state_change =
643 version_util::GetVersionStateChange(stored_version_info);
644
645 // Construct icing's current version proto based on the current code version
646 IcingSearchEngineVersionProto current_version_proto;
647 current_version_proto.set_version(version_util::kVersion);
648 current_version_proto.set_max_version(
649 std::max(stored_version_info.max_version, version_util::kVersion));
650 version_util::AddEnabledFeatures(options_, ¤t_version_proto);
651
652 // Step 1: If versions are incompatible, migrate schema according to the
653 // version state change.
654 if (version_state_change != version_util::StateChange::kCompatible) {
655 ICING_RETURN_IF_ERROR(SchemaStore::MigrateSchema(
656 filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()),
657 version_state_change, version_util::kVersion));
658 }
659
660 // Step 2: Discard derived files that need to be rebuilt
661 version_util::DerivedFilesRebuildResult required_derived_files_rebuild =
662 version_util::CalculateRequiredDerivedFilesRebuild(stored_version_proto,
663 current_version_proto);
664 ICING_RETURN_IF_ERROR(DiscardDerivedFiles(required_derived_files_rebuild));
665
666 // Step 3: update version files. We need to update both the V1 and V2
667 // version files.
668 ICING_RETURN_IF_ERROR(version_util::WriteV1Version(
669 *filesystem_, /*version_file_dir=*/options_.base_dir(),
670 version_util::GetVersionInfoFromProto(current_version_proto)));
671 ICING_RETURN_IF_ERROR(version_util::WriteV2Version(
672 *filesystem_, /*version_file_dir=*/options_.base_dir(),
673 std::make_unique<IcingSearchEngineVersionProto>(
674 std::move(current_version_proto))));
675
676 ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
677
678 // TODO(b/156383798) : Resolve how to specify the locale.
679 language_segmenter_factory::SegmenterOptions segmenter_options(
680 ULOC_US, jni_cache_.get());
681 TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create(
682 std::move(segmenter_options)));
683
684 TC3_ASSIGN_OR_RETURN(normalizer_,
685 normalizer_factory::Create(options_.max_token_length()));
686
687 std::string marker_filepath =
688 MakeSetSchemaMarkerFilePath(options_.base_dir());
689
690 libtextclassifier3::Status index_init_status;
691 if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
692 // The schema was either lost or never set before. Wipe out the doc store
693 // and index directories and initialize them from scratch.
694 const std::string doc_store_dir =
695 MakeDocumentDirectoryPath(options_.base_dir());
696 const std::string integer_index_dir =
697 MakeIntegerIndexWorkingPath(options_.base_dir());
698 const std::string qualified_id_join_index_dir =
699 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
700 const std::string embedding_index_dir =
701 MakeEmbeddingIndexWorkingPath(options_.base_dir());
702 if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) ||
703 !filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
704 !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() ||
705 !QualifiedIdJoinIndex::Discard(*filesystem_,
706 qualified_id_join_index_dir)
707 .ok() ||
708 !EmbeddingIndex::Discard(*filesystem_, embedding_index_dir).ok()) {
709 return absl_ports::InternalError(absl_ports::StrCat(
710 "Could not delete directories: ", index_dir, ", ", integer_index_dir,
711 ", ", qualified_id_join_index_dir, ", ", embedding_index_dir, " and ",
712 doc_store_dir));
713 }
714 ICING_ASSIGN_OR_RETURN(
715 bool document_store_derived_files_regenerated,
716 InitializeDocumentStore(
717 /*force_recovery_and_revalidate_documents=*/false,
718 initialize_stats));
719 index_init_status = InitializeIndex(
720 document_store_derived_files_regenerated, initialize_stats);
721 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
722 return index_init_status;
723 }
724 } else if (filesystem_->FileExists(marker_filepath.c_str())) {
725 // If the marker file is still around then something wonky happened when we
726 // last tried to set the schema.
727 //
728 // Since we're going to rebuild all indices in this case, the return value
729 // of InitializeDocumentStore (document_store_derived_files_regenerated) is
730 // unused.
731 ICING_RETURN_IF_ERROR(InitializeDocumentStore(
732 /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
733
734 // We're going to need to build the index from scratch. So just delete its
735 // directory now.
736 // Discard index directory and instantiate a new one.
737 Index::Options index_options(index_dir, options_.index_merge_size(),
738 /*lite_index_sort_at_indexing=*/true,
739 options_.lite_index_sort_size());
740 if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
741 !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
742 return absl_ports::InternalError(
743 absl_ports::StrCat("Could not recreate directory: ", index_dir));
744 }
745 ICING_ASSIGN_OR_RETURN(index_,
746 Index::Create(index_options, filesystem_.get(),
747 icing_filesystem_.get()));
748
749 // Discard integer index directory and instantiate a new one.
750 std::string integer_index_dir =
751 MakeIntegerIndexWorkingPath(options_.base_dir());
752 ICING_RETURN_IF_ERROR(
753 IntegerIndex::Discard(*filesystem_, integer_index_dir));
754 ICING_ASSIGN_OR_RETURN(
755 integer_index_,
756 IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
757 options_.integer_index_bucket_split_threshold(),
758 options_.pre_mapping_fbv()));
759
760 // Discard qualified id join index directory and instantiate a new one.
761 std::string qualified_id_join_index_dir =
762 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
763 ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
764 *filesystem_, qualified_id_join_index_dir));
765 ICING_ASSIGN_OR_RETURN(
766 qualified_id_join_index_,
767 CreateQualifiedIdJoinIndex(
768 *filesystem_, std::move(qualified_id_join_index_dir), options_));
769
770 // Discard embedding index directory and instantiate a new one.
771 std::string embedding_index_dir =
772 MakeEmbeddingIndexWorkingPath(options_.base_dir());
773 ICING_RETURN_IF_ERROR(
774 EmbeddingIndex::Discard(*filesystem_, embedding_index_dir));
775 ICING_ASSIGN_OR_RETURN(
776 embedding_index_,
777 EmbeddingIndex::Create(filesystem_.get(), embedding_index_dir));
778
779 std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
780 IndexRestorationResult restore_result = RestoreIndexIfNeeded();
781 index_init_status = std::move(restore_result.status);
782 // DATA_LOSS means that we have successfully initialized and re-added
783 // content to the index. Some indexed content was lost, but otherwise the
784 // index is in a valid state and can be queried.
785 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
786 return index_init_status;
787 }
788
789 // Delete the marker file to indicate that everything is now in sync with
790 // whatever changes were made to the schema.
791 filesystem_->DeleteFile(marker_filepath.c_str());
792
793 initialize_stats->set_index_restoration_latency_ms(
794 restore_timer->GetElapsedMilliseconds());
795 initialize_stats->set_index_restoration_cause(
796 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
797 initialize_stats->set_integer_index_restoration_cause(
798 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
799 initialize_stats->set_qualified_id_join_index_restoration_cause(
800 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
801 initialize_stats->set_embedding_index_restoration_cause(
802 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
803 } else if (version_state_change != version_util::StateChange::kCompatible) {
804 ICING_ASSIGN_OR_RETURN(bool document_store_derived_files_regenerated,
805 InitializeDocumentStore(
806 /*force_recovery_and_revalidate_documents=*/true,
807 initialize_stats));
808 index_init_status = InitializeIndex(
809 document_store_derived_files_regenerated, initialize_stats);
810 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
811 return index_init_status;
812 }
813
814 initialize_stats->set_schema_store_recovery_cause(
815 InitializeStatsProto::VERSION_CHANGED);
816 initialize_stats->set_document_store_recovery_cause(
817 InitializeStatsProto::VERSION_CHANGED);
818 initialize_stats->set_index_restoration_cause(
819 InitializeStatsProto::VERSION_CHANGED);
820 initialize_stats->set_integer_index_restoration_cause(
821 InitializeStatsProto::VERSION_CHANGED);
822 initialize_stats->set_qualified_id_join_index_restoration_cause(
823 InitializeStatsProto::VERSION_CHANGED);
824 initialize_stats->set_embedding_index_restoration_cause(
825 InitializeStatsProto::VERSION_CHANGED);
826 } else {
827 ICING_ASSIGN_OR_RETURN(
828 bool document_store_derived_files_regenerated,
829 InitializeDocumentStore(
830 /*force_recovery_and_revalidate_documents=*/false,
831 initialize_stats));
832 index_init_status = InitializeIndex(
833 document_store_derived_files_regenerated, initialize_stats);
834 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
835 return index_init_status;
836 }
837
838 // Set recovery cause to FEATURE_FLAG_CHANGED according to the calculated
839 // required_derived_files_rebuild
840 if (required_derived_files_rebuild
841 .needs_document_store_derived_files_rebuild) {
842 initialize_stats->set_document_store_recovery_cause(
843 InitializeStatsProto::FEATURE_FLAG_CHANGED);
844 }
845 if (required_derived_files_rebuild
846 .needs_schema_store_derived_files_rebuild) {
847 initialize_stats->set_schema_store_recovery_cause(
848 InitializeStatsProto::FEATURE_FLAG_CHANGED);
849 }
850 if (required_derived_files_rebuild.needs_term_index_rebuild) {
851 initialize_stats->set_index_restoration_cause(
852 InitializeStatsProto::FEATURE_FLAG_CHANGED);
853 }
854 if (required_derived_files_rebuild.needs_integer_index_rebuild) {
855 initialize_stats->set_integer_index_restoration_cause(
856 InitializeStatsProto::FEATURE_FLAG_CHANGED);
857 }
858 if (required_derived_files_rebuild.needs_qualified_id_join_index_rebuild) {
859 initialize_stats->set_qualified_id_join_index_restoration_cause(
860 InitializeStatsProto::FEATURE_FLAG_CHANGED);
861 }
862 // TODO(b/326656531): Update version-util to consider embedding index.
863 }
864
865 if (status.ok()) {
866 status = index_init_status;
867 }
868
869 result_state_manager_ = std::make_unique<ResultStateManager>(
870 performance_configuration_.max_num_total_hits, *document_store_);
871
872 return status;
873 }
874
InitializeSchemaStore(InitializeStatsProto * initialize_stats)875 libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
876 InitializeStatsProto* initialize_stats) {
877 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
878
879 const std::string schema_store_dir =
880 MakeSchemaDirectoryPath(options_.base_dir());
881 // Make sure the sub-directory exists
882 if (!filesystem_->CreateDirectoryRecursively(schema_store_dir.c_str())) {
883 return absl_ports::InternalError(
884 absl_ports::StrCat("Could not create directory: ", schema_store_dir));
885 }
886 ICING_ASSIGN_OR_RETURN(
887 schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir,
888 clock_.get(), initialize_stats));
889
890 return libtextclassifier3::Status::OK;
891 }
892
InitializeDocumentStore(bool force_recovery_and_revalidate_documents,InitializeStatsProto * initialize_stats)893 libtextclassifier3::StatusOr<bool> IcingSearchEngine::InitializeDocumentStore(
894 bool force_recovery_and_revalidate_documents,
895 InitializeStatsProto* initialize_stats) {
896 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
897
898 const std::string document_dir =
899 MakeDocumentDirectoryPath(options_.base_dir());
900 // Make sure the sub-directory exists
901 if (!filesystem_->CreateDirectoryRecursively(document_dir.c_str())) {
902 return absl_ports::InternalError(
903 absl_ports::StrCat("Could not create directory: ", document_dir));
904 }
905 ICING_ASSIGN_OR_RETURN(
906 DocumentStore::CreateResult create_result,
907 DocumentStore::Create(
908 filesystem_.get(), document_dir, clock_.get(), schema_store_.get(),
909 force_recovery_and_revalidate_documents,
910 /*document_store_namespace_id_fingerprint=*/true,
911 /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
912 options_.compression_level(), initialize_stats));
913 document_store_ = std::move(create_result.document_store);
914
915 return create_result.derived_files_regenerated;
916 }
917
InitializeIndex(bool document_store_derived_files_regenerated,InitializeStatsProto * initialize_stats)918 libtextclassifier3::Status IcingSearchEngine::InitializeIndex(
919 bool document_store_derived_files_regenerated,
920 InitializeStatsProto* initialize_stats) {
921 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
922
923 const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
924 // Make sure the sub-directory exists
925 if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
926 return absl_ports::InternalError(
927 absl_ports::StrCat("Could not create directory: ", index_dir));
928 }
929 Index::Options index_options(index_dir, options_.index_merge_size(),
930 /*lite_index_sort_at_indexing=*/true,
931 options_.lite_index_sort_size());
932
933 // Term index
934 InitializeStatsProto::RecoveryCause index_recovery_cause;
935 auto index_or =
936 Index::Create(index_options, filesystem_.get(), icing_filesystem_.get());
937 if (!index_or.ok()) {
938 if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
939 !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
940 return absl_ports::InternalError(
941 absl_ports::StrCat("Could not recreate directory: ", index_dir));
942 }
943
944 index_recovery_cause = InitializeStatsProto::IO_ERROR;
945
946 // Try recreating it from scratch and re-indexing everything.
947 ICING_ASSIGN_OR_RETURN(index_,
948 Index::Create(index_options, filesystem_.get(),
949 icing_filesystem_.get()));
950 } else {
951 // Index was created fine.
952 index_ = std::move(index_or).ValueOrDie();
953 // If a recover does have to happen, then it must be because the index is
954 // out of sync with the document store.
955 index_recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
956 }
957
958 // Integer index
959 std::string integer_index_dir =
960 MakeIntegerIndexWorkingPath(options_.base_dir());
961 InitializeStatsProto::RecoveryCause integer_index_recovery_cause;
962 auto integer_index_or =
963 IntegerIndex::Create(*filesystem_, integer_index_dir,
964 options_.integer_index_bucket_split_threshold(),
965 options_.pre_mapping_fbv());
966 if (!integer_index_or.ok()) {
967 ICING_RETURN_IF_ERROR(
968 IntegerIndex::Discard(*filesystem_, integer_index_dir));
969
970 integer_index_recovery_cause = InitializeStatsProto::IO_ERROR;
971
972 // Try recreating it from scratch and re-indexing everything.
973 ICING_ASSIGN_OR_RETURN(
974 integer_index_,
975 IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
976 options_.integer_index_bucket_split_threshold(),
977 options_.pre_mapping_fbv()));
978 } else {
979 // Integer index was created fine.
980 integer_index_ = std::move(integer_index_or).ValueOrDie();
981 // If a recover does have to happen, then it must be because the index is
982 // out of sync with the document store.
983 integer_index_recovery_cause =
984 InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
985 }
986
987 // Qualified id join index
988 std::string qualified_id_join_index_dir =
989 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
990 InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause;
991 if (document_store_derived_files_regenerated &&
992 IsV2QualifiedIdJoinIndexEnabled(options_)) {
993 // V2 qualified id join index depends on document store derived files, so we
994 // have to rebuild it from scratch if
995 // document_store_derived_files_regenerated is true.
996 ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
997 *filesystem_, qualified_id_join_index_dir));
998
999 ICING_ASSIGN_OR_RETURN(
1000 qualified_id_join_index_,
1001 CreateQualifiedIdJoinIndex(
1002 *filesystem_, std::move(qualified_id_join_index_dir), options_));
1003
1004 qualified_id_join_index_recovery_cause =
1005 InitializeStatsProto::DEPENDENCIES_CHANGED;
1006 } else {
1007 auto qualified_id_join_index_or = CreateQualifiedIdJoinIndex(
1008 *filesystem_, qualified_id_join_index_dir, options_);
1009 if (!qualified_id_join_index_or.ok()) {
1010 ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
1011 *filesystem_, qualified_id_join_index_dir));
1012
1013 qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1014
1015 // Try recreating it from scratch and rebuild everything.
1016 ICING_ASSIGN_OR_RETURN(
1017 qualified_id_join_index_,
1018 CreateQualifiedIdJoinIndex(
1019 *filesystem_, std::move(qualified_id_join_index_dir), options_));
1020 } else {
1021 // Qualified id join index was created fine.
1022 qualified_id_join_index_ =
1023 std::move(qualified_id_join_index_or).ValueOrDie();
1024 // If a recover does have to happen, then it must be because the index is
1025 // out of sync with the document store.
1026 qualified_id_join_index_recovery_cause =
1027 InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1028 }
1029 }
1030
1031 // Embedding index
1032 const std::string embedding_dir =
1033 MakeEmbeddingIndexWorkingPath(options_.base_dir());
1034 InitializeStatsProto::RecoveryCause embedding_index_recovery_cause;
1035 auto embedding_index_or =
1036 EmbeddingIndex::Create(filesystem_.get(), embedding_dir);
1037 if (!embedding_index_or.ok()) {
1038 ICING_RETURN_IF_ERROR(EmbeddingIndex::Discard(*filesystem_, embedding_dir));
1039
1040 embedding_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1041
1042 // Try recreating it from scratch and re-indexing everything.
1043 ICING_ASSIGN_OR_RETURN(
1044 embedding_index_,
1045 EmbeddingIndex::Create(filesystem_.get(), embedding_dir));
1046 } else {
1047 // Embedding index was created fine.
1048 embedding_index_ = std::move(embedding_index_or).ValueOrDie();
1049 // If a recover does have to happen, then it must be because the index is
1050 // out of sync with the document store.
1051 embedding_index_recovery_cause =
1052 InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1053 }
1054
1055 std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
1056 IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1057 if (restore_result.index_needed_restoration ||
1058 restore_result.integer_index_needed_restoration ||
1059 restore_result.qualified_id_join_index_needed_restoration) {
1060 initialize_stats->set_index_restoration_latency_ms(
1061 restore_timer->GetElapsedMilliseconds());
1062
1063 if (restore_result.index_needed_restoration) {
1064 initialize_stats->set_index_restoration_cause(index_recovery_cause);
1065 }
1066 if (restore_result.integer_index_needed_restoration) {
1067 initialize_stats->set_integer_index_restoration_cause(
1068 integer_index_recovery_cause);
1069 }
1070 if (restore_result.qualified_id_join_index_needed_restoration) {
1071 initialize_stats->set_qualified_id_join_index_restoration_cause(
1072 qualified_id_join_index_recovery_cause);
1073 }
1074 if (restore_result.embedding_index_needed_restoration) {
1075 initialize_stats->set_embedding_index_restoration_cause(
1076 embedding_index_recovery_cause);
1077 }
1078 }
1079 return restore_result.status;
1080 }
1081
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents)1082 SetSchemaResultProto IcingSearchEngine::SetSchema(
1083 const SchemaProto& new_schema, bool ignore_errors_and_delete_documents) {
1084 return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
1085 }
1086
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents)1087 SetSchemaResultProto IcingSearchEngine::SetSchema(
1088 SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) {
1089 ICING_VLOG(1) << "Setting new Schema";
1090
1091 SetSchemaResultProto result_proto;
1092 StatusProto* result_status = result_proto.mutable_status();
1093
1094 absl_ports::unique_lock l(&mutex_);
1095 ScopedTimer timer(clock_->GetNewTimer(), [&result_proto](int64_t t) {
1096 result_proto.set_latency_ms(t);
1097 });
1098 if (!initialized_) {
1099 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1100 result_status->set_message("IcingSearchEngine has not been initialized!");
1101 return result_proto;
1102 }
1103
1104 auto lost_previous_schema_or = LostPreviousSchema();
1105 if (!lost_previous_schema_or.ok()) {
1106 TransformStatus(lost_previous_schema_or.status(), result_status);
1107 return result_proto;
1108 }
1109 bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
1110
1111 std::string marker_filepath =
1112 MakeSetSchemaMarkerFilePath(options_.base_dir());
1113 // Create the marker file indicating that we are going to apply a schema
1114 // change. No need to write anything to the marker file - its existence is the
1115 // only thing that matters. The marker file is used to indicate if we
1116 // encountered a crash or a power loss while updating the schema and other
1117 // files. So set it up to be deleted as long as we return from this function.
1118 DestructibleFile marker_file(marker_filepath, filesystem_.get());
1119
1120 auto set_schema_result_or = schema_store_->SetSchema(
1121 std::move(new_schema), ignore_errors_and_delete_documents,
1122 options_.allow_circular_schema_definitions());
1123 if (!set_schema_result_or.ok()) {
1124 TransformStatus(set_schema_result_or.status(), result_status);
1125 return result_proto;
1126 }
1127 SchemaStore::SetSchemaResult set_schema_result =
1128 std::move(set_schema_result_or).ValueOrDie();
1129
1130 for (const std::string& deleted_type :
1131 set_schema_result.schema_types_deleted_by_name) {
1132 result_proto.add_deleted_schema_types(deleted_type);
1133 }
1134
1135 for (const std::string& incompatible_type :
1136 set_schema_result.schema_types_incompatible_by_name) {
1137 result_proto.add_incompatible_schema_types(incompatible_type);
1138 }
1139
1140 for (const std::string& new_type :
1141 set_schema_result.schema_types_new_by_name) {
1142 result_proto.add_new_schema_types(std::move(new_type));
1143 }
1144
1145 for (const std::string& compatible_type :
1146 set_schema_result.schema_types_changed_fully_compatible_by_name) {
1147 result_proto.add_fully_compatible_changed_schema_types(
1148 std::move(compatible_type));
1149 }
1150
1151 bool index_incompatible =
1152 !set_schema_result.schema_types_index_incompatible_by_name.empty();
1153 for (const std::string& index_incompatible_type :
1154 set_schema_result.schema_types_index_incompatible_by_name) {
1155 result_proto.add_index_incompatible_changed_schema_types(
1156 std::move(index_incompatible_type));
1157 }
1158
1159 bool join_incompatible =
1160 !set_schema_result.schema_types_join_incompatible_by_name.empty();
1161 for (const std::string& join_incompatible_type :
1162 set_schema_result.schema_types_join_incompatible_by_name) {
1163 result_proto.add_join_incompatible_changed_schema_types(
1164 std::move(join_incompatible_type));
1165 }
1166
1167 libtextclassifier3::Status status;
1168 if (set_schema_result.success) {
1169 if (lost_previous_schema) {
1170 // No previous schema to calculate a diff against. We have to go through
1171 // and revalidate all the Documents in the DocumentStore
1172 status = document_store_->UpdateSchemaStore(schema_store_.get());
1173 if (!status.ok()) {
1174 TransformStatus(status, result_status);
1175 return result_proto;
1176 }
1177 } else if (!set_schema_result.old_schema_type_ids_changed.empty() ||
1178 !set_schema_result.schema_types_incompatible_by_id.empty() ||
1179 !set_schema_result.schema_types_deleted_by_id.empty()) {
1180 status = document_store_->OptimizedUpdateSchemaStore(schema_store_.get(),
1181 set_schema_result);
1182 if (!status.ok()) {
1183 TransformStatus(status, result_status);
1184 return result_proto;
1185 }
1186 }
1187
1188 if (lost_previous_schema || index_incompatible) {
1189 // Clears search indices
1190 status = ClearSearchIndices();
1191 if (!status.ok()) {
1192 TransformStatus(status, result_status);
1193 return result_proto;
1194 }
1195 }
1196
1197 if (lost_previous_schema || join_incompatible) {
1198 // Clears join indices
1199 status = ClearJoinIndices();
1200 if (!status.ok()) {
1201 TransformStatus(status, result_status);
1202 return result_proto;
1203 }
1204 }
1205
1206 if (lost_previous_schema || index_incompatible || join_incompatible) {
1207 IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1208 // DATA_LOSS means that we have successfully re-added content to the
1209 // index. Some indexed content was lost, but otherwise the index is in a
1210 // valid state and can be queried.
1211 if (!restore_result.status.ok() &&
1212 !absl_ports::IsDataLoss(restore_result.status)) {
1213 TransformStatus(status, result_status);
1214 return result_proto;
1215 }
1216 }
1217
1218 result_status->set_code(StatusProto::OK);
1219 } else {
1220 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1221 result_status->set_message("Schema is incompatible.");
1222 }
1223
1224 return result_proto;
1225 }
1226
GetSchema()1227 GetSchemaResultProto IcingSearchEngine::GetSchema() {
1228 GetSchemaResultProto result_proto;
1229 StatusProto* result_status = result_proto.mutable_status();
1230
1231 absl_ports::shared_lock l(&mutex_);
1232 if (!initialized_) {
1233 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1234 result_status->set_message("IcingSearchEngine has not been initialized!");
1235 return result_proto;
1236 }
1237
1238 auto schema_or = schema_store_->GetSchema();
1239 if (!schema_or.ok()) {
1240 TransformStatus(schema_or.status(), result_status);
1241 return result_proto;
1242 }
1243
1244 result_status->set_code(StatusProto::OK);
1245 *result_proto.mutable_schema() = *std::move(schema_or).ValueOrDie();
1246 return result_proto;
1247 }
1248
GetSchemaType(std::string_view schema_type)1249 GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType(
1250 std::string_view schema_type) {
1251 GetSchemaTypeResultProto result_proto;
1252 StatusProto* result_status = result_proto.mutable_status();
1253
1254 absl_ports::shared_lock l(&mutex_);
1255 if (!initialized_) {
1256 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1257 result_status->set_message("IcingSearchEngine has not been initialized!");
1258 return result_proto;
1259 }
1260
1261 auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type);
1262 if (!type_config_or.ok()) {
1263 TransformStatus(type_config_or.status(), result_status);
1264 return result_proto;
1265 }
1266
1267 result_status->set_code(StatusProto::OK);
1268 *result_proto.mutable_schema_type_config() = *(type_config_or.ValueOrDie());
1269 return result_proto;
1270 }
1271
Put(const DocumentProto & document)1272 PutResultProto IcingSearchEngine::Put(const DocumentProto& document) {
1273 return Put(DocumentProto(document));
1274 }
1275
Put(DocumentProto && document)1276 PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
1277 ICING_VLOG(1) << "Writing document to document store";
1278
1279 PutResultProto result_proto;
1280 StatusProto* result_status = result_proto.mutable_status();
1281 PutDocumentStatsProto* put_document_stats =
1282 result_proto.mutable_put_document_stats();
1283 ScopedTimer put_timer(clock_->GetNewTimer(), [put_document_stats](int64_t t) {
1284 put_document_stats->set_latency_ms(t);
1285 });
1286
1287 // Lock must be acquired before validation because the DocumentStore uses
1288 // the schema file to validate, and the schema could be changed in
1289 // SetSchema() which is protected by the same mutex.
1290 absl_ports::unique_lock l(&mutex_);
1291 if (!initialized_) {
1292 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1293 result_status->set_message("IcingSearchEngine has not been initialized!");
1294 return result_proto;
1295 }
1296
1297 auto tokenized_document_or = TokenizedDocument::Create(
1298 schema_store_.get(), language_segmenter_.get(), std::move(document));
1299 if (!tokenized_document_or.ok()) {
1300 TransformStatus(tokenized_document_or.status(), result_status);
1301 return result_proto;
1302 }
1303 TokenizedDocument tokenized_document(
1304 std::move(tokenized_document_or).ValueOrDie());
1305
1306 auto document_id_or = document_store_->Put(
1307 tokenized_document.document(), tokenized_document.num_string_tokens(),
1308 put_document_stats);
1309 if (!document_id_or.ok()) {
1310 TransformStatus(document_id_or.status(), result_status);
1311 return result_proto;
1312 }
1313 DocumentId document_id = document_id_or.ValueOrDie();
1314
1315 auto data_indexing_handlers_or = CreateDataIndexingHandlers();
1316 if (!data_indexing_handlers_or.ok()) {
1317 TransformStatus(data_indexing_handlers_or.status(), result_status);
1318 return result_proto;
1319 }
1320 IndexProcessor index_processor(
1321 std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get());
1322
1323 auto index_status = index_processor.IndexDocument(
1324 tokenized_document, document_id, put_document_stats);
1325 // Getting an internal error from the index could possibly mean that the index
1326 // is broken. Try to rebuild them to recover.
1327 if (absl_ports::IsInternal(index_status)) {
1328 ICING_LOG(ERROR) << "Got an internal error from the index. Trying to "
1329 "rebuild the index!\n"
1330 << index_status.error_message();
1331 index_status = ClearAllIndices();
1332 if (index_status.ok()) {
1333 index_status = RestoreIndexIfNeeded().status;
1334 if (!index_status.ok()) {
1335 ICING_LOG(ERROR) << "Failed to reindex documents after a failure of "
1336 "indexing a document.";
1337 }
1338 } else {
1339 ICING_LOG(ERROR)
1340 << "Failed to clear indices after a failure of indexing a document.";
1341 }
1342 }
1343
1344 if (!index_status.ok()) {
1345 // If we encountered a failure or cannot resolve an internal error while
1346 // indexing this document, then mark it as deleted.
1347 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1348 libtextclassifier3::Status delete_status =
1349 document_store_->Delete(document_id, current_time_ms);
1350 if (!delete_status.ok()) {
1351 // This is pretty dire (and, hopefully, unlikely). We can't roll back the
1352 // document that we just added. Wipeout the whole index.
1353 ICING_LOG(ERROR) << "Cannot delete the document that is failed to index. "
1354 "Wiping out the whole Icing search engine.";
1355 ResetInternal();
1356 }
1357 }
1358
1359 TransformStatus(index_status, result_status);
1360 return result_proto;
1361 }
1362
Get(const std::string_view name_space,const std::string_view uri,const GetResultSpecProto & result_spec)1363 GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
1364 const std::string_view uri,
1365 const GetResultSpecProto& result_spec) {
1366 GetResultProto result_proto;
1367 StatusProto* result_status = result_proto.mutable_status();
1368
1369 absl_ports::shared_lock l(&mutex_);
1370 if (!initialized_) {
1371 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1372 result_status->set_message("IcingSearchEngine has not been initialized!");
1373 return result_proto;
1374 }
1375
1376 auto document_or = document_store_->Get(name_space, uri);
1377 if (!document_or.ok()) {
1378 TransformStatus(document_or.status(), result_status);
1379 return result_proto;
1380 }
1381
1382 DocumentProto document = std::move(document_or).ValueOrDie();
1383 std::unique_ptr<ProjectionTree> type_projection_tree;
1384 std::unique_ptr<ProjectionTree> wildcard_projection_tree;
1385 for (const SchemaStore::ExpandedTypePropertyMask& type_field_mask :
1386 schema_store_->ExpandTypePropertyMasks(
1387 result_spec.type_property_masks())) {
1388 if (type_field_mask.schema_type == document.schema()) {
1389 type_projection_tree = std::make_unique<ProjectionTree>(type_field_mask);
1390 } else if (type_field_mask.schema_type ==
1391 SchemaStore::kSchemaTypeWildcard) {
1392 wildcard_projection_tree =
1393 std::make_unique<ProjectionTree>(type_field_mask);
1394 }
1395 }
1396
1397 // Apply projection
1398 if (type_projection_tree != nullptr) {
1399 projector::Project(type_projection_tree->root().children, &document);
1400 } else if (wildcard_projection_tree != nullptr) {
1401 projector::Project(wildcard_projection_tree->root().children, &document);
1402 }
1403
1404 result_status->set_code(StatusProto::OK);
1405 *result_proto.mutable_document() = std::move(document);
1406 return result_proto;
1407 }
1408
ReportUsage(const UsageReport & usage_report)1409 ReportUsageResultProto IcingSearchEngine::ReportUsage(
1410 const UsageReport& usage_report) {
1411 ReportUsageResultProto result_proto;
1412 StatusProto* result_status = result_proto.mutable_status();
1413
1414 absl_ports::unique_lock l(&mutex_);
1415 if (!initialized_) {
1416 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1417 result_status->set_message("IcingSearchEngine has not been initialized!");
1418 return result_proto;
1419 }
1420
1421 libtextclassifier3::Status status =
1422 document_store_->ReportUsage(usage_report);
1423 TransformStatus(status, result_status);
1424 return result_proto;
1425 }
1426
GetAllNamespaces()1427 GetAllNamespacesResultProto IcingSearchEngine::GetAllNamespaces() {
1428 GetAllNamespacesResultProto result_proto;
1429 StatusProto* result_status = result_proto.mutable_status();
1430
1431 absl_ports::shared_lock l(&mutex_);
1432 if (!initialized_) {
1433 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1434 result_status->set_message("IcingSearchEngine has not been initialized!");
1435 return result_proto;
1436 }
1437
1438 std::vector<std::string> namespaces = document_store_->GetAllNamespaces();
1439
1440 for (const std::string& namespace_ : namespaces) {
1441 result_proto.add_namespaces(namespace_);
1442 }
1443
1444 result_status->set_code(StatusProto::OK);
1445 return result_proto;
1446 }
1447
Delete(const std::string_view name_space,const std::string_view uri)1448 DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
1449 const std::string_view uri) {
1450 ICING_VLOG(1) << "Deleting document from doc store";
1451
1452 DeleteResultProto result_proto;
1453 StatusProto* result_status = result_proto.mutable_status();
1454
1455 absl_ports::unique_lock l(&mutex_);
1456 if (!initialized_) {
1457 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1458 result_status->set_message("IcingSearchEngine has not been initialized!");
1459 return result_proto;
1460 }
1461
1462 DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats();
1463 delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
1464
1465 std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1466 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1467 // that can support error logging.
1468 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1469 libtextclassifier3::Status status =
1470 document_store_->Delete(name_space, uri, current_time_ms);
1471 if (!status.ok()) {
1472 LogSeverity::Code severity = ERROR;
1473 if (absl_ports::IsNotFound(status)) {
1474 severity = DBG;
1475 }
1476 ICING_LOG(severity) << status.error_message()
1477 << "Failed to delete Document. namespace: "
1478 << name_space << ", uri: " << uri;
1479 TransformStatus(status, result_status);
1480 return result_proto;
1481 }
1482
1483 result_status->set_code(StatusProto::OK);
1484 delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1485 delete_stats->set_num_documents_deleted(1);
1486 return result_proto;
1487 }
1488
DeleteByNamespace(const std::string_view name_space)1489 DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
1490 const std::string_view name_space) {
1491 ICING_VLOG(1) << "Deleting namespace from doc store";
1492
1493 DeleteByNamespaceResultProto delete_result;
1494 StatusProto* result_status = delete_result.mutable_status();
1495 absl_ports::unique_lock l(&mutex_);
1496 if (!initialized_) {
1497 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1498 result_status->set_message("IcingSearchEngine has not been initialized!");
1499 return delete_result;
1500 }
1501
1502 DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1503 delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
1504
1505 std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1506 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1507 // that can support error logging.
1508 DocumentStore::DeleteByGroupResult doc_store_result =
1509 document_store_->DeleteByNamespace(name_space);
1510 if (!doc_store_result.status.ok()) {
1511 ICING_LOG(ERROR) << doc_store_result.status.error_message()
1512 << "Failed to delete Namespace: " << name_space;
1513 TransformStatus(doc_store_result.status, result_status);
1514 return delete_result;
1515 }
1516
1517 result_status->set_code(StatusProto::OK);
1518 delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1519 delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1520 return delete_result;
1521 }
1522
DeleteBySchemaType(const std::string_view schema_type)1523 DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
1524 const std::string_view schema_type) {
1525 ICING_VLOG(1) << "Deleting type from doc store";
1526
1527 DeleteBySchemaTypeResultProto delete_result;
1528 StatusProto* result_status = delete_result.mutable_status();
1529 absl_ports::unique_lock l(&mutex_);
1530 if (!initialized_) {
1531 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1532 result_status->set_message("IcingSearchEngine has not been initialized!");
1533 return delete_result;
1534 }
1535
1536 DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1537 delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
1538
1539 std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1540 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1541 // that can support error logging.
1542 DocumentStore::DeleteByGroupResult doc_store_result =
1543 document_store_->DeleteBySchemaType(schema_type);
1544 if (!doc_store_result.status.ok()) {
1545 ICING_LOG(ERROR) << doc_store_result.status.error_message()
1546 << "Failed to delete SchemaType: " << schema_type;
1547 TransformStatus(doc_store_result.status, result_status);
1548 return delete_result;
1549 }
1550
1551 result_status->set_code(StatusProto::OK);
1552 delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1553 delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1554 return delete_result;
1555 }
1556
DeleteByQuery(const SearchSpecProto & search_spec,bool return_deleted_document_info)1557 DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
1558 const SearchSpecProto& search_spec, bool return_deleted_document_info) {
1559 ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
1560 << " from doc store";
1561
1562 DeleteByQueryResultProto result_proto;
1563 StatusProto* result_status = result_proto.mutable_status();
1564
1565 absl_ports::unique_lock l(&mutex_);
1566 if (!initialized_) {
1567 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1568 result_status->set_message("IcingSearchEngine has not been initialized!");
1569 return result_proto;
1570 }
1571
1572 DeleteByQueryStatsProto* delete_stats =
1573 result_proto.mutable_delete_by_query_stats();
1574 delete_stats->set_query_length(search_spec.query().length());
1575 delete_stats->set_num_namespaces_filtered(
1576 search_spec.namespace_filters_size());
1577 delete_stats->set_num_schema_types_filtered(
1578 search_spec.schema_type_filters_size());
1579
1580 ScopedTimer delete_timer(clock_->GetNewTimer(), [delete_stats](int64_t t) {
1581 delete_stats->set_latency_ms(t);
1582 });
1583 libtextclassifier3::Status status =
1584 ValidateSearchSpec(search_spec, performance_configuration_);
1585 if (!status.ok()) {
1586 TransformStatus(status, result_status);
1587 return result_proto;
1588 }
1589
1590 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
1591 // Gets unordered results from query processor
1592 auto query_processor_or = QueryProcessor::Create(
1593 index_.get(), integer_index_.get(), embedding_index_.get(),
1594 language_segmenter_.get(), normalizer_.get(), document_store_.get(),
1595 schema_store_.get(), clock_.get());
1596 if (!query_processor_or.ok()) {
1597 TransformStatus(query_processor_or.status(), result_status);
1598 delete_stats->set_parse_query_latency_ms(
1599 component_timer->GetElapsedMilliseconds());
1600 return result_proto;
1601 }
1602 std::unique_ptr<QueryProcessor> query_processor =
1603 std::move(query_processor_or).ValueOrDie();
1604
1605 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1606 auto query_results_or = query_processor->ParseSearch(
1607 search_spec, ScoringSpecProto::RankingStrategy::NONE, current_time_ms);
1608 if (!query_results_or.ok()) {
1609 TransformStatus(query_results_or.status(), result_status);
1610 delete_stats->set_parse_query_latency_ms(
1611 component_timer->GetElapsedMilliseconds());
1612 return result_proto;
1613 }
1614 QueryResults query_results = std::move(query_results_or).ValueOrDie();
1615 delete_stats->set_parse_query_latency_ms(
1616 component_timer->GetElapsedMilliseconds());
1617
1618 ICING_VLOG(2) << "Deleting the docs that matched the query.";
1619 int num_deleted = 0;
1620 // A map used to group deleted documents.
1621 // From the (namespace, type) pair to a list of uris.
1622 std::unordered_map<NamespaceTypePair,
1623 DeleteByQueryResultProto::DocumentGroupInfo*,
1624 NamespaceTypePairHasher>
1625 deleted_info_map;
1626
1627 component_timer = clock_->GetNewTimer();
1628 while (query_results.root_iterator->Advance().ok()) {
1629 ICING_VLOG(3) << "Deleting doc "
1630 << query_results.root_iterator->doc_hit_info().document_id();
1631 ++num_deleted;
1632 if (return_deleted_document_info) {
1633 status = RetrieveAndAddDocumentInfo(
1634 document_store_.get(), result_proto, deleted_info_map,
1635 query_results.root_iterator->doc_hit_info().document_id());
1636 if (!status.ok()) {
1637 TransformStatus(status, result_status);
1638 delete_stats->set_document_removal_latency_ms(
1639 component_timer->GetElapsedMilliseconds());
1640 return result_proto;
1641 }
1642 }
1643 status = document_store_->Delete(
1644 query_results.root_iterator->doc_hit_info().document_id(),
1645 current_time_ms);
1646 if (!status.ok()) {
1647 TransformStatus(status, result_status);
1648 delete_stats->set_document_removal_latency_ms(
1649 component_timer->GetElapsedMilliseconds());
1650 return result_proto;
1651 }
1652 }
1653 delete_stats->set_document_removal_latency_ms(
1654 component_timer->GetElapsedMilliseconds());
1655 int term_count = 0;
1656 for (const auto& section_and_terms : query_results.query_terms) {
1657 term_count += section_and_terms.second.size();
1658 }
1659 delete_stats->set_num_terms(term_count);
1660
1661 if (num_deleted > 0) {
1662 result_proto.mutable_status()->set_code(StatusProto::OK);
1663 } else {
1664 result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
1665 result_proto.mutable_status()->set_message(
1666 "No documents matched the query to delete by!");
1667 }
1668 delete_stats->set_num_documents_deleted(num_deleted);
1669 return result_proto;
1670 }
1671
PersistToDisk(PersistType::Code persist_type)1672 PersistToDiskResultProto IcingSearchEngine::PersistToDisk(
1673 PersistType::Code persist_type) {
1674 ICING_VLOG(1) << "Persisting data to disk";
1675
1676 PersistToDiskResultProto result_proto;
1677 StatusProto* result_status = result_proto.mutable_status();
1678
1679 absl_ports::unique_lock l(&mutex_);
1680 if (!initialized_) {
1681 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1682 result_status->set_message("IcingSearchEngine has not been initialized!");
1683 return result_proto;
1684 }
1685
1686 auto status = InternalPersistToDisk(persist_type);
1687 TransformStatus(status, result_status);
1688 return result_proto;
1689 }
1690
1691 // Optimizes Icing's storage
1692 //
1693 // Steps:
1694 // 1. Flush data to disk.
1695 // 2. Copy data needed to a tmp directory.
1696 // 3. Swap current directory and tmp directory.
Optimize()1697 OptimizeResultProto IcingSearchEngine::Optimize() {
1698 ICING_VLOG(1) << "Optimizing icing storage";
1699
1700 OptimizeResultProto result_proto;
1701 StatusProto* result_status = result_proto.mutable_status();
1702
1703 absl_ports::unique_lock l(&mutex_);
1704 if (!initialized_) {
1705 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1706 result_status->set_message("IcingSearchEngine has not been initialized!");
1707 return result_proto;
1708 }
1709
1710 OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
1711 ScopedTimer optimize_timer(
1712 clock_->GetNewTimer(),
1713 [optimize_stats](int64_t t) { optimize_stats->set_latency_ms(t); });
1714
1715 // Flushes data to disk before doing optimization
1716 auto status = InternalPersistToDisk(PersistType::FULL);
1717 if (!status.ok()) {
1718 TransformStatus(status, result_status);
1719 return result_proto;
1720 }
1721
1722 int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1723 optimize_stats->set_storage_size_before(
1724 Filesystem::SanitizeFileSize(before_size));
1725
1726 // TODO(b/143646633): figure out if we need to optimize index and doc store
1727 // at the same time.
1728 std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer();
1729 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
1730 optimize_result_or = OptimizeDocumentStore(optimize_stats);
1731 optimize_stats->set_document_store_optimize_latency_ms(
1732 optimize_doc_store_timer->GetElapsedMilliseconds());
1733
1734 if (!optimize_result_or.ok() &&
1735 !absl_ports::IsDataLoss(optimize_result_or.status())) {
1736 // The status now is either ABORTED_ERROR or INTERNAL_ERROR.
1737 // If ABORTED_ERROR, Icing should still be working.
1738 // If INTERNAL_ERROR, we're having IO errors or other errors that we can't
1739 // recover from.
1740 TransformStatus(optimize_result_or.status(), result_status);
1741 return result_proto;
1742 }
1743
1744 // The status is either OK or DATA_LOSS. The optimized document store is
1745 // guaranteed to work, so we update index according to the new document store.
1746 std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer();
1747 auto doc_store_optimize_result_status = optimize_result_or.status();
1748 bool should_rebuild_index =
1749 !optimize_result_or.ok() ||
1750 optimize_result_or.ValueOrDie().should_rebuild_index ||
1751 ShouldRebuildIndex(*optimize_stats,
1752 options_.optimize_rebuild_index_threshold());
1753 if (!should_rebuild_index) {
1754 // At this point should_rebuild_index is false, so it means
1755 // optimize_result_or.ok() is true and therefore it is safe to call
1756 // ValueOrDie.
1757 DocumentStore::OptimizeResult optimize_result =
1758 std::move(optimize_result_or).ValueOrDie();
1759
1760 optimize_stats->set_index_restoration_mode(
1761 OptimizeStatsProto::INDEX_TRANSLATION);
1762 libtextclassifier3::Status index_optimize_status =
1763 index_->Optimize(optimize_result.document_id_old_to_new,
1764 document_store_->last_added_document_id());
1765 if (!index_optimize_status.ok()) {
1766 ICING_LOG(WARNING) << "Failed to optimize index. Error: "
1767 << index_optimize_status.error_message();
1768 should_rebuild_index = true;
1769 }
1770
1771 libtextclassifier3::Status integer_index_optimize_status =
1772 integer_index_->Optimize(optimize_result.document_id_old_to_new,
1773 document_store_->last_added_document_id());
1774 if (!integer_index_optimize_status.ok()) {
1775 ICING_LOG(WARNING) << "Failed to optimize integer index. Error: "
1776 << integer_index_optimize_status.error_message();
1777 should_rebuild_index = true;
1778 }
1779
1780 libtextclassifier3::Status qualified_id_join_index_optimize_status =
1781 qualified_id_join_index_->Optimize(
1782 optimize_result.document_id_old_to_new,
1783 optimize_result.namespace_id_old_to_new,
1784 document_store_->last_added_document_id());
1785 if (!qualified_id_join_index_optimize_status.ok()) {
1786 ICING_LOG(WARNING)
1787 << "Failed to optimize qualified id join index. Error: "
1788 << qualified_id_join_index_optimize_status.error_message();
1789 should_rebuild_index = true;
1790 }
1791
1792 libtextclassifier3::Status embedding_index_optimize_status =
1793 embedding_index_->Optimize(optimize_result.document_id_old_to_new,
1794 document_store_->last_added_document_id());
1795 if (!embedding_index_optimize_status.ok()) {
1796 ICING_LOG(WARNING) << "Failed to optimize embedding index. Error: "
1797 << embedding_index_optimize_status.error_message();
1798 should_rebuild_index = true;
1799 }
1800 }
1801 // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a
1802 // valid document store, but it might be the old one or the new one. So throw
1803 // out the index data and rebuild from scratch.
1804 // Also rebuild index if DocumentStore::OptimizeInto hints to do so.
1805 // Likewise, if Index::Optimize failed, then attempt to recover the index by
1806 // rebuilding from scratch.
1807 // If ShouldRebuildIndex() returns true, we will also rebuild the index for
1808 // better performance.
1809 if (should_rebuild_index) {
1810 optimize_stats->set_index_restoration_mode(
1811 OptimizeStatsProto::FULL_INDEX_REBUILD);
1812 ICING_LOG(WARNING) << "Clearing the entire index!";
1813
1814 libtextclassifier3::Status index_clear_status = ClearAllIndices();
1815 if (!index_clear_status.ok()) {
1816 status = absl_ports::Annotate(
1817 absl_ports::InternalError("Failed to clear index."),
1818 index_clear_status.error_message());
1819 TransformStatus(status, result_status);
1820 optimize_stats->set_index_restoration_latency_ms(
1821 optimize_index_timer->GetElapsedMilliseconds());
1822 return result_proto;
1823 }
1824
1825 IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
1826 // DATA_LOSS means that we have successfully re-added content to the index.
1827 // Some indexed content was lost, but otherwise the index is in a valid
1828 // state and can be queried.
1829 if (!index_restoration_status.status.ok() &&
1830 !absl_ports::IsDataLoss(index_restoration_status.status)) {
1831 status = absl_ports::Annotate(
1832 absl_ports::InternalError(
1833 "Failed to reindex documents after optimization."),
1834 index_restoration_status.status.error_message());
1835
1836 TransformStatus(status, result_status);
1837 optimize_stats->set_index_restoration_latency_ms(
1838 optimize_index_timer->GetElapsedMilliseconds());
1839 return result_proto;
1840 }
1841 }
1842 optimize_stats->set_index_restoration_latency_ms(
1843 optimize_index_timer->GetElapsedMilliseconds());
1844
1845 // Read the optimize status to get the time that we last ran.
1846 std::string optimize_status_filename =
1847 absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
1848 FileBackedProto<OptimizeStatusProto> optimize_status_file(
1849 *filesystem_, optimize_status_filename);
1850 auto optimize_status_or = optimize_status_file.Read();
1851 int64_t current_time = clock_->GetSystemTimeMilliseconds();
1852 if (optimize_status_or.ok()) {
1853 // If we have trouble reading the status or this is the first time that
1854 // we've ever run, don't set this field.
1855 optimize_stats->set_time_since_last_optimize_ms(
1856 current_time - optimize_status_or.ValueOrDie()
1857 ->last_successful_optimize_run_time_ms());
1858 }
1859
1860 // Update the status for this run and write it.
1861 auto optimize_status = std::make_unique<OptimizeStatusProto>();
1862 optimize_status->set_last_successful_optimize_run_time_ms(current_time);
1863 auto write_status = optimize_status_file.Write(std::move(optimize_status));
1864 if (!write_status.ok()) {
1865 ICING_LOG(ERROR) << "Failed to write optimize status:\n"
1866 << write_status.error_message();
1867 }
1868
1869 // Flushes data to disk after doing optimization
1870 status = InternalPersistToDisk(PersistType::FULL);
1871 if (!status.ok()) {
1872 TransformStatus(status, result_status);
1873 return result_proto;
1874 }
1875
1876 int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1877 optimize_stats->set_storage_size_after(
1878 Filesystem::SanitizeFileSize(after_size));
1879
1880 TransformStatus(doc_store_optimize_result_status, result_status);
1881 return result_proto;
1882 }
1883
GetOptimizeInfo()1884 GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
1885 ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine";
1886
1887 GetOptimizeInfoResultProto result_proto;
1888 StatusProto* result_status = result_proto.mutable_status();
1889
1890 absl_ports::shared_lock l(&mutex_);
1891 if (!initialized_) {
1892 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1893 result_status->set_message("IcingSearchEngine has not been initialized!");
1894 return result_proto;
1895 }
1896
1897 // Read the optimize status to get the time that we last ran.
1898 std::string optimize_status_filename =
1899 absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
1900 FileBackedProto<OptimizeStatusProto> optimize_status_file(
1901 *filesystem_, optimize_status_filename);
1902 auto optimize_status_or = optimize_status_file.Read();
1903 int64_t current_time = clock_->GetSystemTimeMilliseconds();
1904
1905 if (optimize_status_or.ok()) {
1906 // If we have trouble reading the status or this is the first time that
1907 // we've ever run, don't set this field.
1908 result_proto.set_time_since_last_optimize_ms(
1909 current_time - optimize_status_or.ValueOrDie()
1910 ->last_successful_optimize_run_time_ms());
1911 }
1912
1913 // Get stats from DocumentStore
1914 auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
1915 if (!doc_store_optimize_info_or.ok()) {
1916 TransformStatus(doc_store_optimize_info_or.status(), result_status);
1917 return result_proto;
1918 }
1919 DocumentStore::OptimizeInfo doc_store_optimize_info =
1920 doc_store_optimize_info_or.ValueOrDie();
1921 result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs);
1922
1923 if (doc_store_optimize_info.optimizable_docs == 0) {
1924 // Can return early since there's nothing to calculate on the index side
1925 result_proto.set_estimated_optimizable_bytes(0);
1926 result_status->set_code(StatusProto::OK);
1927 return result_proto;
1928 }
1929
1930 // Get stats from Index.
1931 auto index_elements_size_or = index_->GetElementsSize();
1932 if (!index_elements_size_or.ok()) {
1933 TransformStatus(index_elements_size_or.status(), result_status);
1934 return result_proto;
1935 }
1936 int64_t index_elements_size = index_elements_size_or.ValueOrDie();
1937
1938 // TODO(b/259744228): add stats for integer index
1939
1940 // Sum up the optimizable sizes from DocumentStore and Index
1941 result_proto.set_estimated_optimizable_bytes(
1942 index_elements_size * doc_store_optimize_info.optimizable_docs /
1943 doc_store_optimize_info.total_docs +
1944 doc_store_optimize_info.estimated_optimizable_bytes);
1945
1946 result_status->set_code(StatusProto::OK);
1947 return result_proto;
1948 }
1949
GetStorageInfo()1950 StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
1951 StorageInfoResultProto result;
1952 absl_ports::shared_lock l(&mutex_);
1953 if (!initialized_) {
1954 result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
1955 result.mutable_status()->set_message(
1956 "IcingSearchEngine has not been initialized!");
1957 return result;
1958 }
1959
1960 int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1961 result.mutable_storage_info()->set_total_storage_size(
1962 Filesystem::SanitizeFileSize(index_size));
1963 *result.mutable_storage_info()->mutable_document_storage_info() =
1964 document_store_->GetStorageInfo();
1965 *result.mutable_storage_info()->mutable_schema_store_storage_info() =
1966 schema_store_->GetStorageInfo();
1967 *result.mutable_storage_info()->mutable_index_storage_info() =
1968 index_->GetStorageInfo();
1969 // TODO(b/259744228): add stats for integer index
1970 result.mutable_status()->set_code(StatusProto::OK);
1971 return result;
1972 }
1973
GetDebugInfo(DebugInfoVerbosity::Code verbosity)1974 DebugInfoResultProto IcingSearchEngine::GetDebugInfo(
1975 DebugInfoVerbosity::Code verbosity) {
1976 DebugInfoResultProto debug_info;
1977 StatusProto* result_status = debug_info.mutable_status();
1978 absl_ports::shared_lock l(&mutex_);
1979 if (!initialized_) {
1980 debug_info.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
1981 debug_info.mutable_status()->set_message(
1982 "IcingSearchEngine has not been initialized!");
1983 return debug_info;
1984 }
1985
1986 // Index
1987 *debug_info.mutable_debug_info()->mutable_index_info() =
1988 index_->GetDebugInfo(verbosity);
1989
1990 // TODO(b/259744228): add debug info for integer index
1991
1992 // Document Store
1993 libtextclassifier3::StatusOr<DocumentDebugInfoProto> document_debug_info =
1994 document_store_->GetDebugInfo(verbosity);
1995 if (!document_debug_info.ok()) {
1996 TransformStatus(document_debug_info.status(), result_status);
1997 return debug_info;
1998 }
1999 *debug_info.mutable_debug_info()->mutable_document_info() =
2000 std::move(document_debug_info).ValueOrDie();
2001
2002 // Schema Store
2003 libtextclassifier3::StatusOr<SchemaDebugInfoProto> schema_debug_info =
2004 schema_store_->GetDebugInfo();
2005 if (!schema_debug_info.ok()) {
2006 TransformStatus(schema_debug_info.status(), result_status);
2007 return debug_info;
2008 }
2009 *debug_info.mutable_debug_info()->mutable_schema_info() =
2010 std::move(schema_debug_info).ValueOrDie();
2011
2012 result_status->set_code(StatusProto::OK);
2013 return debug_info;
2014 }
2015
InternalPersistToDisk(PersistType::Code persist_type)2016 libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk(
2017 PersistType::Code persist_type) {
2018 if (persist_type == PersistType::LITE) {
2019 return document_store_->PersistToDisk(persist_type);
2020 }
2021 ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
2022 ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(PersistType::FULL));
2023 ICING_RETURN_IF_ERROR(index_->PersistToDisk());
2024 ICING_RETURN_IF_ERROR(integer_index_->PersistToDisk());
2025 ICING_RETURN_IF_ERROR(qualified_id_join_index_->PersistToDisk());
2026 ICING_RETURN_IF_ERROR(embedding_index_->PersistToDisk());
2027
2028 return libtextclassifier3::Status::OK;
2029 }
2030
Search(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2031 SearchResultProto IcingSearchEngine::Search(
2032 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2033 const ResultSpecProto& result_spec) {
2034 if (search_spec.use_read_only_search()) {
2035 return SearchLockedShared(search_spec, scoring_spec, result_spec);
2036 } else {
2037 return SearchLockedExclusive(search_spec, scoring_spec, result_spec);
2038 }
2039 }
2040
SearchLockedShared(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2041 SearchResultProto IcingSearchEngine::SearchLockedShared(
2042 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2043 const ResultSpecProto& result_spec) {
2044 std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2045
2046 // Only acquire an overall read-lock for this implementation. Finer-grained
2047 // locks are implemented around code paths that write changes to Icing's data
2048 // members.
2049 absl_ports::shared_lock l(&mutex_);
2050 int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2051
2052 SearchResultProto result_proto =
2053 InternalSearch(search_spec, scoring_spec, result_spec);
2054
2055 result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2056 lock_acquisition_latency);
2057 result_proto.mutable_query_stats()->set_latency_ms(
2058 overall_timer->GetElapsedMilliseconds());
2059 return result_proto;
2060 }
2061
SearchLockedExclusive(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2062 SearchResultProto IcingSearchEngine::SearchLockedExclusive(
2063 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2064 const ResultSpecProto& result_spec) {
2065 std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2066
2067 // Acquire the overall write-lock for this locked implementation.
2068 absl_ports::unique_lock l(&mutex_);
2069 int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2070
2071 SearchResultProto result_proto =
2072 InternalSearch(search_spec, scoring_spec, result_spec);
2073
2074 result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2075 lock_acquisition_latency);
2076 result_proto.mutable_query_stats()->set_latency_ms(
2077 overall_timer->GetElapsedMilliseconds());
2078 return result_proto;
2079 }
2080
InternalSearch(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2081 SearchResultProto IcingSearchEngine::InternalSearch(
2082 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2083 const ResultSpecProto& result_spec) {
2084 SearchResultProto result_proto;
2085 StatusProto* result_status = result_proto.mutable_status();
2086
2087 QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2088 query_stats->set_is_first_page(true);
2089 query_stats->set_requested_page_size(result_spec.num_per_page());
2090
2091 // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2092 query_stats->set_num_namespaces_filtered(
2093 search_spec.namespace_filters_size());
2094 query_stats->set_num_schema_types_filtered(
2095 search_spec.schema_type_filters_size());
2096 query_stats->set_query_length(search_spec.query().length());
2097 query_stats->set_ranking_strategy(scoring_spec.rank_by());
2098
2099 if (!initialized_) {
2100 result_status->set_code(StatusProto::FAILED_PRECONDITION);
2101 result_status->set_message("IcingSearchEngine has not been initialized!");
2102 return result_proto;
2103 }
2104 index_->PublishQueryStats(query_stats);
2105
2106 libtextclassifier3::Status status =
2107 ValidateResultSpec(document_store_.get(), result_spec);
2108 if (!status.ok()) {
2109 TransformStatus(status, result_status);
2110 return result_proto;
2111 }
2112 status = ValidateSearchSpec(search_spec, performance_configuration_);
2113 if (!status.ok()) {
2114 TransformStatus(status, result_status);
2115 return result_proto;
2116 }
2117
2118 const JoinSpecProto& join_spec = search_spec.join_spec();
2119 std::unique_ptr<JoinChildrenFetcher> join_children_fetcher;
2120 std::unique_ptr<ResultAdjustmentInfo> child_result_adjustment_info;
2121 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2122 if (!join_spec.parent_property_expression().empty() &&
2123 !join_spec.child_property_expression().empty()) {
2124 query_stats->set_is_join_query(true);
2125 QueryStatsProto::SearchStats* child_search_stats =
2126 query_stats->mutable_child_search_stats();
2127
2128 // Process child query
2129 QueryScoringResults nested_query_scoring_results = ProcessQueryAndScore(
2130 join_spec.nested_spec().search_spec(),
2131 join_spec.nested_spec().scoring_spec(),
2132 join_spec.nested_spec().result_spec(),
2133 /*join_children_fetcher=*/nullptr, current_time_ms, child_search_stats);
2134 if (!nested_query_scoring_results.status.ok()) {
2135 TransformStatus(nested_query_scoring_results.status, result_status);
2136 return result_proto;
2137 }
2138
2139 JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2140 qualified_id_join_index_.get(),
2141 current_time_ms);
2142 // Building a JoinChildrenFetcher where child documents are grouped by
2143 // their joinable values.
2144 libtextclassifier3::StatusOr<JoinChildrenFetcher> join_children_fetcher_or =
2145 join_processor.GetChildrenFetcher(
2146 search_spec.join_spec(),
2147 std::move(nested_query_scoring_results.scored_document_hits));
2148 if (!join_children_fetcher_or.ok()) {
2149 TransformStatus(join_children_fetcher_or.status(), result_status);
2150 return result_proto;
2151 }
2152 join_children_fetcher = std::make_unique<JoinChildrenFetcher>(
2153 std::move(join_children_fetcher_or).ValueOrDie());
2154
2155 // Assign child's ResultAdjustmentInfo.
2156 child_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2157 join_spec.nested_spec().search_spec(),
2158 join_spec.nested_spec().scoring_spec(),
2159 join_spec.nested_spec().result_spec(), schema_store_.get(),
2160 std::move(nested_query_scoring_results.query_terms));
2161 }
2162
2163 // Process parent query
2164 QueryStatsProto::SearchStats* parent_search_stats =
2165 query_stats->mutable_parent_search_stats();
2166 QueryScoringResults query_scoring_results = ProcessQueryAndScore(
2167 search_spec, scoring_spec, result_spec, join_children_fetcher.get(),
2168 current_time_ms, parent_search_stats);
2169 // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2170 query_stats->set_num_terms(parent_search_stats->num_terms());
2171 query_stats->set_parse_query_latency_ms(
2172 parent_search_stats->parse_query_latency_ms());
2173 query_stats->set_scoring_latency_ms(
2174 parent_search_stats->scoring_latency_ms());
2175 query_stats->set_num_documents_scored(
2176 parent_search_stats->num_documents_scored());
2177 if (!query_scoring_results.status.ok()) {
2178 TransformStatus(query_scoring_results.status, result_status);
2179 return result_proto;
2180 }
2181
2182 // Returns early for empty result
2183 if (query_scoring_results.scored_document_hits.empty()) {
2184 result_status->set_code(StatusProto::OK);
2185 return result_proto;
2186 }
2187
2188 // Construct parent's result adjustment info.
2189 auto parent_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2190 search_spec, scoring_spec, result_spec, schema_store_.get(),
2191 std::move(query_scoring_results.query_terms));
2192
2193 std::unique_ptr<ScoredDocumentHitsRanker> ranker;
2194 if (join_children_fetcher != nullptr) {
2195 std::unique_ptr<Timer> join_timer = clock_->GetNewTimer();
2196 // Join 2 scored document hits
2197 JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2198 qualified_id_join_index_.get(),
2199 current_time_ms);
2200 libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
2201 joined_result_document_hits_or = join_processor.Join(
2202 join_spec, std::move(query_scoring_results.scored_document_hits),
2203 *join_children_fetcher);
2204 if (!joined_result_document_hits_or.ok()) {
2205 TransformStatus(joined_result_document_hits_or.status(), result_status);
2206 return result_proto;
2207 }
2208 std::vector<JoinedScoredDocumentHit> joined_result_document_hits =
2209 std::move(joined_result_document_hits_or).ValueOrDie();
2210
2211 query_stats->set_join_latency_ms(join_timer->GetElapsedMilliseconds());
2212
2213 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2214 // Ranks results
2215 ranker = std::make_unique<
2216 PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
2217 std::move(joined_result_document_hits),
2218 /*is_descending=*/scoring_spec.order_by() ==
2219 ScoringSpecProto::Order::DESC);
2220 query_stats->set_ranking_latency_ms(
2221 component_timer->GetElapsedMilliseconds());
2222 } else {
2223 // Non-join query
2224 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2225 // Ranks results
2226 ranker = std::make_unique<
2227 PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
2228 std::move(query_scoring_results.scored_document_hits),
2229 /*is_descending=*/scoring_spec.order_by() ==
2230 ScoringSpecProto::Order::DESC);
2231 query_stats->set_ranking_latency_ms(
2232 component_timer->GetElapsedMilliseconds());
2233 }
2234
2235 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2236 // CacheAndRetrieveFirstPage and retrieves the document protos and snippets if
2237 // requested
2238 auto result_retriever_or =
2239 ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2240 language_segmenter_.get(), normalizer_.get());
2241 if (!result_retriever_or.ok()) {
2242 TransformStatus(result_retriever_or.status(), result_status);
2243 query_stats->set_document_retrieval_latency_ms(
2244 component_timer->GetElapsedMilliseconds());
2245 return result_proto;
2246 }
2247 std::unique_ptr<ResultRetrieverV2> result_retriever =
2248 std::move(result_retriever_or).ValueOrDie();
2249
2250 libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2251 page_result_info_or = result_state_manager_->CacheAndRetrieveFirstPage(
2252 std::move(ranker), std::move(parent_result_adjustment_info),
2253 std::move(child_result_adjustment_info), result_spec,
2254 *document_store_, *result_retriever, current_time_ms);
2255 if (!page_result_info_or.ok()) {
2256 TransformStatus(page_result_info_or.status(), result_status);
2257 query_stats->set_document_retrieval_latency_ms(
2258 component_timer->GetElapsedMilliseconds());
2259 return result_proto;
2260 }
2261 std::pair<uint64_t, PageResult> page_result_info =
2262 std::move(page_result_info_or).ValueOrDie();
2263
2264 // Assembles the final search result proto
2265 result_proto.mutable_results()->Reserve(
2266 page_result_info.second.results.size());
2267
2268 int32_t child_count = 0;
2269 for (SearchResultProto::ResultProto& result :
2270 page_result_info.second.results) {
2271 child_count += result.joined_results_size();
2272 result_proto.mutable_results()->Add(std::move(result));
2273 }
2274
2275 result_status->set_code(StatusProto::OK);
2276 if (page_result_info.first != kInvalidNextPageToken) {
2277 result_proto.set_next_page_token(page_result_info.first);
2278 }
2279
2280 query_stats->set_document_retrieval_latency_ms(
2281 component_timer->GetElapsedMilliseconds());
2282 query_stats->set_num_results_returned_current_page(
2283 result_proto.results_size());
2284
2285 query_stats->set_num_joined_results_returned_current_page(child_count);
2286
2287 query_stats->set_num_results_with_snippets(
2288 page_result_info.second.num_results_with_snippets);
2289 return result_proto;
2290 }
2291
ProcessQueryAndScore(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec,const JoinChildrenFetcher * join_children_fetcher,int64_t current_time_ms,QueryStatsProto::SearchStats * search_stats)2292 IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore(
2293 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2294 const ResultSpecProto& result_spec,
2295 const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
2296 QueryStatsProto::SearchStats* search_stats) {
2297 search_stats->set_num_namespaces_filtered(
2298 search_spec.namespace_filters_size());
2299 search_stats->set_num_schema_types_filtered(
2300 search_spec.schema_type_filters_size());
2301 search_stats->set_query_length(search_spec.query().length());
2302 search_stats->set_ranking_strategy(scoring_spec.rank_by());
2303
2304 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2305
2306 // Gets unordered results from query processor
2307 auto query_processor_or = QueryProcessor::Create(
2308 index_.get(), integer_index_.get(), embedding_index_.get(),
2309 language_segmenter_.get(), normalizer_.get(), document_store_.get(),
2310 schema_store_.get(), clock_.get());
2311 if (!query_processor_or.ok()) {
2312 search_stats->set_parse_query_latency_ms(
2313 component_timer->GetElapsedMilliseconds());
2314 return QueryScoringResults(std::move(query_processor_or).status(),
2315 /*query_terms_in=*/{},
2316 /*scored_document_hits_in=*/{});
2317 }
2318 std::unique_ptr<QueryProcessor> query_processor =
2319 std::move(query_processor_or).ValueOrDie();
2320
2321 auto ranking_strategy_or = GetRankingStrategyFromScoringSpec(scoring_spec);
2322 libtextclassifier3::StatusOr<QueryResults> query_results_or;
2323 if (ranking_strategy_or.ok()) {
2324 query_results_or = query_processor->ParseSearch(
2325 search_spec, ranking_strategy_or.ValueOrDie(), current_time_ms,
2326 search_stats);
2327 } else {
2328 query_results_or = ranking_strategy_or.status();
2329 }
2330 search_stats->set_parse_query_latency_ms(
2331 component_timer->GetElapsedMilliseconds());
2332 if (!query_results_or.ok()) {
2333 return QueryScoringResults(std::move(query_results_or).status(),
2334 /*query_terms_in=*/{},
2335 /*scored_document_hits_in=*/{});
2336 }
2337 QueryResults query_results = std::move(query_results_or).ValueOrDie();
2338
2339 // Set SearchStats related to QueryResults.
2340 int term_count = 0;
2341 for (const auto& section_and_terms : query_results.query_terms) {
2342 term_count += section_and_terms.second.size();
2343 }
2344 search_stats->set_num_terms(term_count);
2345
2346 if (query_results.features_in_use.count(kNumericSearchFeature)) {
2347 search_stats->set_is_numeric_query(true);
2348 }
2349
2350 component_timer = clock_->GetNewTimer();
2351 // Scores but does not rank the results.
2352 libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
2353 scoring_processor_or = ScoringProcessor::Create(
2354 scoring_spec, /*default_semantic_metric_type=*/
2355 search_spec.embedding_query_metric_type(), document_store_.get(),
2356 schema_store_.get(), current_time_ms, join_children_fetcher,
2357 &query_results.embedding_query_results);
2358 if (!scoring_processor_or.ok()) {
2359 return QueryScoringResults(std::move(scoring_processor_or).status(),
2360 std::move(query_results.query_terms),
2361 /*scored_document_hits_in=*/{});
2362 }
2363 std::unique_ptr<ScoringProcessor> scoring_processor =
2364 std::move(scoring_processor_or).ValueOrDie();
2365 std::vector<ScoredDocumentHit> scored_document_hits =
2366 scoring_processor->Score(
2367 std::move(query_results.root_iterator), result_spec.num_to_score(),
2368 &query_results.query_term_iterators, search_stats);
2369 search_stats->set_scoring_latency_ms(
2370 component_timer->GetElapsedMilliseconds());
2371
2372 return QueryScoringResults(libtextclassifier3::Status::OK,
2373 std::move(query_results.query_terms),
2374 std::move(scored_document_hits));
2375 }
2376
GetNextPage(uint64_t next_page_token)2377 SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
2378 SearchResultProto result_proto;
2379 StatusProto* result_status = result_proto.mutable_status();
2380
2381 QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2382 query_stats->set_is_first_page(false);
2383 std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2384 // ResultStateManager has its own writer lock, so here we only need a reader
2385 // lock for other components.
2386 absl_ports::shared_lock l(&mutex_);
2387 query_stats->set_lock_acquisition_latency_ms(
2388 overall_timer->GetElapsedMilliseconds());
2389 if (!initialized_) {
2390 result_status->set_code(StatusProto::FAILED_PRECONDITION);
2391 result_status->set_message("IcingSearchEngine has not been initialized!");
2392 return result_proto;
2393 }
2394
2395 auto result_retriever_or =
2396 ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2397 language_segmenter_.get(), normalizer_.get());
2398 if (!result_retriever_or.ok()) {
2399 TransformStatus(result_retriever_or.status(), result_status);
2400 return result_proto;
2401 }
2402 std::unique_ptr<ResultRetrieverV2> result_retriever =
2403 std::move(result_retriever_or).ValueOrDie();
2404
2405 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2406 libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2407 page_result_info_or = result_state_manager_->GetNextPage(
2408 next_page_token, *result_retriever, current_time_ms);
2409 if (!page_result_info_or.ok()) {
2410 if (absl_ports::IsNotFound(page_result_info_or.status())) {
2411 // NOT_FOUND means an empty result.
2412 result_status->set_code(StatusProto::OK);
2413 } else {
2414 // Real error, pass up.
2415 TransformStatus(page_result_info_or.status(), result_status);
2416 }
2417 return result_proto;
2418 }
2419
2420 std::pair<uint64_t, PageResult> page_result_info =
2421 std::move(page_result_info_or).ValueOrDie();
2422 query_stats->set_requested_page_size(
2423 page_result_info.second.requested_page_size);
2424
2425 // Assembles the final search result proto
2426 result_proto.mutable_results()->Reserve(
2427 page_result_info.second.results.size());
2428
2429 int32_t child_count = 0;
2430 for (SearchResultProto::ResultProto& result :
2431 page_result_info.second.results) {
2432 child_count += result.joined_results_size();
2433 result_proto.mutable_results()->Add(std::move(result));
2434 }
2435
2436 result_status->set_code(StatusProto::OK);
2437 if (page_result_info.first != kInvalidNextPageToken) {
2438 result_proto.set_next_page_token(page_result_info.first);
2439 }
2440
2441 // The only thing that we're doing is document retrieval. So document
2442 // retrieval latency and overall latency are the same and can use the same
2443 // timer.
2444 query_stats->set_document_retrieval_latency_ms(
2445 overall_timer->GetElapsedMilliseconds());
2446 query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
2447 query_stats->set_num_results_returned_current_page(
2448 result_proto.results_size());
2449 query_stats->set_num_results_with_snippets(
2450 page_result_info.second.num_results_with_snippets);
2451 query_stats->set_num_joined_results_returned_current_page(child_count);
2452
2453 return result_proto;
2454 }
2455
InvalidateNextPageToken(uint64_t next_page_token)2456 void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
2457 absl_ports::shared_lock l(&mutex_);
2458 if (!initialized_) {
2459 ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
2460 return;
2461 }
2462 result_state_manager_->InvalidateResultState(next_page_token);
2463 }
2464
2465 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
OptimizeDocumentStore(OptimizeStatsProto * optimize_stats)2466 IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) {
2467 // Gets the current directory path and an empty tmp directory path for
2468 // document store optimization.
2469 const std::string current_document_dir =
2470 MakeDocumentDirectoryPath(options_.base_dir());
2471 const std::string temporary_document_dir =
2472 MakeDocumentTemporaryDirectoryPath(options_.base_dir());
2473 if (!filesystem_->DeleteDirectoryRecursively(
2474 temporary_document_dir.c_str()) ||
2475 !filesystem_->CreateDirectoryRecursively(
2476 temporary_document_dir.c_str())) {
2477 return absl_ports::AbortedError(absl_ports::StrCat(
2478 "Failed to create a tmp directory: ", temporary_document_dir));
2479 }
2480
2481 // Copies valid document data to tmp directory
2482 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
2483 optimize_result_or = document_store_->OptimizeInto(
2484 temporary_document_dir, language_segmenter_.get(), optimize_stats);
2485
2486 // Handles error if any
2487 if (!optimize_result_or.ok()) {
2488 filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str());
2489 return absl_ports::Annotate(
2490 absl_ports::AbortedError("Failed to optimize document store"),
2491 optimize_result_or.status().error_message());
2492 }
2493
2494 // result_state_manager_ depends on document_store_. So we need to reset it at
2495 // the same time that we reset the document_store_.
2496 result_state_manager_.reset();
2497 document_store_.reset();
2498
2499 // When swapping files, always put the current working directory at the
2500 // second place because it is renamed at the latter position so we're less
2501 // vulnerable to errors.
2502 if (!filesystem_->SwapFiles(temporary_document_dir.c_str(),
2503 current_document_dir.c_str())) {
2504 ICING_LOG(ERROR) << "Failed to swap files";
2505
2506 // Ensures that current directory is still present.
2507 if (!filesystem_->CreateDirectoryRecursively(
2508 current_document_dir.c_str())) {
2509 // Can't even create the old directory. Mark as uninitialized and return
2510 // INTERNAL.
2511 initialized_ = false;
2512 return absl_ports::InternalError(
2513 "Failed to create file directory for document store");
2514 }
2515
2516 // Tries to rebuild document store if swapping fails, to avoid leaving the
2517 // system in the broken state for future operations.
2518 auto create_result_or = DocumentStore::Create(
2519 filesystem_.get(), current_document_dir, clock_.get(),
2520 schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
2521 /*document_store_namespace_id_fingerprint=*/true,
2522 /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2523 options_.compression_level(), /*initialize_stats=*/nullptr);
2524 // TODO(b/144458732): Implement a more robust version of
2525 // TC_ASSIGN_OR_RETURN that can support error logging.
2526 if (!create_result_or.ok()) {
2527 // Unable to create DocumentStore from the old file. Mark as uninitialized
2528 // and return INTERNAL.
2529 initialized_ = false;
2530 ICING_LOG(ERROR) << "Failed to create document store instance";
2531 return absl_ports::Annotate(
2532 absl_ports::InternalError("Failed to create document store instance"),
2533 create_result_or.status().error_message());
2534 }
2535 document_store_ = std::move(create_result_or.ValueOrDie().document_store);
2536 result_state_manager_ = std::make_unique<ResultStateManager>(
2537 performance_configuration_.max_num_total_hits, *document_store_);
2538
2539 // Potential data loss
2540 // TODO(b/147373249): Find a way to detect true data loss error
2541 return absl_ports::DataLossError(
2542 "Failed to optimize document store, there might be data loss");
2543 }
2544
2545 // Recreates the doc store instance
2546 auto create_result_or = DocumentStore::Create(
2547 filesystem_.get(), current_document_dir, clock_.get(),
2548 schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
2549 /*document_store_namespace_id_fingerprint=*/true,
2550 /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2551 options_.compression_level(), /*initialize_stats=*/nullptr);
2552 if (!create_result_or.ok()) {
2553 // Unable to create DocumentStore from the new file. Mark as uninitialized
2554 // and return INTERNAL.
2555 initialized_ = false;
2556 return absl_ports::InternalError(
2557 "Document store has been optimized, but a valid document store "
2558 "instance can't be created");
2559 }
2560 DocumentStore::CreateResult create_result =
2561 std::move(create_result_or).ValueOrDie();
2562 document_store_ = std::move(create_result.document_store);
2563 result_state_manager_ = std::make_unique<ResultStateManager>(
2564 performance_configuration_.max_num_total_hits, *document_store_);
2565
2566 // Deletes tmp directory
2567 if (!filesystem_->DeleteDirectoryRecursively(
2568 temporary_document_dir.c_str())) {
2569 ICING_LOG(ERROR) << "Document store has been optimized, but it failed to "
2570 "delete temporary file directory";
2571 }
2572
2573 // Since we created new (optimized) document store with correct PersistToDisk
2574 // call, we shouldn't have data loss or regenerate derived files. Therefore,
2575 // if we really encounter any of these situations, then return DataLossError
2576 // to let the caller rebuild index.
2577 if (create_result.data_loss != DataLoss::NONE ||
2578 create_result.derived_files_regenerated) {
2579 return absl_ports::DataLossError(
2580 "Unexpected data loss or derived files regenerated for new document "
2581 "store");
2582 }
2583
2584 return optimize_result_or;
2585 }
2586
2587 IcingSearchEngine::IndexRestorationResult
RestoreIndexIfNeeded()2588 IcingSearchEngine::RestoreIndexIfNeeded() {
2589 DocumentId last_stored_document_id =
2590 document_store_->last_added_document_id();
2591 if (last_stored_document_id == index_->last_added_document_id() &&
2592 last_stored_document_id == integer_index_->last_added_document_id() &&
2593 last_stored_document_id ==
2594 qualified_id_join_index_->last_added_document_id() &&
2595 last_stored_document_id == embedding_index_->last_added_document_id()) {
2596 // No need to recover.
2597 return {libtextclassifier3::Status::OK, false, false, false, false};
2598 }
2599
2600 if (last_stored_document_id == kInvalidDocumentId) {
2601 // Document store is empty but index is not. Clear the index.
2602 return {ClearAllIndices(), false, false, false, false};
2603 }
2604
2605 // Truncate indices first.
2606 auto truncate_result_or = TruncateIndicesTo(last_stored_document_id);
2607 if (!truncate_result_or.ok()) {
2608 return {std::move(truncate_result_or).status(), false, false, false, false};
2609 }
2610 TruncateIndexResult truncate_result =
2611 std::move(truncate_result_or).ValueOrDie();
2612
2613 if (truncate_result.first_document_to_reindex > last_stored_document_id) {
2614 // Nothing to restore. Just return.
2615 return {libtextclassifier3::Status::OK, false, false, false, false};
2616 }
2617
2618 auto data_indexing_handlers_or = CreateDataIndexingHandlers();
2619 if (!data_indexing_handlers_or.ok()) {
2620 return {data_indexing_handlers_or.status(),
2621 truncate_result.index_needed_restoration,
2622 truncate_result.integer_index_needed_restoration,
2623 truncate_result.qualified_id_join_index_needed_restoration,
2624 truncate_result.embedding_index_needed_restoration};
2625 }
2626 // By using recovery_mode for IndexProcessor, we're able to replay documents
2627 // from smaller document id and it will skip documents that are already been
2628 // indexed.
2629 IndexProcessor index_processor(
2630 std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(),
2631 /*recovery_mode=*/true);
2632
2633 ICING_VLOG(1) << "Restoring index by replaying documents from document id "
2634 << truncate_result.first_document_to_reindex
2635 << " to document id " << last_stored_document_id;
2636 libtextclassifier3::Status overall_status;
2637 for (DocumentId document_id = truncate_result.first_document_to_reindex;
2638 document_id <= last_stored_document_id; ++document_id) {
2639 libtextclassifier3::StatusOr<DocumentProto> document_or =
2640 document_store_->Get(document_id);
2641
2642 if (!document_or.ok()) {
2643 if (absl_ports::IsInvalidArgument(document_or.status()) ||
2644 absl_ports::IsNotFound(document_or.status())) {
2645 // Skips invalid and non-existing documents.
2646 continue;
2647 } else {
2648 // Returns other errors
2649 return {document_or.status(), truncate_result.index_needed_restoration,
2650 truncate_result.integer_index_needed_restoration,
2651 truncate_result.qualified_id_join_index_needed_restoration,
2652 truncate_result.embedding_index_needed_restoration};
2653 }
2654 }
2655 DocumentProto document(std::move(document_or).ValueOrDie());
2656
2657 libtextclassifier3::StatusOr<TokenizedDocument> tokenized_document_or =
2658 TokenizedDocument::Create(schema_store_.get(),
2659 language_segmenter_.get(),
2660 std::move(document));
2661 if (!tokenized_document_or.ok()) {
2662 return {tokenized_document_or.status(),
2663 truncate_result.index_needed_restoration,
2664 truncate_result.integer_index_needed_restoration,
2665 truncate_result.qualified_id_join_index_needed_restoration,
2666 truncate_result.embedding_index_needed_restoration};
2667 }
2668 TokenizedDocument tokenized_document(
2669 std::move(tokenized_document_or).ValueOrDie());
2670
2671 libtextclassifier3::Status status =
2672 index_processor.IndexDocument(tokenized_document, document_id);
2673 if (!status.ok()) {
2674 if (!absl_ports::IsDataLoss(status)) {
2675 // Real error. Stop recovering and pass it up.
2676 return {status, truncate_result.index_needed_restoration,
2677 truncate_result.integer_index_needed_restoration,
2678 truncate_result.qualified_id_join_index_needed_restoration,
2679 truncate_result.embedding_index_needed_restoration};
2680 }
2681 // FIXME: why can we skip data loss error here?
2682 // Just a data loss. Keep trying to add the remaining docs, but report the
2683 // data loss when we're done.
2684 overall_status = status;
2685 }
2686 }
2687
2688 return {overall_status, truncate_result.index_needed_restoration,
2689 truncate_result.integer_index_needed_restoration,
2690 truncate_result.qualified_id_join_index_needed_restoration,
2691 truncate_result.embedding_index_needed_restoration};
2692 }
2693
LostPreviousSchema()2694 libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
2695 auto status_or = schema_store_->GetSchema();
2696 if (status_or.ok()) {
2697 // Found a schema.
2698 return false;
2699 }
2700
2701 if (!absl_ports::IsNotFound(status_or.status())) {
2702 // Any other type of error
2703 return status_or.status();
2704 }
2705
2706 // We know: We don't have a schema now.
2707 //
2708 // We know: If no documents have been added, then the last_added_document_id
2709 // will be invalid.
2710 //
2711 // So: If documents have been added before and we don't have a schema now,
2712 // then that means we must have had a schema at some point. Since we wouldn't
2713 // accept documents without a schema to validate them against.
2714 return document_store_->last_added_document_id() != kInvalidDocumentId;
2715 }
2716
2717 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
CreateDataIndexingHandlers()2718 IcingSearchEngine::CreateDataIndexingHandlers() {
2719 std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
2720
2721 // Term index handler
2722 ICING_ASSIGN_OR_RETURN(
2723 std::unique_ptr<TermIndexingHandler> term_indexing_handler,
2724 TermIndexingHandler::Create(
2725 clock_.get(), normalizer_.get(), index_.get(),
2726 options_.build_property_existence_metadata_hits()));
2727 handlers.push_back(std::move(term_indexing_handler));
2728
2729 // Integer index handler
2730 ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler>
2731 integer_section_indexing_handler,
2732 IntegerSectionIndexingHandler::Create(
2733 clock_.get(), integer_index_.get()));
2734 handlers.push_back(std::move(integer_section_indexing_handler));
2735
2736 // Qualified id join index handler
2737 ICING_ASSIGN_OR_RETURN(
2738 std::unique_ptr<QualifiedIdJoinIndexingHandler>
2739 qualified_id_join_indexing_handler,
2740 QualifiedIdJoinIndexingHandler::Create(
2741 clock_.get(), document_store_.get(), qualified_id_join_index_.get()));
2742 handlers.push_back(std::move(qualified_id_join_indexing_handler));
2743
2744 // Embedding index handler
2745 ICING_ASSIGN_OR_RETURN(
2746 std::unique_ptr<EmbeddingIndexingHandler> embedding_indexing_handler,
2747 EmbeddingIndexingHandler::Create(clock_.get(), embedding_index_.get()));
2748 handlers.push_back(std::move(embedding_indexing_handler));
2749 return handlers;
2750 }
2751
2752 libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult>
TruncateIndicesTo(DocumentId last_stored_document_id)2753 IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) {
2754 // Attempt to truncate term index.
2755 // TruncateTo ensures that the index does not hold any data that is not
2756 // present in the ground truth. If the document store lost some documents,
2757 // TruncateTo will ensure that the index does not contain any hits from those
2758 // lost documents. If the index does not contain any hits for documents with
2759 // document id greater than last_stored_document_id, then TruncateTo will have
2760 // no effect.
2761 ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id));
2762
2763 // Get last indexed document id for term index after truncating.
2764 DocumentId term_index_last_added_document_id =
2765 index_->last_added_document_id();
2766 DocumentId first_document_to_reindex =
2767 (term_index_last_added_document_id != kInvalidDocumentId)
2768 ? term_index_last_added_document_id + 1
2769 : kMinDocumentId;
2770 bool index_needed_restoration =
2771 (last_stored_document_id != term_index_last_added_document_id);
2772
2773 // Attempt to truncate integer index.
2774 bool integer_index_needed_restoration = false;
2775 DocumentId integer_index_last_added_document_id =
2776 integer_index_->last_added_document_id();
2777 if (integer_index_last_added_document_id == kInvalidDocumentId ||
2778 last_stored_document_id > integer_index_last_added_document_id) {
2779 // If last_stored_document_id is greater than
2780 // integer_index_last_added_document_id, then we only have to replay docs
2781 // starting from integer_index_last_added_document_id + 1. Also use std::min
2782 // since we might need to replay even smaller doc ids for term index.
2783 integer_index_needed_restoration = true;
2784 if (integer_index_last_added_document_id != kInvalidDocumentId) {
2785 first_document_to_reindex = std::min(
2786 first_document_to_reindex, integer_index_last_added_document_id + 1);
2787 } else {
2788 first_document_to_reindex = kMinDocumentId;
2789 }
2790 } else if (last_stored_document_id < integer_index_last_added_document_id) {
2791 // Clear the entire integer index if last_stored_document_id is smaller than
2792 // integer_index_last_added_document_id, because there is no way to remove
2793 // data with doc_id > last_stored_document_id from integer index and we have
2794 // to rebuild.
2795 ICING_RETURN_IF_ERROR(integer_index_->Clear());
2796
2797 // Since the entire integer index is discarded, we start to rebuild it by
2798 // setting first_document_to_reindex to kMinDocumentId.
2799 integer_index_needed_restoration = true;
2800 first_document_to_reindex = kMinDocumentId;
2801 }
2802
2803 // Attempt to truncate qualified id join index
2804 bool qualified_id_join_index_needed_restoration = false;
2805 DocumentId qualified_id_join_index_last_added_document_id =
2806 qualified_id_join_index_->last_added_document_id();
2807 if (qualified_id_join_index_last_added_document_id == kInvalidDocumentId ||
2808 last_stored_document_id >
2809 qualified_id_join_index_last_added_document_id) {
2810 // If last_stored_document_id is greater than
2811 // qualified_id_join_index_last_added_document_id, then we only have to
2812 // replay docs starting from (qualified_id_join_index_last_added_document_id
2813 // + 1). Also use std::min since we might need to replay even smaller doc
2814 // ids for other components.
2815 qualified_id_join_index_needed_restoration = true;
2816 if (qualified_id_join_index_last_added_document_id != kInvalidDocumentId) {
2817 first_document_to_reindex =
2818 std::min(first_document_to_reindex,
2819 qualified_id_join_index_last_added_document_id + 1);
2820 } else {
2821 first_document_to_reindex = kMinDocumentId;
2822 }
2823 } else if (last_stored_document_id <
2824 qualified_id_join_index_last_added_document_id) {
2825 // Clear the entire qualified id join index if last_stored_document_id is
2826 // smaller than qualified_id_join_index_last_added_document_id, because
2827 // there is no way to remove data with doc_id > last_stored_document_id from
2828 // join index efficiently and we have to rebuild.
2829 ICING_RETURN_IF_ERROR(qualified_id_join_index_->Clear());
2830
2831 // Since the entire qualified id join index is discarded, we start to
2832 // rebuild it by setting first_document_to_reindex to kMinDocumentId.
2833 qualified_id_join_index_needed_restoration = true;
2834 first_document_to_reindex = kMinDocumentId;
2835 }
2836
2837 // Attempt to truncate embedding index
2838 bool embedding_index_needed_restoration = false;
2839 DocumentId embedding_index_last_added_document_id =
2840 embedding_index_->last_added_document_id();
2841 if (embedding_index_last_added_document_id == kInvalidDocumentId ||
2842 last_stored_document_id > embedding_index_last_added_document_id) {
2843 // If last_stored_document_id is greater than
2844 // embedding_index_last_added_document_id, then we only have to replay docs
2845 // starting from (embedding_index_last_added_document_id + 1). Also use
2846 // std::min since we might need to replay even smaller doc ids for other
2847 // components.
2848 embedding_index_needed_restoration = true;
2849 if (embedding_index_last_added_document_id != kInvalidDocumentId) {
2850 first_document_to_reindex =
2851 std::min(first_document_to_reindex,
2852 embedding_index_last_added_document_id + 1);
2853 } else {
2854 first_document_to_reindex = kMinDocumentId;
2855 }
2856 } else if (last_stored_document_id < embedding_index_last_added_document_id) {
2857 // Clear the entire embedding index if last_stored_document_id is
2858 // smaller than embedding_index_last_added_document_id, because
2859 // there is no way to remove data with doc_id > last_stored_document_id from
2860 // embedding index efficiently and we have to rebuild.
2861 ICING_RETURN_IF_ERROR(embedding_index_->Clear());
2862
2863 // Since the entire embedding index is discarded, we start to
2864 // rebuild it by setting first_document_to_reindex to kMinDocumentId.
2865 embedding_index_needed_restoration = true;
2866 first_document_to_reindex = kMinDocumentId;
2867 }
2868
2869 return TruncateIndexResult(first_document_to_reindex,
2870 index_needed_restoration,
2871 integer_index_needed_restoration,
2872 qualified_id_join_index_needed_restoration,
2873 embedding_index_needed_restoration);
2874 }
2875
DiscardDerivedFiles(const version_util::DerivedFilesRebuildResult & rebuild_result)2876 libtextclassifier3::Status IcingSearchEngine::DiscardDerivedFiles(
2877 const version_util::DerivedFilesRebuildResult& rebuild_result) {
2878 if (!rebuild_result.IsRebuildNeeded()) {
2879 return libtextclassifier3::Status::OK;
2880 }
2881
2882 if (schema_store_ != nullptr || document_store_ != nullptr ||
2883 index_ != nullptr || integer_index_ != nullptr ||
2884 qualified_id_join_index_ != nullptr || embedding_index_ != nullptr) {
2885 return absl_ports::FailedPreconditionError(
2886 "Cannot discard derived files while having valid instances");
2887 }
2888
2889 // Schema store
2890 if (rebuild_result.needs_schema_store_derived_files_rebuild) {
2891 ICING_RETURN_IF_ERROR(SchemaStore::DiscardDerivedFiles(
2892 filesystem_.get(), options_.base_dir()));
2893 }
2894
2895 // Document store
2896 if (rebuild_result.needs_document_store_derived_files_rebuild) {
2897 ICING_RETURN_IF_ERROR(DocumentStore::DiscardDerivedFiles(
2898 filesystem_.get(), options_.base_dir()));
2899 }
2900
2901 // Term index
2902 if (rebuild_result.needs_term_index_rebuild) {
2903 if (!filesystem_->DeleteDirectoryRecursively(
2904 MakeIndexDirectoryPath(options_.base_dir()).c_str())) {
2905 return absl_ports::InternalError("Failed to discard index");
2906 }
2907 }
2908
2909 // Integer index
2910 if (rebuild_result.needs_integer_index_rebuild) {
2911 if (!filesystem_->DeleteDirectoryRecursively(
2912 MakeIntegerIndexWorkingPath(options_.base_dir()).c_str())) {
2913 return absl_ports::InternalError("Failed to discard integer index");
2914 }
2915 }
2916
2917 // Qualified id join index
2918 if (rebuild_result.needs_qualified_id_join_index_rebuild) {
2919 if (!filesystem_->DeleteDirectoryRecursively(
2920 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()).c_str())) {
2921 return absl_ports::InternalError(
2922 "Failed to discard qualified id join index");
2923 }
2924 }
2925
2926 // TODO(b/326656531): Update version-util to consider embedding index.
2927
2928 return libtextclassifier3::Status::OK;
2929 }
2930
ClearSearchIndices()2931 libtextclassifier3::Status IcingSearchEngine::ClearSearchIndices() {
2932 ICING_RETURN_IF_ERROR(index_->Reset());
2933 ICING_RETURN_IF_ERROR(integer_index_->Clear());
2934 ICING_RETURN_IF_ERROR(embedding_index_->Clear());
2935 return libtextclassifier3::Status::OK;
2936 }
2937
ClearJoinIndices()2938 libtextclassifier3::Status IcingSearchEngine::ClearJoinIndices() {
2939 return qualified_id_join_index_->Clear();
2940 }
2941
ClearAllIndices()2942 libtextclassifier3::Status IcingSearchEngine::ClearAllIndices() {
2943 ICING_RETURN_IF_ERROR(ClearSearchIndices());
2944 ICING_RETURN_IF_ERROR(ClearJoinIndices());
2945 return libtextclassifier3::Status::OK;
2946 }
2947
Reset()2948 ResetResultProto IcingSearchEngine::Reset() {
2949 absl_ports::unique_lock l(&mutex_);
2950 return ResetInternal();
2951 }
2952
ResetInternal()2953 ResetResultProto IcingSearchEngine::ResetInternal() {
2954 ICING_VLOG(1) << "Resetting IcingSearchEngine";
2955
2956 ResetResultProto result_proto;
2957 StatusProto* result_status = result_proto.mutable_status();
2958
2959 initialized_ = false;
2960 ResetMembers();
2961 if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
2962 result_status->set_code(StatusProto::INTERNAL);
2963 return result_proto;
2964 }
2965
2966 if (InternalInitialize().status().code() != StatusProto::OK) {
2967 // We shouldn't hit the following Initialize errors:
2968 // NOT_FOUND: all data was cleared, we aren't expecting anything
2969 // DATA_LOSS: all data was cleared, we aren't expecting anything
2970 // RESOURCE_EXHAUSTED: just deleted files, shouldn't run out of space
2971 //
2972 // We can't tell if Initialize failed and left Icing in an inconsistent
2973 // state or if it was a temporary I/O error. Group everything under INTERNAL
2974 // to be safe.
2975 //
2976 // TODO(b/147699081): Once Initialize returns the proper ABORTED/INTERNAL
2977 // status code, we can just propagate it up from here.
2978 result_status->set_code(StatusProto::INTERNAL);
2979 return result_proto;
2980 }
2981
2982 result_status->set_code(StatusProto::OK);
2983 return result_proto;
2984 }
2985
SearchSuggestions(const SuggestionSpecProto & suggestion_spec)2986 SuggestionResponse IcingSearchEngine::SearchSuggestions(
2987 const SuggestionSpecProto& suggestion_spec) {
2988 // TODO(b/146008613) Explore ideas to make this function read-only.
2989 absl_ports::unique_lock l(&mutex_);
2990 SuggestionResponse response;
2991 StatusProto* response_status = response.mutable_status();
2992 if (!initialized_) {
2993 response_status->set_code(StatusProto::FAILED_PRECONDITION);
2994 response_status->set_message("IcingSearchEngine has not been initialized!");
2995 return response;
2996 }
2997
2998 libtextclassifier3::Status status =
2999 ValidateSuggestionSpec(suggestion_spec, performance_configuration_);
3000 if (!status.ok()) {
3001 TransformStatus(status, response_status);
3002 return response;
3003 }
3004
3005 // Create the suggestion processor.
3006 auto suggestion_processor_or = SuggestionProcessor::Create(
3007 index_.get(), integer_index_.get(), embedding_index_.get(),
3008 language_segmenter_.get(), normalizer_.get(), document_store_.get(),
3009 schema_store_.get(), clock_.get());
3010 if (!suggestion_processor_or.ok()) {
3011 TransformStatus(suggestion_processor_or.status(), response_status);
3012 return response;
3013 }
3014 std::unique_ptr<SuggestionProcessor> suggestion_processor =
3015 std::move(suggestion_processor_or).ValueOrDie();
3016
3017 // Run suggestion based on given SuggestionSpec.
3018 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
3019 libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
3020 suggestion_processor->QuerySuggestions(suggestion_spec, current_time_ms);
3021 if (!terms_or.ok()) {
3022 TransformStatus(terms_or.status(), response_status);
3023 return response;
3024 }
3025
3026 // Convert vector<TermMetaData> into final SuggestionResponse proto.
3027 for (TermMetadata& term : terms_or.ValueOrDie()) {
3028 SuggestionResponse::Suggestion suggestion;
3029 suggestion.set_query(std::move(term.content));
3030 response.mutable_suggestions()->Add(std::move(suggestion));
3031 }
3032 response_status->set_code(StatusProto::OK);
3033 return response;
3034 }
3035
3036 } // namespace lib
3037 } // namespace icing
3038