• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/icing-search-engine.h"
16 
17 #include <algorithm>
18 #include <cstddef>
19 #include <cstdint>
20 #include <functional>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/annotate.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/mutex.h"
34 #include "icing/absl_ports/str_cat.h"
35 #include "icing/file/destructible-file.h"
36 #include "icing/file/file-backed-proto.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/file/version-util.h"
39 #include "icing/index/data-indexing-handler.h"
40 #include "icing/index/embed/embedding-index.h"
41 #include "icing/index/embedding-indexing-handler.h"
42 #include "icing/index/hit/doc-hit-info.h"
43 #include "icing/index/index-processor.h"
44 #include "icing/index/index.h"
45 #include "icing/index/integer-section-indexing-handler.h"
46 #include "icing/index/iterator/doc-hit-info-iterator.h"
47 #include "icing/index/numeric/integer-index.h"
48 #include "icing/index/term-indexing-handler.h"
49 #include "icing/index/term-metadata.h"
50 #include "icing/jni/jni-cache.h"
51 #include "icing/join/join-children-fetcher.h"
52 #include "icing/join/join-processor.h"
53 #include "icing/join/qualified-id-join-index-impl-v1.h"
54 #include "icing/join/qualified-id-join-index-impl-v2.h"
55 #include "icing/join/qualified-id-join-index.h"
56 #include "icing/join/qualified-id-join-indexing-handler.h"
57 #include "icing/legacy/index/icing-filesystem.h"
58 #include "icing/performance-configuration.h"
59 #include "icing/portable/endian.h"
60 #include "icing/proto/debug.pb.h"
61 #include "icing/proto/document.pb.h"
62 #include "icing/proto/initialize.pb.h"
63 #include "icing/proto/internal/optimize.pb.h"
64 #include "icing/proto/logging.pb.h"
65 #include "icing/proto/optimize.pb.h"
66 #include "icing/proto/persist.pb.h"
67 #include "icing/proto/reset.pb.h"
68 #include "icing/proto/schema.pb.h"
69 #include "icing/proto/scoring.pb.h"
70 #include "icing/proto/search.pb.h"
71 #include "icing/proto/status.pb.h"
72 #include "icing/proto/storage.pb.h"
73 #include "icing/proto/term.pb.h"
74 #include "icing/proto/usage.pb.h"
75 #include "icing/query/advanced_query_parser/lexer.h"
76 #include "icing/query/query-features.h"
77 #include "icing/query/query-processor.h"
78 #include "icing/query/query-results.h"
79 #include "icing/query/suggestion-processor.h"
80 #include "icing/result/page-result.h"
81 #include "icing/result/projection-tree.h"
82 #include "icing/result/projector.h"
83 #include "icing/result/result-adjustment-info.h"
84 #include "icing/result/result-retriever-v2.h"
85 #include "icing/result/result-state-manager.h"
86 #include "icing/schema/schema-store.h"
87 #include "icing/scoring/advanced_scoring/score-expression.h"
88 #include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
89 #include "icing/scoring/scored-document-hit.h"
90 #include "icing/scoring/scored-document-hits-ranker.h"
91 #include "icing/scoring/scoring-processor.h"
92 #include "icing/store/document-id.h"
93 #include "icing/store/document-store.h"
94 #include "icing/tokenization/language-segmenter-factory.h"
95 #include "icing/transform/normalizer-factory.h"
96 #include "icing/util/clock.h"
97 #include "icing/util/data-loss.h"
98 #include "icing/util/logging.h"
99 #include "icing/util/status-macros.h"
100 #include "icing/util/tokenized-document.h"
101 #include "unicode/uloc.h"
102 
103 namespace icing {
104 namespace lib {
105 
106 namespace {
107 
// Names of the sub-directories and marker/status files used by the engine.
// The Make*Path helpers in this file join most of them onto the base
// directory.
constexpr std::string_view kDocumentSubfolderName{"document_dir"};
constexpr std::string_view kIndexSubfolderName{"index_dir"};
constexpr std::string_view kIntegerIndexSubfolderName{"integer_index_dir"};
constexpr std::string_view kQualifiedIdJoinIndexSubfolderName{
    "qualified_id_join_index_dir"};
constexpr std::string_view kEmbeddingIndexSubfolderName{"embedding_index_dir"};
constexpr std::string_view kSchemaSubfolderName{"schema_dir"};
constexpr std::string_view kSetSchemaMarkerFilename{"set_schema_marker"};
constexpr std::string_view kInitMarkerFilename{"init_marker"};
constexpr std::string_view kOptimizeStatusFilename{"optimize_status"};

// Upper bound on consecutive failed initialization attempts from the current
// on-disk state. Once the recorded count exceeds this, all data is deleted and
// initialization restarts from a fresh state.
constexpr int kMaxUnsuccessfulInitAttempts{5};
123 
// A (namespace, schema type) pair, usable as a hash-map key together with
// NamespaceTypePairHasher.
struct NamespaceTypePair {
  std::string namespace_;
  std::string type;

  // Two pairs are equal only when both the namespace and the type match.
  bool operator==(const NamespaceTypePair& other) const {
    return namespace_ == other.namespace_ && type == other.type;
  }
};

// Hash functor for NamespaceTypePair.
//
// The two field hashes are mixed with an order-dependent combine rather than
// a plain XOR: XOR is symmetric, so {"a", "b"} and {"b", "a"} would always
// collide, and every pair whose namespace equals its type would hash to 0.
struct NamespaceTypePairHasher {
  std::size_t operator()(const NamespaceTypePair& pair) const {
    const std::size_t namespace_hash =
        std::hash<std::string>()(pair.namespace_);
    const std::size_t type_hash = std::hash<std::string>()(pair.type);
    // Same mixing step as boost::hash_combine.
    return namespace_hash ^ (type_hash + 0x9e3779b9 + (namespace_hash << 6) +
                             (namespace_hash >> 2));
  }
};
140 
ValidateResultSpec(const DocumentStore * document_store,const ResultSpecProto & result_spec)141 libtextclassifier3::Status ValidateResultSpec(
142     const DocumentStore* document_store, const ResultSpecProto& result_spec) {
143   if (result_spec.num_per_page() < 0) {
144     return absl_ports::InvalidArgumentError(
145         "ResultSpecProto.num_per_page cannot be negative.");
146   }
147   if (result_spec.num_total_bytes_per_page_threshold() <= 0) {
148     return absl_ports::InvalidArgumentError(
149         "ResultSpecProto.num_total_bytes_per_page_threshold cannot be "
150         "non-positive.");
151   }
152   if (result_spec.max_joined_children_per_parent_to_return() < 0) {
153     return absl_ports::InvalidArgumentError(
154         "ResultSpecProto.max_joined_children_per_parent_to_return cannot be "
155         "negative.");
156   }
157   if (result_spec.num_to_score() <= 0) {
158     return absl_ports::InvalidArgumentError(
159         "ResultSpecProto.num_to_score cannot be non-positive.");
160   }
161   // Validate ResultGroupings.
162   std::unordered_set<int32_t> unique_entry_ids;
163   ResultSpecProto::ResultGroupingType result_grouping_type =
164       result_spec.result_group_type();
165   for (const ResultSpecProto::ResultGrouping& result_grouping :
166        result_spec.result_groupings()) {
167     if (result_grouping.max_results() <= 0) {
168       return absl_ports::InvalidArgumentError(
169           "Cannot specify a result grouping with max results <= 0.");
170     }
171     for (const ResultSpecProto::ResultGrouping::Entry& entry :
172          result_grouping.entry_groupings()) {
173       const std::string& name_space = entry.namespace_();
174       const std::string& schema = entry.schema();
175       auto entry_id_or = document_store->GetResultGroupingEntryId(
176           result_grouping_type, name_space, schema);
177       if (!entry_id_or.ok()) {
178         continue;
179       }
180       int32_t entry_id = entry_id_or.ValueOrDie();
181       if (unique_entry_ids.find(entry_id) != unique_entry_ids.end()) {
182         return absl_ports::InvalidArgumentError(
183             "Entry Ids must be unique across result groups.");
184       }
185       unique_entry_ids.insert(entry_id);
186     }
187   }
188   return libtextclassifier3::Status::OK;
189 }
190 
ValidateSearchSpec(const SearchSpecProto & search_spec,const PerformanceConfiguration & configuration)191 libtextclassifier3::Status ValidateSearchSpec(
192     const SearchSpecProto& search_spec,
193     const PerformanceConfiguration& configuration) {
194   if (search_spec.query().size() > configuration.max_query_length) {
195     return absl_ports::InvalidArgumentError(
196         absl_ports::StrCat("SearchSpecProto.query is longer than the maximum "
197                            "allowed query length: ",
198                            std::to_string(configuration.max_query_length)));
199   }
200   // Check that no unknown features have been enabled in the search spec.
201   std::unordered_set<Feature> query_features_set = GetQueryFeaturesSet();
202   for (const Feature feature : search_spec.enabled_features()) {
203     if (query_features_set.find(feature) == query_features_set.end()) {
204       return absl_ports::InvalidArgumentError(
205           absl_ports::StrCat("Unknown feature in "
206                              "SearchSpecProto.enabled_features: ",
207                              feature));
208     }
209   }
210   return libtextclassifier3::Status::OK;
211 }
212 
ValidateSuggestionSpec(const SuggestionSpecProto & suggestion_spec,const PerformanceConfiguration & configuration)213 libtextclassifier3::Status ValidateSuggestionSpec(
214     const SuggestionSpecProto& suggestion_spec,
215     const PerformanceConfiguration& configuration) {
216   if (suggestion_spec.prefix().empty()) {
217     return absl_ports::InvalidArgumentError(
218         absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
219   }
220   if (suggestion_spec.scoring_spec().scoring_match_type() ==
221       TermMatchType::UNKNOWN) {
222     return absl_ports::InvalidArgumentError(
223         absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
224   }
225   if (suggestion_spec.num_to_return() <= 0) {
226     return absl_ports::InvalidArgumentError(absl_ports::StrCat(
227         "SuggestionSpecProto.num_to_return must be positive."));
228   }
229   if (suggestion_spec.prefix().size() > configuration.max_query_length) {
230     return absl_ports::InvalidArgumentError(
231         absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the "
232                            "maximum allowed prefix length: ",
233                            std::to_string(configuration.max_query_length)));
234   }
235   return libtextclassifier3::Status::OK;
236 }
237 
// Reports whether the V2 qualified id join index should be used. Currently
// unconditional: `options` is accepted but not consulted.
bool IsV2QualifiedIdJoinIndexEnabled(
    [[maybe_unused]] const IcingSearchEngineOptions& options) {
  return true;
}
241 
242 libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
CreateQualifiedIdJoinIndex(const Filesystem & filesystem,std::string qualified_id_join_index_dir,const IcingSearchEngineOptions & options)243 CreateQualifiedIdJoinIndex(const Filesystem& filesystem,
244                            std::string qualified_id_join_index_dir,
245                            const IcingSearchEngineOptions& options) {
246   if (IsV2QualifiedIdJoinIndexEnabled(options)) {
247     // V2
248     return QualifiedIdJoinIndexImplV2::Create(
249         filesystem, std::move(qualified_id_join_index_dir),
250         options.pre_mapping_fbv());
251   } else {
252     // V1
253     // TODO(b/275121148): deprecate this part after rollout v2.
254     return QualifiedIdJoinIndexImplV1::Create(
255         filesystem, std::move(qualified_id_join_index_dir),
256         options.pre_mapping_fbv(), options.use_persistent_hash_map());
257   }
258 }
259 
260 // Document store files are in a standalone subfolder for easier file
261 // management. We can delete and recreate the subfolder and not touch/affect
262 // anything else.
MakeDocumentDirectoryPath(const std::string & base_dir)263 std::string MakeDocumentDirectoryPath(const std::string& base_dir) {
264   return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName);
265 }
266 
267 // Makes a temporary folder path for the document store which will be used
268 // during full optimization.
MakeDocumentTemporaryDirectoryPath(const std::string & base_dir)269 std::string MakeDocumentTemporaryDirectoryPath(const std::string& base_dir) {
270   return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName,
271                             "_optimize_tmp");
272 }
273 
274 // Index files are in a standalone subfolder because for easier file management.
275 // We can delete and recreate the subfolder and not touch/affect anything
276 // else.
MakeIndexDirectoryPath(const std::string & base_dir)277 std::string MakeIndexDirectoryPath(const std::string& base_dir) {
278   return absl_ports::StrCat(base_dir, "/", kIndexSubfolderName);
279 }
280 
281 // Working path for integer index. Integer index is derived from
282 // PersistentStorage and it will take full ownership of this working path,
283 // including creation/deletion. See PersistentStorage for more details about
284 // working path.
MakeIntegerIndexWorkingPath(const std::string & base_dir)285 std::string MakeIntegerIndexWorkingPath(const std::string& base_dir) {
286   return absl_ports::StrCat(base_dir, "/", kIntegerIndexSubfolderName);
287 }
288 
289 // Working path for qualified id join index. It is derived from
290 // PersistentStorage and it will take full ownership of this working path,
291 // including creation/deletion. See PersistentStorage for more details about
292 // working path.
MakeQualifiedIdJoinIndexWorkingPath(const std::string & base_dir)293 std::string MakeQualifiedIdJoinIndexWorkingPath(const std::string& base_dir) {
294   return absl_ports::StrCat(base_dir, "/", kQualifiedIdJoinIndexSubfolderName);
295 }
296 
297 // Working path for embedding index.
MakeEmbeddingIndexWorkingPath(const std::string & base_dir)298 std::string MakeEmbeddingIndexWorkingPath(const std::string& base_dir) {
299   return absl_ports::StrCat(base_dir, "/", kEmbeddingIndexSubfolderName);
300 }
301 
302 // SchemaStore files are in a standalone subfolder for easier file management.
303 // We can delete and recreate the subfolder and not touch/affect anything
304 // else.
MakeSchemaDirectoryPath(const std::string & base_dir)305 std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
306   return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
307 }
308 
MakeSetSchemaMarkerFilePath(const std::string & base_dir)309 std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
310   return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
311 }
312 
MakeInitMarkerFilePath(const std::string & base_dir)313 std::string MakeInitMarkerFilePath(const std::string& base_dir) {
314   return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename);
315 }
316 
TransformStatus(const libtextclassifier3::Status & internal_status,StatusProto * status_proto)317 void TransformStatus(const libtextclassifier3::Status& internal_status,
318                      StatusProto* status_proto) {
319   StatusProto::Code code;
320   if (!internal_status.ok()) {
321     ICING_LOG(WARNING) << "Error: " << internal_status.error_code()
322                        << ", Message: " << internal_status.error_message();
323   }
324   switch (internal_status.CanonicalCode()) {
325     case libtextclassifier3::StatusCode::OK:
326       code = StatusProto::OK;
327       break;
328     case libtextclassifier3::StatusCode::DATA_LOSS:
329       code = StatusProto::WARNING_DATA_LOSS;
330       break;
331     case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
332       code = StatusProto::INVALID_ARGUMENT;
333       break;
334     case libtextclassifier3::StatusCode::NOT_FOUND:
335       code = StatusProto::NOT_FOUND;
336       break;
337     case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
338       code = StatusProto::FAILED_PRECONDITION;
339       break;
340     case libtextclassifier3::StatusCode::ABORTED:
341       code = StatusProto::ABORTED;
342       break;
343     case libtextclassifier3::StatusCode::INTERNAL:
344       // TODO(b/147699081): Cleanup our internal use of INTERNAL since it
345       // doesn't match with what it *should* indicate as described in
346       // go/icing-library-apis.
347       code = StatusProto::INTERNAL;
348       break;
349     case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
350       // TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
351       // (e.g. if the document log is full). And we use RESOURCE_EXHAUSTED
352       // internally to indicate other resources are exhausted (e.g.
353       // DocHitInfos) - although none of these are exposed through the API.
354       // Consider separating the two cases out more clearly.
355       code = StatusProto::OUT_OF_SPACE;
356       break;
357     case libtextclassifier3::StatusCode::ALREADY_EXISTS:
358       code = StatusProto::ALREADY_EXISTS;
359       break;
360     case libtextclassifier3::StatusCode::CANCELLED:
361       [[fallthrough]];
362     case libtextclassifier3::StatusCode::UNKNOWN:
363       [[fallthrough]];
364     case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
365       [[fallthrough]];
366     case libtextclassifier3::StatusCode::PERMISSION_DENIED:
367       [[fallthrough]];
368     case libtextclassifier3::StatusCode::OUT_OF_RANGE:
369       [[fallthrough]];
370     case libtextclassifier3::StatusCode::UNIMPLEMENTED:
371       [[fallthrough]];
372     case libtextclassifier3::StatusCode::UNAVAILABLE:
373       [[fallthrough]];
374     case libtextclassifier3::StatusCode::UNAUTHENTICATED:
375       // Other internal status codes aren't supported externally yet. If it
376       // should be supported, add another switch-case above.
377       ICING_LOG(ERROR) << "Internal status code "
378                        << internal_status.error_code()
379                        << " not supported in the external API";
380       code = StatusProto::UNKNOWN;
381       break;
382   }
383   status_proto->set_code(code);
384   status_proto->set_message(internal_status.error_message());
385 }
386 
RetrieveAndAddDocumentInfo(const DocumentStore * document_store,DeleteByQueryResultProto & result_proto,std::unordered_map<NamespaceTypePair,DeleteByQueryResultProto::DocumentGroupInfo *,NamespaceTypePairHasher> & info_map,DocumentId document_id)387 libtextclassifier3::Status RetrieveAndAddDocumentInfo(
388     const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
389     std::unordered_map<NamespaceTypePair,
390                        DeleteByQueryResultProto::DocumentGroupInfo*,
391                        NamespaceTypePairHasher>& info_map,
392     DocumentId document_id) {
393   ICING_ASSIGN_OR_RETURN(DocumentProto document,
394                          document_store->Get(document_id));
395   NamespaceTypePair key = {document.namespace_(), document.schema()};
396   auto iter = info_map.find(key);
397   if (iter == info_map.end()) {
398     auto entry = result_proto.add_deleted_documents();
399     entry->set_namespace_(std::move(document.namespace_()));
400     entry->set_schema(std::move(document.schema()));
401     entry->add_uris(std::move(document.uri()));
402     info_map[key] = entry;
403   } else {
404     iter->second->add_uris(std::move(document.uri()));
405   }
406   return libtextclassifier3::Status::OK;
407 }
408 
ShouldRebuildIndex(const OptimizeStatsProto & optimize_stats,float optimize_rebuild_index_threshold)409 bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats,
410                         float optimize_rebuild_index_threshold) {
411   int num_invalid_documents = optimize_stats.num_deleted_documents() +
412                               optimize_stats.num_expired_documents();
413   return num_invalid_documents >= optimize_stats.num_original_documents() *
414                                       optimize_rebuild_index_threshold;
415 }
416 
ScoringExpressionHasRelevanceScoreFunction(std::string_view scoring_expression)417 libtextclassifier3::StatusOr<bool> ScoringExpressionHasRelevanceScoreFunction(
418     std::string_view scoring_expression) {
419   // TODO(b/261474063) The Lexer will be called again when creating the
420   // AdvancedScorer instance. Consider refactoring the code to allow the Lexer
421   // to be called only once.
422   Lexer lexer(scoring_expression, Lexer::Language::SCORING);
423   ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
424                          lexer.ExtractTokens());
425   for (const Lexer::LexerToken& token : lexer_tokens) {
426     if (token.type == Lexer::TokenType::FUNCTION_NAME &&
427         token.text == RelevanceScoreFunctionScoreExpression::kFunctionName) {
428       return true;
429     }
430   }
431   return false;
432 }
433 
434 // Useful method to get RankingStrategy if advanced scoring is enabled. When the
435 // "RelevanceScore" function is used in the advanced scoring expression,
436 // RankingStrategy will be treated as RELEVANCE_SCORE in order to prepare the
437 // necessary information needed for calculating relevance score.
438 libtextclassifier3::StatusOr<ScoringSpecProto::RankingStrategy::Code>
GetRankingStrategyFromScoringSpec(const ScoringSpecProto & scoring_spec)439 GetRankingStrategyFromScoringSpec(const ScoringSpecProto& scoring_spec) {
440   if (scoring_spec.advanced_scoring_expression().empty() &&
441       scoring_spec.additional_advanced_scoring_expressions().empty()) {
442     return scoring_spec.rank_by();
443   }
444 
445   ICING_ASSIGN_OR_RETURN(bool has_relevance_score_function,
446                          ScoringExpressionHasRelevanceScoreFunction(
447                              scoring_spec.advanced_scoring_expression()));
448   if (has_relevance_score_function) {
449     return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
450   }
451   for (std::string_view additional_scoring_expression :
452        scoring_spec.additional_advanced_scoring_expressions()) {
453     ICING_ASSIGN_OR_RETURN(has_relevance_score_function,
454                            ScoringExpressionHasRelevanceScoreFunction(
455                                additional_scoring_expression));
456     if (has_relevance_score_function) {
457       return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
458     }
459   }
460   return ScoringSpecProto::RankingStrategy::NONE;
461 }
462 
463 }  // namespace
464 
IcingSearchEngine(const IcingSearchEngineOptions & options,std::unique_ptr<const JniCache> jni_cache)465 IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
466                                      std::unique_ptr<const JniCache> jni_cache)
467     : IcingSearchEngine(options, std::make_unique<Filesystem>(),
468                         std::make_unique<IcingFilesystem>(),
469                         std::make_unique<Clock>(), std::move(jni_cache)) {}
470 
IcingSearchEngine(IcingSearchEngineOptions options,std::unique_ptr<const Filesystem> filesystem,std::unique_ptr<const IcingFilesystem> icing_filesystem,std::unique_ptr<Clock> clock,std::unique_ptr<const JniCache> jni_cache)471 IcingSearchEngine::IcingSearchEngine(
472     IcingSearchEngineOptions options,
473     std::unique_ptr<const Filesystem> filesystem,
474     std::unique_ptr<const IcingFilesystem> icing_filesystem,
475     std::unique_ptr<Clock> clock, std::unique_ptr<const JniCache> jni_cache)
476     : options_(std::move(options)),
477       filesystem_(std::move(filesystem)),
478       icing_filesystem_(std::move(icing_filesystem)),
479       clock_(std::move(clock)),
480       jni_cache_(std::move(jni_cache)) {
481   ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
482 }
483 
~IcingSearchEngine()484 IcingSearchEngine::~IcingSearchEngine() {
485   if (initialized_) {
486     if (PersistToDisk(PersistType::FULL).status().code() != StatusProto::OK) {
487       ICING_LOG(ERROR)
488           << "Error persisting to disk in IcingSearchEngine destructor";
489     }
490   }
491 }
492 
Initialize()493 InitializeResultProto IcingSearchEngine::Initialize() {
494   // This method does both read and write so we need a writer lock. Using two
495   // locks (reader and writer) has the chance to be interrupted during
496   // switching.
497   absl_ports::unique_lock l(&mutex_);
498   return InternalInitialize();
499 }
500 
ResetMembers()501 void IcingSearchEngine::ResetMembers() {
502   schema_store_.reset();
503   document_store_.reset();
504   language_segmenter_.reset();
505   normalizer_.reset();
506   index_.reset();
507   integer_index_.reset();
508   qualified_id_join_index_.reset();
509   embedding_index_.reset();
510 }
511 
CheckInitMarkerFile(InitializeStatsProto * initialize_stats)512 libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile(
513     InitializeStatsProto* initialize_stats) {
514   // Check to see if the marker file exists and if we've already passed our max
515   // number of init attempts.
516   std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
517   bool file_exists = filesystem_->FileExists(marker_filepath.c_str());
518   int network_init_attempts = 0;
519   int host_init_attempts = 0;
520 
521   // Read the number of previous failed init attempts from the file. If it
522   // fails, then just assume the value is zero (the most likely reason for
523   // failure would be non-existence because the last init was successful
524   // anyways).
525   std::unique_ptr<ScopedFd> marker_file_fd = std::make_unique<ScopedFd>(
526       filesystem_->OpenForWrite(marker_filepath.c_str()));
527   libtextclassifier3::Status status;
528   if (file_exists &&
529       filesystem_->PRead(marker_file_fd->get(), &network_init_attempts,
530                          sizeof(network_init_attempts), /*offset=*/0)) {
531     host_init_attempts = GNetworkToHostL(network_init_attempts);
532     if (host_init_attempts > kMaxUnsuccessfulInitAttempts) {
533       // We're tried and failed to init too many times. We need to throw
534       // everything out and start from scratch.
535       ResetMembers();
536       marker_file_fd.reset();
537 
538       // Delete the entire base directory.
539       if (!filesystem_->DeleteDirectoryRecursively(
540               options_.base_dir().c_str())) {
541         return absl_ports::InternalError("Failed to delete icing base dir!");
542       }
543 
544       // Create the base directory again and reopen marker file.
545       if (!filesystem_->CreateDirectoryRecursively(
546               options_.base_dir().c_str())) {
547         return absl_ports::InternalError("Failed to create icing base dir!");
548       }
549 
550       marker_file_fd = std::make_unique<ScopedFd>(
551           filesystem_->OpenForWrite(marker_filepath.c_str()));
552 
553       status = absl_ports::DataLossError(
554           "Encountered failed initialization limit. Cleared all data.");
555       host_init_attempts = 0;
556     }
557   }
558 
559   // Use network_init_attempts here because we might have set host_init_attempts
560   // to 0 if it exceeded the max threshold.
561   initialize_stats->set_num_previous_init_failures(
562       GNetworkToHostL(network_init_attempts));
563 
564   ++host_init_attempts;
565   network_init_attempts = GHostToNetworkL(host_init_attempts);
566   // Write the updated number of attempts before we get started.
567   if (!filesystem_->PWrite(marker_file_fd->get(), /*offset=*/0,
568                            &network_init_attempts,
569                            sizeof(network_init_attempts)) ||
570       !filesystem_->DataSync(marker_file_fd->get())) {
571     return absl_ports::InternalError(
572         "Failed to write and sync init marker file");
573   }
574 
575   return status;
576 }
577 
InternalInitialize()578 InitializeResultProto IcingSearchEngine::InternalInitialize() {
579   ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
580                 << options_.base_dir();
581 
582   // Measure the latency of the initialization process.
583   std::unique_ptr<Timer> initialize_timer = clock_->GetNewTimer();
584 
585   InitializeResultProto result_proto;
586   StatusProto* result_status = result_proto.mutable_status();
587   InitializeStatsProto* initialize_stats =
588       result_proto.mutable_initialize_stats();
589   if (initialized_) {
590     // Already initialized.
591     result_status->set_code(StatusProto::OK);
592     initialize_stats->set_latency_ms(
593         initialize_timer->GetElapsedMilliseconds());
594     initialize_stats->set_num_documents(document_store_->num_documents());
595     return result_proto;
596   }
597 
598   // Now go ahead and try to initialize.
599   libtextclassifier3::Status status = InitializeMembers(initialize_stats);
600   if (status.ok() || absl_ports::IsDataLoss(status)) {
601     // We successfully initialized. We should delete the init marker file to
602     // indicate a successful init.
603     std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
604     if (!filesystem_->DeleteFile(marker_filepath.c_str())) {
605       status = absl_ports::InternalError("Failed to delete init marker file!");
606     } else {
607       initialized_ = true;
608     }
609   }
610   TransformStatus(status, result_status);
611   initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds());
612   return result_proto;
613 }
614 
InitializeMembers(InitializeStatsProto * initialize_stats)615 libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
616     InitializeStatsProto* initialize_stats) {
617   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
618 
619   // Make sure the base directory exists
620   if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
621     return absl_ports::InternalError(absl_ports::StrCat(
622         "Could not create directory: ", options_.base_dir()));
623   }
624 
625   // Check to see if the marker file exists and if we've already passed our max
626   // number of init attempts.
627   libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats);
628   if (!status.ok() && !absl_ports::IsDataLoss(status)) {
629     return status;
630   }
631 
632   // Do version and flags compatibility check
633   // Read version file, determine the state change and rebuild derived files if
634   // needed.
635   const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
636   ICING_ASSIGN_OR_RETURN(
637       IcingSearchEngineVersionProto stored_version_proto,
638       version_util::ReadVersion(
639           *filesystem_, /*version_file_dir=*/options_.base_dir(), index_dir));
640   version_util::VersionInfo stored_version_info =
641       version_util::GetVersionInfoFromProto(stored_version_proto);
642   version_util::StateChange version_state_change =
643       version_util::GetVersionStateChange(stored_version_info);
644 
645   // Construct icing's current version proto based on the current code version
646   IcingSearchEngineVersionProto current_version_proto;
647   current_version_proto.set_version(version_util::kVersion);
648   current_version_proto.set_max_version(
649       std::max(stored_version_info.max_version, version_util::kVersion));
650   version_util::AddEnabledFeatures(options_, &current_version_proto);
651 
652   // Step 1: If versions are incompatible, migrate schema according to the
653   // version state change.
654   if (version_state_change != version_util::StateChange::kCompatible) {
655     ICING_RETURN_IF_ERROR(SchemaStore::MigrateSchema(
656         filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()),
657         version_state_change, version_util::kVersion));
658   }
659 
660   // Step 2: Discard derived files that need to be rebuilt
661   version_util::DerivedFilesRebuildResult required_derived_files_rebuild =
662       version_util::CalculateRequiredDerivedFilesRebuild(stored_version_proto,
663                                                          current_version_proto);
664   ICING_RETURN_IF_ERROR(DiscardDerivedFiles(required_derived_files_rebuild));
665 
666   // Step 3: update version files. We need to update both the V1 and V2
667   // version files.
668   ICING_RETURN_IF_ERROR(version_util::WriteV1Version(
669       *filesystem_, /*version_file_dir=*/options_.base_dir(),
670       version_util::GetVersionInfoFromProto(current_version_proto)));
671   ICING_RETURN_IF_ERROR(version_util::WriteV2Version(
672       *filesystem_, /*version_file_dir=*/options_.base_dir(),
673       std::make_unique<IcingSearchEngineVersionProto>(
674           std::move(current_version_proto))));
675 
676   ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
677 
678   // TODO(b/156383798) : Resolve how to specify the locale.
679   language_segmenter_factory::SegmenterOptions segmenter_options(
680       ULOC_US, jni_cache_.get());
681   TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create(
682                                                 std::move(segmenter_options)));
683 
684   TC3_ASSIGN_OR_RETURN(normalizer_,
685                        normalizer_factory::Create(options_.max_token_length()));
686 
687   std::string marker_filepath =
688       MakeSetSchemaMarkerFilePath(options_.base_dir());
689 
690   libtextclassifier3::Status index_init_status;
691   if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
692     // The schema was either lost or never set before. Wipe out the doc store
693     // and index directories and initialize them from scratch.
694     const std::string doc_store_dir =
695         MakeDocumentDirectoryPath(options_.base_dir());
696     const std::string integer_index_dir =
697         MakeIntegerIndexWorkingPath(options_.base_dir());
698     const std::string qualified_id_join_index_dir =
699         MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
700     const std::string embedding_index_dir =
701         MakeEmbeddingIndexWorkingPath(options_.base_dir());
702     if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) ||
703         !filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
704         !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() ||
705         !QualifiedIdJoinIndex::Discard(*filesystem_,
706                                        qualified_id_join_index_dir)
707              .ok() ||
708         !EmbeddingIndex::Discard(*filesystem_, embedding_index_dir).ok()) {
709       return absl_ports::InternalError(absl_ports::StrCat(
710           "Could not delete directories: ", index_dir, ", ", integer_index_dir,
711           ", ", qualified_id_join_index_dir, ", ", embedding_index_dir, " and ",
712           doc_store_dir));
713     }
714     ICING_ASSIGN_OR_RETURN(
715         bool document_store_derived_files_regenerated,
716         InitializeDocumentStore(
717             /*force_recovery_and_revalidate_documents=*/false,
718             initialize_stats));
719     index_init_status = InitializeIndex(
720         document_store_derived_files_regenerated, initialize_stats);
721     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
722       return index_init_status;
723     }
724   } else if (filesystem_->FileExists(marker_filepath.c_str())) {
725     // If the marker file is still around then something wonky happened when we
726     // last tried to set the schema.
727     //
728     // Since we're going to rebuild all indices in this case, the return value
729     // of InitializeDocumentStore (document_store_derived_files_regenerated) is
730     // unused.
731     ICING_RETURN_IF_ERROR(InitializeDocumentStore(
732         /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
733 
734     // We're going to need to build the index from scratch. So just delete its
735     // directory now.
736     // Discard index directory and instantiate a new one.
737     Index::Options index_options(index_dir, options_.index_merge_size(),
738                                  /*lite_index_sort_at_indexing=*/true,
739                                  options_.lite_index_sort_size());
740     if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
741         !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
742       return absl_ports::InternalError(
743           absl_ports::StrCat("Could not recreate directory: ", index_dir));
744     }
745     ICING_ASSIGN_OR_RETURN(index_,
746                            Index::Create(index_options, filesystem_.get(),
747                                          icing_filesystem_.get()));
748 
749     // Discard integer index directory and instantiate a new one.
750     std::string integer_index_dir =
751         MakeIntegerIndexWorkingPath(options_.base_dir());
752     ICING_RETURN_IF_ERROR(
753         IntegerIndex::Discard(*filesystem_, integer_index_dir));
754     ICING_ASSIGN_OR_RETURN(
755         integer_index_,
756         IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
757                              options_.integer_index_bucket_split_threshold(),
758                              options_.pre_mapping_fbv()));
759 
760     // Discard qualified id join index directory and instantiate a new one.
761     std::string qualified_id_join_index_dir =
762         MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
763     ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
764         *filesystem_, qualified_id_join_index_dir));
765     ICING_ASSIGN_OR_RETURN(
766         qualified_id_join_index_,
767         CreateQualifiedIdJoinIndex(
768             *filesystem_, std::move(qualified_id_join_index_dir), options_));
769 
770     // Discard embedding index directory and instantiate a new one.
771     std::string embedding_index_dir =
772         MakeEmbeddingIndexWorkingPath(options_.base_dir());
773     ICING_RETURN_IF_ERROR(
774         EmbeddingIndex::Discard(*filesystem_, embedding_index_dir));
775     ICING_ASSIGN_OR_RETURN(
776         embedding_index_,
777         EmbeddingIndex::Create(filesystem_.get(), embedding_index_dir));
778 
779     std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
780     IndexRestorationResult restore_result = RestoreIndexIfNeeded();
781     index_init_status = std::move(restore_result.status);
782     // DATA_LOSS means that we have successfully initialized and re-added
783     // content to the index. Some indexed content was lost, but otherwise the
784     // index is in a valid state and can be queried.
785     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
786       return index_init_status;
787     }
788 
789     // Delete the marker file to indicate that everything is now in sync with
790     // whatever changes were made to the schema.
791     filesystem_->DeleteFile(marker_filepath.c_str());
792 
793     initialize_stats->set_index_restoration_latency_ms(
794         restore_timer->GetElapsedMilliseconds());
795     initialize_stats->set_index_restoration_cause(
796         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
797     initialize_stats->set_integer_index_restoration_cause(
798         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
799     initialize_stats->set_qualified_id_join_index_restoration_cause(
800         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
801     initialize_stats->set_embedding_index_restoration_cause(
802         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
803   } else if (version_state_change != version_util::StateChange::kCompatible) {
804     ICING_ASSIGN_OR_RETURN(bool document_store_derived_files_regenerated,
805                            InitializeDocumentStore(
806                                /*force_recovery_and_revalidate_documents=*/true,
807                                initialize_stats));
808     index_init_status = InitializeIndex(
809         document_store_derived_files_regenerated, initialize_stats);
810     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
811       return index_init_status;
812     }
813 
814     initialize_stats->set_schema_store_recovery_cause(
815         InitializeStatsProto::VERSION_CHANGED);
816     initialize_stats->set_document_store_recovery_cause(
817         InitializeStatsProto::VERSION_CHANGED);
818     initialize_stats->set_index_restoration_cause(
819         InitializeStatsProto::VERSION_CHANGED);
820     initialize_stats->set_integer_index_restoration_cause(
821         InitializeStatsProto::VERSION_CHANGED);
822     initialize_stats->set_qualified_id_join_index_restoration_cause(
823         InitializeStatsProto::VERSION_CHANGED);
824     initialize_stats->set_embedding_index_restoration_cause(
825         InitializeStatsProto::VERSION_CHANGED);
826   } else {
827     ICING_ASSIGN_OR_RETURN(
828         bool document_store_derived_files_regenerated,
829         InitializeDocumentStore(
830             /*force_recovery_and_revalidate_documents=*/false,
831             initialize_stats));
832     index_init_status = InitializeIndex(
833         document_store_derived_files_regenerated, initialize_stats);
834     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
835       return index_init_status;
836     }
837 
838     // Set recovery cause to FEATURE_FLAG_CHANGED according to the calculated
839     // required_derived_files_rebuild
840     if (required_derived_files_rebuild
841             .needs_document_store_derived_files_rebuild) {
842       initialize_stats->set_document_store_recovery_cause(
843           InitializeStatsProto::FEATURE_FLAG_CHANGED);
844     }
845     if (required_derived_files_rebuild
846             .needs_schema_store_derived_files_rebuild) {
847       initialize_stats->set_schema_store_recovery_cause(
848           InitializeStatsProto::FEATURE_FLAG_CHANGED);
849     }
850     if (required_derived_files_rebuild.needs_term_index_rebuild) {
851       initialize_stats->set_index_restoration_cause(
852           InitializeStatsProto::FEATURE_FLAG_CHANGED);
853     }
854     if (required_derived_files_rebuild.needs_integer_index_rebuild) {
855       initialize_stats->set_integer_index_restoration_cause(
856           InitializeStatsProto::FEATURE_FLAG_CHANGED);
857     }
858     if (required_derived_files_rebuild.needs_qualified_id_join_index_rebuild) {
859       initialize_stats->set_qualified_id_join_index_restoration_cause(
860           InitializeStatsProto::FEATURE_FLAG_CHANGED);
861     }
862     // TODO(b/326656531): Update version-util to consider embedding index.
863   }
864 
865   if (status.ok()) {
866     status = index_init_status;
867   }
868 
869   result_state_manager_ = std::make_unique<ResultStateManager>(
870       performance_configuration_.max_num_total_hits, *document_store_);
871 
872   return status;
873 }
874 
InitializeSchemaStore(InitializeStatsProto * initialize_stats)875 libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
876     InitializeStatsProto* initialize_stats) {
877   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
878 
879   const std::string schema_store_dir =
880       MakeSchemaDirectoryPath(options_.base_dir());
881   // Make sure the sub-directory exists
882   if (!filesystem_->CreateDirectoryRecursively(schema_store_dir.c_str())) {
883     return absl_ports::InternalError(
884         absl_ports::StrCat("Could not create directory: ", schema_store_dir));
885   }
886   ICING_ASSIGN_OR_RETURN(
887       schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir,
888                                          clock_.get(), initialize_stats));
889 
890   return libtextclassifier3::Status::OK;
891 }
892 
InitializeDocumentStore(bool force_recovery_and_revalidate_documents,InitializeStatsProto * initialize_stats)893 libtextclassifier3::StatusOr<bool> IcingSearchEngine::InitializeDocumentStore(
894     bool force_recovery_and_revalidate_documents,
895     InitializeStatsProto* initialize_stats) {
896   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
897 
898   const std::string document_dir =
899       MakeDocumentDirectoryPath(options_.base_dir());
900   // Make sure the sub-directory exists
901   if (!filesystem_->CreateDirectoryRecursively(document_dir.c_str())) {
902     return absl_ports::InternalError(
903         absl_ports::StrCat("Could not create directory: ", document_dir));
904   }
905   ICING_ASSIGN_OR_RETURN(
906       DocumentStore::CreateResult create_result,
907       DocumentStore::Create(
908           filesystem_.get(), document_dir, clock_.get(), schema_store_.get(),
909           force_recovery_and_revalidate_documents,
910           /*document_store_namespace_id_fingerprint=*/true,
911           /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
912           options_.compression_level(), initialize_stats));
913   document_store_ = std::move(create_result.document_store);
914 
915   return create_result.derived_files_regenerated;
916 }
917 
InitializeIndex(bool document_store_derived_files_regenerated,InitializeStatsProto * initialize_stats)918 libtextclassifier3::Status IcingSearchEngine::InitializeIndex(
919     bool document_store_derived_files_regenerated,
920     InitializeStatsProto* initialize_stats) {
921   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
922 
923   const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
924   // Make sure the sub-directory exists
925   if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
926     return absl_ports::InternalError(
927         absl_ports::StrCat("Could not create directory: ", index_dir));
928   }
929   Index::Options index_options(index_dir, options_.index_merge_size(),
930                                /*lite_index_sort_at_indexing=*/true,
931                                options_.lite_index_sort_size());
932 
933   // Term index
934   InitializeStatsProto::RecoveryCause index_recovery_cause;
935   auto index_or =
936       Index::Create(index_options, filesystem_.get(), icing_filesystem_.get());
937   if (!index_or.ok()) {
938     if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
939         !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
940       return absl_ports::InternalError(
941           absl_ports::StrCat("Could not recreate directory: ", index_dir));
942     }
943 
944     index_recovery_cause = InitializeStatsProto::IO_ERROR;
945 
946     // Try recreating it from scratch and re-indexing everything.
947     ICING_ASSIGN_OR_RETURN(index_,
948                            Index::Create(index_options, filesystem_.get(),
949                                          icing_filesystem_.get()));
950   } else {
951     // Index was created fine.
952     index_ = std::move(index_or).ValueOrDie();
953     // If a recover does have to happen, then it must be because the index is
954     // out of sync with the document store.
955     index_recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
956   }
957 
958   // Integer index
959   std::string integer_index_dir =
960       MakeIntegerIndexWorkingPath(options_.base_dir());
961   InitializeStatsProto::RecoveryCause integer_index_recovery_cause;
962   auto integer_index_or =
963       IntegerIndex::Create(*filesystem_, integer_index_dir,
964                            options_.integer_index_bucket_split_threshold(),
965                            options_.pre_mapping_fbv());
966   if (!integer_index_or.ok()) {
967     ICING_RETURN_IF_ERROR(
968         IntegerIndex::Discard(*filesystem_, integer_index_dir));
969 
970     integer_index_recovery_cause = InitializeStatsProto::IO_ERROR;
971 
972     // Try recreating it from scratch and re-indexing everything.
973     ICING_ASSIGN_OR_RETURN(
974         integer_index_,
975         IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
976                              options_.integer_index_bucket_split_threshold(),
977                              options_.pre_mapping_fbv()));
978   } else {
979     // Integer index was created fine.
980     integer_index_ = std::move(integer_index_or).ValueOrDie();
981     // If a recover does have to happen, then it must be because the index is
982     // out of sync with the document store.
983     integer_index_recovery_cause =
984         InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
985   }
986 
987   // Qualified id join index
988   std::string qualified_id_join_index_dir =
989       MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
990   InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause;
991   if (document_store_derived_files_regenerated &&
992       IsV2QualifiedIdJoinIndexEnabled(options_)) {
993     // V2 qualified id join index depends on document store derived files, so we
994     // have to rebuild it from scratch if
995     // document_store_derived_files_regenerated is true.
996     ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
997         *filesystem_, qualified_id_join_index_dir));
998 
999     ICING_ASSIGN_OR_RETURN(
1000         qualified_id_join_index_,
1001         CreateQualifiedIdJoinIndex(
1002             *filesystem_, std::move(qualified_id_join_index_dir), options_));
1003 
1004     qualified_id_join_index_recovery_cause =
1005         InitializeStatsProto::DEPENDENCIES_CHANGED;
1006   } else {
1007     auto qualified_id_join_index_or = CreateQualifiedIdJoinIndex(
1008         *filesystem_, qualified_id_join_index_dir, options_);
1009     if (!qualified_id_join_index_or.ok()) {
1010       ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
1011           *filesystem_, qualified_id_join_index_dir));
1012 
1013       qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1014 
1015       // Try recreating it from scratch and rebuild everything.
1016       ICING_ASSIGN_OR_RETURN(
1017           qualified_id_join_index_,
1018           CreateQualifiedIdJoinIndex(
1019               *filesystem_, std::move(qualified_id_join_index_dir), options_));
1020     } else {
1021       // Qualified id join index was created fine.
1022       qualified_id_join_index_ =
1023           std::move(qualified_id_join_index_or).ValueOrDie();
1024       // If a recover does have to happen, then it must be because the index is
1025       // out of sync with the document store.
1026       qualified_id_join_index_recovery_cause =
1027           InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1028     }
1029   }
1030 
1031   // Embedding index
1032   const std::string embedding_dir =
1033       MakeEmbeddingIndexWorkingPath(options_.base_dir());
1034   InitializeStatsProto::RecoveryCause embedding_index_recovery_cause;
1035   auto embedding_index_or =
1036       EmbeddingIndex::Create(filesystem_.get(), embedding_dir);
1037   if (!embedding_index_or.ok()) {
1038     ICING_RETURN_IF_ERROR(EmbeddingIndex::Discard(*filesystem_, embedding_dir));
1039 
1040     embedding_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1041 
1042     // Try recreating it from scratch and re-indexing everything.
1043     ICING_ASSIGN_OR_RETURN(
1044         embedding_index_,
1045         EmbeddingIndex::Create(filesystem_.get(), embedding_dir));
1046   } else {
1047     // Embedding index was created fine.
1048     embedding_index_ = std::move(embedding_index_or).ValueOrDie();
1049     // If a recover does have to happen, then it must be because the index is
1050     // out of sync with the document store.
1051     embedding_index_recovery_cause =
1052         InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1053   }
1054 
1055   std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
1056   IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1057   if (restore_result.index_needed_restoration ||
1058       restore_result.integer_index_needed_restoration ||
1059       restore_result.qualified_id_join_index_needed_restoration) {
1060     initialize_stats->set_index_restoration_latency_ms(
1061         restore_timer->GetElapsedMilliseconds());
1062 
1063     if (restore_result.index_needed_restoration) {
1064       initialize_stats->set_index_restoration_cause(index_recovery_cause);
1065     }
1066     if (restore_result.integer_index_needed_restoration) {
1067       initialize_stats->set_integer_index_restoration_cause(
1068           integer_index_recovery_cause);
1069     }
1070     if (restore_result.qualified_id_join_index_needed_restoration) {
1071       initialize_stats->set_qualified_id_join_index_restoration_cause(
1072           qualified_id_join_index_recovery_cause);
1073     }
1074     if (restore_result.embedding_index_needed_restoration) {
1075       initialize_stats->set_embedding_index_restoration_cause(
1076           embedding_index_recovery_cause);
1077     }
1078   }
1079   return restore_result.status;
1080 }
1081 
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents)1082 SetSchemaResultProto IcingSearchEngine::SetSchema(
1083     const SchemaProto& new_schema, bool ignore_errors_and_delete_documents) {
1084   return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
1085 }
1086 
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents)1087 SetSchemaResultProto IcingSearchEngine::SetSchema(
1088     SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) {
1089   ICING_VLOG(1) << "Setting new Schema";
1090 
1091   SetSchemaResultProto result_proto;
1092   StatusProto* result_status = result_proto.mutable_status();
1093 
1094   absl_ports::unique_lock l(&mutex_);
1095   ScopedTimer timer(clock_->GetNewTimer(), [&result_proto](int64_t t) {
1096     result_proto.set_latency_ms(t);
1097   });
1098   if (!initialized_) {
1099     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1100     result_status->set_message("IcingSearchEngine has not been initialized!");
1101     return result_proto;
1102   }
1103 
1104   auto lost_previous_schema_or = LostPreviousSchema();
1105   if (!lost_previous_schema_or.ok()) {
1106     TransformStatus(lost_previous_schema_or.status(), result_status);
1107     return result_proto;
1108   }
1109   bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
1110 
1111   std::string marker_filepath =
1112       MakeSetSchemaMarkerFilePath(options_.base_dir());
1113   // Create the marker file indicating that we are going to apply a schema
1114   // change. No need to write anything to the marker file - its existence is the
1115   // only thing that matters. The marker file is used to indicate if we
1116   // encountered a crash or a power loss while updating the schema and other
1117   // files. So set it up to be deleted as long as we return from this function.
1118   DestructibleFile marker_file(marker_filepath, filesystem_.get());
1119 
1120   auto set_schema_result_or = schema_store_->SetSchema(
1121       std::move(new_schema), ignore_errors_and_delete_documents,
1122       options_.allow_circular_schema_definitions());
1123   if (!set_schema_result_or.ok()) {
1124     TransformStatus(set_schema_result_or.status(), result_status);
1125     return result_proto;
1126   }
1127   SchemaStore::SetSchemaResult set_schema_result =
1128       std::move(set_schema_result_or).ValueOrDie();
1129 
1130   for (const std::string& deleted_type :
1131        set_schema_result.schema_types_deleted_by_name) {
1132     result_proto.add_deleted_schema_types(deleted_type);
1133   }
1134 
1135   for (const std::string& incompatible_type :
1136        set_schema_result.schema_types_incompatible_by_name) {
1137     result_proto.add_incompatible_schema_types(incompatible_type);
1138   }
1139 
1140   for (const std::string& new_type :
1141        set_schema_result.schema_types_new_by_name) {
1142     result_proto.add_new_schema_types(std::move(new_type));
1143   }
1144 
1145   for (const std::string& compatible_type :
1146        set_schema_result.schema_types_changed_fully_compatible_by_name) {
1147     result_proto.add_fully_compatible_changed_schema_types(
1148         std::move(compatible_type));
1149   }
1150 
1151   bool index_incompatible =
1152       !set_schema_result.schema_types_index_incompatible_by_name.empty();
1153   for (const std::string& index_incompatible_type :
1154        set_schema_result.schema_types_index_incompatible_by_name) {
1155     result_proto.add_index_incompatible_changed_schema_types(
1156         std::move(index_incompatible_type));
1157   }
1158 
1159   bool join_incompatible =
1160       !set_schema_result.schema_types_join_incompatible_by_name.empty();
1161   for (const std::string& join_incompatible_type :
1162        set_schema_result.schema_types_join_incompatible_by_name) {
1163     result_proto.add_join_incompatible_changed_schema_types(
1164         std::move(join_incompatible_type));
1165   }
1166 
1167   libtextclassifier3::Status status;
1168   if (set_schema_result.success) {
1169     if (lost_previous_schema) {
1170       // No previous schema to calculate a diff against. We have to go through
1171       // and revalidate all the Documents in the DocumentStore
1172       status = document_store_->UpdateSchemaStore(schema_store_.get());
1173       if (!status.ok()) {
1174         TransformStatus(status, result_status);
1175         return result_proto;
1176       }
1177     } else if (!set_schema_result.old_schema_type_ids_changed.empty() ||
1178                !set_schema_result.schema_types_incompatible_by_id.empty() ||
1179                !set_schema_result.schema_types_deleted_by_id.empty()) {
1180       status = document_store_->OptimizedUpdateSchemaStore(schema_store_.get(),
1181                                                            set_schema_result);
1182       if (!status.ok()) {
1183         TransformStatus(status, result_status);
1184         return result_proto;
1185       }
1186     }
1187 
1188     if (lost_previous_schema || index_incompatible) {
1189       // Clears search indices
1190       status = ClearSearchIndices();
1191       if (!status.ok()) {
1192         TransformStatus(status, result_status);
1193         return result_proto;
1194       }
1195     }
1196 
1197     if (lost_previous_schema || join_incompatible) {
1198       // Clears join indices
1199       status = ClearJoinIndices();
1200       if (!status.ok()) {
1201         TransformStatus(status, result_status);
1202         return result_proto;
1203       }
1204     }
1205 
1206     if (lost_previous_schema || index_incompatible || join_incompatible) {
1207       IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1208       // DATA_LOSS means that we have successfully re-added content to the
1209       // index. Some indexed content was lost, but otherwise the index is in a
1210       // valid state and can be queried.
1211       if (!restore_result.status.ok() &&
1212           !absl_ports::IsDataLoss(restore_result.status)) {
1213         TransformStatus(status, result_status);
1214         return result_proto;
1215       }
1216     }
1217 
1218     result_status->set_code(StatusProto::OK);
1219   } else {
1220     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1221     result_status->set_message("Schema is incompatible.");
1222   }
1223 
1224   return result_proto;
1225 }
1226 
GetSchema()1227 GetSchemaResultProto IcingSearchEngine::GetSchema() {
1228   GetSchemaResultProto result_proto;
1229   StatusProto* result_status = result_proto.mutable_status();
1230 
1231   absl_ports::shared_lock l(&mutex_);
1232   if (!initialized_) {
1233     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1234     result_status->set_message("IcingSearchEngine has not been initialized!");
1235     return result_proto;
1236   }
1237 
1238   auto schema_or = schema_store_->GetSchema();
1239   if (!schema_or.ok()) {
1240     TransformStatus(schema_or.status(), result_status);
1241     return result_proto;
1242   }
1243 
1244   result_status->set_code(StatusProto::OK);
1245   *result_proto.mutable_schema() = *std::move(schema_or).ValueOrDie();
1246   return result_proto;
1247 }
1248 
GetSchemaType(std::string_view schema_type)1249 GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType(
1250     std::string_view schema_type) {
1251   GetSchemaTypeResultProto result_proto;
1252   StatusProto* result_status = result_proto.mutable_status();
1253 
1254   absl_ports::shared_lock l(&mutex_);
1255   if (!initialized_) {
1256     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1257     result_status->set_message("IcingSearchEngine has not been initialized!");
1258     return result_proto;
1259   }
1260 
1261   auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type);
1262   if (!type_config_or.ok()) {
1263     TransformStatus(type_config_or.status(), result_status);
1264     return result_proto;
1265   }
1266 
1267   result_status->set_code(StatusProto::OK);
1268   *result_proto.mutable_schema_type_config() = *(type_config_or.ValueOrDie());
1269   return result_proto;
1270 }
1271 
Put(const DocumentProto & document)1272 PutResultProto IcingSearchEngine::Put(const DocumentProto& document) {
1273   return Put(DocumentProto(document));
1274 }
1275 
Put(DocumentProto && document)1276 PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
1277   ICING_VLOG(1) << "Writing document to document store";
1278 
1279   PutResultProto result_proto;
1280   StatusProto* result_status = result_proto.mutable_status();
1281   PutDocumentStatsProto* put_document_stats =
1282       result_proto.mutable_put_document_stats();
1283   ScopedTimer put_timer(clock_->GetNewTimer(), [put_document_stats](int64_t t) {
1284     put_document_stats->set_latency_ms(t);
1285   });
1286 
1287   // Lock must be acquired before validation because the DocumentStore uses
1288   // the schema file to validate, and the schema could be changed in
1289   // SetSchema() which is protected by the same mutex.
1290   absl_ports::unique_lock l(&mutex_);
1291   if (!initialized_) {
1292     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1293     result_status->set_message("IcingSearchEngine has not been initialized!");
1294     return result_proto;
1295   }
1296 
1297   auto tokenized_document_or = TokenizedDocument::Create(
1298       schema_store_.get(), language_segmenter_.get(), std::move(document));
1299   if (!tokenized_document_or.ok()) {
1300     TransformStatus(tokenized_document_or.status(), result_status);
1301     return result_proto;
1302   }
1303   TokenizedDocument tokenized_document(
1304       std::move(tokenized_document_or).ValueOrDie());
1305 
1306   auto document_id_or = document_store_->Put(
1307       tokenized_document.document(), tokenized_document.num_string_tokens(),
1308       put_document_stats);
1309   if (!document_id_or.ok()) {
1310     TransformStatus(document_id_or.status(), result_status);
1311     return result_proto;
1312   }
1313   DocumentId document_id = document_id_or.ValueOrDie();
1314 
1315   auto data_indexing_handlers_or = CreateDataIndexingHandlers();
1316   if (!data_indexing_handlers_or.ok()) {
1317     TransformStatus(data_indexing_handlers_or.status(), result_status);
1318     return result_proto;
1319   }
1320   IndexProcessor index_processor(
1321       std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get());
1322 
1323   auto index_status = index_processor.IndexDocument(
1324       tokenized_document, document_id, put_document_stats);
1325   // Getting an internal error from the index could possibly mean that the index
1326   // is broken. Try to rebuild them to recover.
1327   if (absl_ports::IsInternal(index_status)) {
1328     ICING_LOG(ERROR) << "Got an internal error from the index. Trying to "
1329                         "rebuild the index!\n"
1330                      << index_status.error_message();
1331     index_status = ClearAllIndices();
1332     if (index_status.ok()) {
1333       index_status = RestoreIndexIfNeeded().status;
1334       if (!index_status.ok()) {
1335         ICING_LOG(ERROR) << "Failed to reindex documents after a failure of "
1336                             "indexing a document.";
1337       }
1338     } else {
1339       ICING_LOG(ERROR)
1340           << "Failed to clear indices after a failure of indexing a document.";
1341     }
1342   }
1343 
1344   if (!index_status.ok()) {
1345     // If we encountered a failure or cannot resolve an internal error while
1346     // indexing this document, then mark it as deleted.
1347     int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1348     libtextclassifier3::Status delete_status =
1349         document_store_->Delete(document_id, current_time_ms);
1350     if (!delete_status.ok()) {
1351       // This is pretty dire (and, hopefully, unlikely). We can't roll back the
1352       // document that we just added. Wipeout the whole index.
1353       ICING_LOG(ERROR) << "Cannot delete the document that is failed to index. "
1354                           "Wiping out the whole Icing search engine.";
1355       ResetInternal();
1356     }
1357   }
1358 
1359   TransformStatus(index_status, result_status);
1360   return result_proto;
1361 }
1362 
Get(const std::string_view name_space,const std::string_view uri,const GetResultSpecProto & result_spec)1363 GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
1364                                       const std::string_view uri,
1365                                       const GetResultSpecProto& result_spec) {
1366   GetResultProto result_proto;
1367   StatusProto* result_status = result_proto.mutable_status();
1368 
1369   absl_ports::shared_lock l(&mutex_);
1370   if (!initialized_) {
1371     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1372     result_status->set_message("IcingSearchEngine has not been initialized!");
1373     return result_proto;
1374   }
1375 
1376   auto document_or = document_store_->Get(name_space, uri);
1377   if (!document_or.ok()) {
1378     TransformStatus(document_or.status(), result_status);
1379     return result_proto;
1380   }
1381 
1382   DocumentProto document = std::move(document_or).ValueOrDie();
1383   std::unique_ptr<ProjectionTree> type_projection_tree;
1384   std::unique_ptr<ProjectionTree> wildcard_projection_tree;
1385   for (const SchemaStore::ExpandedTypePropertyMask& type_field_mask :
1386        schema_store_->ExpandTypePropertyMasks(
1387            result_spec.type_property_masks())) {
1388     if (type_field_mask.schema_type == document.schema()) {
1389       type_projection_tree = std::make_unique<ProjectionTree>(type_field_mask);
1390     } else if (type_field_mask.schema_type ==
1391                SchemaStore::kSchemaTypeWildcard) {
1392       wildcard_projection_tree =
1393           std::make_unique<ProjectionTree>(type_field_mask);
1394     }
1395   }
1396 
1397   // Apply projection
1398   if (type_projection_tree != nullptr) {
1399     projector::Project(type_projection_tree->root().children, &document);
1400   } else if (wildcard_projection_tree != nullptr) {
1401     projector::Project(wildcard_projection_tree->root().children, &document);
1402   }
1403 
1404   result_status->set_code(StatusProto::OK);
1405   *result_proto.mutable_document() = std::move(document);
1406   return result_proto;
1407 }
1408 
ReportUsage(const UsageReport & usage_report)1409 ReportUsageResultProto IcingSearchEngine::ReportUsage(
1410     const UsageReport& usage_report) {
1411   ReportUsageResultProto result_proto;
1412   StatusProto* result_status = result_proto.mutable_status();
1413 
1414   absl_ports::unique_lock l(&mutex_);
1415   if (!initialized_) {
1416     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1417     result_status->set_message("IcingSearchEngine has not been initialized!");
1418     return result_proto;
1419   }
1420 
1421   libtextclassifier3::Status status =
1422       document_store_->ReportUsage(usage_report);
1423   TransformStatus(status, result_status);
1424   return result_proto;
1425 }
1426 
GetAllNamespaces()1427 GetAllNamespacesResultProto IcingSearchEngine::GetAllNamespaces() {
1428   GetAllNamespacesResultProto result_proto;
1429   StatusProto* result_status = result_proto.mutable_status();
1430 
1431   absl_ports::shared_lock l(&mutex_);
1432   if (!initialized_) {
1433     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1434     result_status->set_message("IcingSearchEngine has not been initialized!");
1435     return result_proto;
1436   }
1437 
1438   std::vector<std::string> namespaces = document_store_->GetAllNamespaces();
1439 
1440   for (const std::string& namespace_ : namespaces) {
1441     result_proto.add_namespaces(namespace_);
1442   }
1443 
1444   result_status->set_code(StatusProto::OK);
1445   return result_proto;
1446 }
1447 
Delete(const std::string_view name_space,const std::string_view uri)1448 DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
1449                                             const std::string_view uri) {
1450   ICING_VLOG(1) << "Deleting document from doc store";
1451 
1452   DeleteResultProto result_proto;
1453   StatusProto* result_status = result_proto.mutable_status();
1454 
1455   absl_ports::unique_lock l(&mutex_);
1456   if (!initialized_) {
1457     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1458     result_status->set_message("IcingSearchEngine has not been initialized!");
1459     return result_proto;
1460   }
1461 
1462   DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats();
1463   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
1464 
1465   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1466   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1467   // that can support error logging.
1468   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1469   libtextclassifier3::Status status =
1470       document_store_->Delete(name_space, uri, current_time_ms);
1471   if (!status.ok()) {
1472     LogSeverity::Code severity = ERROR;
1473     if (absl_ports::IsNotFound(status)) {
1474       severity = DBG;
1475     }
1476     ICING_LOG(severity) << status.error_message()
1477                         << "Failed to delete Document. namespace: "
1478                         << name_space << ", uri: " << uri;
1479     TransformStatus(status, result_status);
1480     return result_proto;
1481   }
1482 
1483   result_status->set_code(StatusProto::OK);
1484   delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1485   delete_stats->set_num_documents_deleted(1);
1486   return result_proto;
1487 }
1488 
DeleteByNamespace(const std::string_view name_space)1489 DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
1490     const std::string_view name_space) {
1491   ICING_VLOG(1) << "Deleting namespace from doc store";
1492 
1493   DeleteByNamespaceResultProto delete_result;
1494   StatusProto* result_status = delete_result.mutable_status();
1495   absl_ports::unique_lock l(&mutex_);
1496   if (!initialized_) {
1497     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1498     result_status->set_message("IcingSearchEngine has not been initialized!");
1499     return delete_result;
1500   }
1501 
1502   DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1503   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
1504 
1505   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1506   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1507   // that can support error logging.
1508   DocumentStore::DeleteByGroupResult doc_store_result =
1509       document_store_->DeleteByNamespace(name_space);
1510   if (!doc_store_result.status.ok()) {
1511     ICING_LOG(ERROR) << doc_store_result.status.error_message()
1512                      << "Failed to delete Namespace: " << name_space;
1513     TransformStatus(doc_store_result.status, result_status);
1514     return delete_result;
1515   }
1516 
1517   result_status->set_code(StatusProto::OK);
1518   delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1519   delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1520   return delete_result;
1521 }
1522 
DeleteBySchemaType(const std::string_view schema_type)1523 DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
1524     const std::string_view schema_type) {
1525   ICING_VLOG(1) << "Deleting type from doc store";
1526 
1527   DeleteBySchemaTypeResultProto delete_result;
1528   StatusProto* result_status = delete_result.mutable_status();
1529   absl_ports::unique_lock l(&mutex_);
1530   if (!initialized_) {
1531     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1532     result_status->set_message("IcingSearchEngine has not been initialized!");
1533     return delete_result;
1534   }
1535 
1536   DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1537   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
1538 
1539   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1540   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1541   // that can support error logging.
1542   DocumentStore::DeleteByGroupResult doc_store_result =
1543       document_store_->DeleteBySchemaType(schema_type);
1544   if (!doc_store_result.status.ok()) {
1545     ICING_LOG(ERROR) << doc_store_result.status.error_message()
1546                      << "Failed to delete SchemaType: " << schema_type;
1547     TransformStatus(doc_store_result.status, result_status);
1548     return delete_result;
1549   }
1550 
1551   result_status->set_code(StatusProto::OK);
1552   delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1553   delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1554   return delete_result;
1555 }
1556 
DeleteByQuery(const SearchSpecProto & search_spec,bool return_deleted_document_info)1557 DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
1558     const SearchSpecProto& search_spec, bool return_deleted_document_info) {
1559   ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
1560                 << " from doc store";
1561 
1562   DeleteByQueryResultProto result_proto;
1563   StatusProto* result_status = result_proto.mutable_status();
1564 
1565   absl_ports::unique_lock l(&mutex_);
1566   if (!initialized_) {
1567     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1568     result_status->set_message("IcingSearchEngine has not been initialized!");
1569     return result_proto;
1570   }
1571 
1572   DeleteByQueryStatsProto* delete_stats =
1573       result_proto.mutable_delete_by_query_stats();
1574   delete_stats->set_query_length(search_spec.query().length());
1575   delete_stats->set_num_namespaces_filtered(
1576       search_spec.namespace_filters_size());
1577   delete_stats->set_num_schema_types_filtered(
1578       search_spec.schema_type_filters_size());
1579 
1580   ScopedTimer delete_timer(clock_->GetNewTimer(), [delete_stats](int64_t t) {
1581     delete_stats->set_latency_ms(t);
1582   });
1583   libtextclassifier3::Status status =
1584       ValidateSearchSpec(search_spec, performance_configuration_);
1585   if (!status.ok()) {
1586     TransformStatus(status, result_status);
1587     return result_proto;
1588   }
1589 
1590   std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
1591   // Gets unordered results from query processor
1592   auto query_processor_or = QueryProcessor::Create(
1593       index_.get(), integer_index_.get(), embedding_index_.get(),
1594       language_segmenter_.get(), normalizer_.get(), document_store_.get(),
1595       schema_store_.get(), clock_.get());
1596   if (!query_processor_or.ok()) {
1597     TransformStatus(query_processor_or.status(), result_status);
1598     delete_stats->set_parse_query_latency_ms(
1599         component_timer->GetElapsedMilliseconds());
1600     return result_proto;
1601   }
1602   std::unique_ptr<QueryProcessor> query_processor =
1603       std::move(query_processor_or).ValueOrDie();
1604 
1605   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1606   auto query_results_or = query_processor->ParseSearch(
1607       search_spec, ScoringSpecProto::RankingStrategy::NONE, current_time_ms);
1608   if (!query_results_or.ok()) {
1609     TransformStatus(query_results_or.status(), result_status);
1610     delete_stats->set_parse_query_latency_ms(
1611         component_timer->GetElapsedMilliseconds());
1612     return result_proto;
1613   }
1614   QueryResults query_results = std::move(query_results_or).ValueOrDie();
1615   delete_stats->set_parse_query_latency_ms(
1616       component_timer->GetElapsedMilliseconds());
1617 
1618   ICING_VLOG(2) << "Deleting the docs that matched the query.";
1619   int num_deleted = 0;
1620   // A map used to group deleted documents.
1621   // From the (namespace, type) pair to a list of uris.
1622   std::unordered_map<NamespaceTypePair,
1623                      DeleteByQueryResultProto::DocumentGroupInfo*,
1624                      NamespaceTypePairHasher>
1625       deleted_info_map;
1626 
1627   component_timer = clock_->GetNewTimer();
1628   while (query_results.root_iterator->Advance().ok()) {
1629     ICING_VLOG(3) << "Deleting doc "
1630                   << query_results.root_iterator->doc_hit_info().document_id();
1631     ++num_deleted;
1632     if (return_deleted_document_info) {
1633       status = RetrieveAndAddDocumentInfo(
1634           document_store_.get(), result_proto, deleted_info_map,
1635           query_results.root_iterator->doc_hit_info().document_id());
1636       if (!status.ok()) {
1637         TransformStatus(status, result_status);
1638         delete_stats->set_document_removal_latency_ms(
1639             component_timer->GetElapsedMilliseconds());
1640         return result_proto;
1641       }
1642     }
1643     status = document_store_->Delete(
1644         query_results.root_iterator->doc_hit_info().document_id(),
1645         current_time_ms);
1646     if (!status.ok()) {
1647       TransformStatus(status, result_status);
1648       delete_stats->set_document_removal_latency_ms(
1649           component_timer->GetElapsedMilliseconds());
1650       return result_proto;
1651     }
1652   }
1653   delete_stats->set_document_removal_latency_ms(
1654       component_timer->GetElapsedMilliseconds());
1655   int term_count = 0;
1656   for (const auto& section_and_terms : query_results.query_terms) {
1657     term_count += section_and_terms.second.size();
1658   }
1659   delete_stats->set_num_terms(term_count);
1660 
1661   if (num_deleted > 0) {
1662     result_proto.mutable_status()->set_code(StatusProto::OK);
1663   } else {
1664     result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
1665     result_proto.mutable_status()->set_message(
1666         "No documents matched the query to delete by!");
1667   }
1668   delete_stats->set_num_documents_deleted(num_deleted);
1669   return result_proto;
1670 }
1671 
PersistToDisk(PersistType::Code persist_type)1672 PersistToDiskResultProto IcingSearchEngine::PersistToDisk(
1673     PersistType::Code persist_type) {
1674   ICING_VLOG(1) << "Persisting data to disk";
1675 
1676   PersistToDiskResultProto result_proto;
1677   StatusProto* result_status = result_proto.mutable_status();
1678 
1679   absl_ports::unique_lock l(&mutex_);
1680   if (!initialized_) {
1681     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1682     result_status->set_message("IcingSearchEngine has not been initialized!");
1683     return result_proto;
1684   }
1685 
1686   auto status = InternalPersistToDisk(persist_type);
1687   TransformStatus(status, result_status);
1688   return result_proto;
1689 }
1690 
1691 // Optimizes Icing's storage
1692 //
1693 // Steps:
1694 // 1. Flush data to disk.
1695 // 2. Copy data needed to a tmp directory.
1696 // 3. Swap current directory and tmp directory.
Optimize()1697 OptimizeResultProto IcingSearchEngine::Optimize() {
1698   ICING_VLOG(1) << "Optimizing icing storage";
1699 
1700   OptimizeResultProto result_proto;
1701   StatusProto* result_status = result_proto.mutable_status();
1702 
1703   absl_ports::unique_lock l(&mutex_);
1704   if (!initialized_) {
1705     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1706     result_status->set_message("IcingSearchEngine has not been initialized!");
1707     return result_proto;
1708   }
1709 
1710   OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
1711   ScopedTimer optimize_timer(
1712       clock_->GetNewTimer(),
1713       [optimize_stats](int64_t t) { optimize_stats->set_latency_ms(t); });
1714 
1715   // Flushes data to disk before doing optimization
1716   auto status = InternalPersistToDisk(PersistType::FULL);
1717   if (!status.ok()) {
1718     TransformStatus(status, result_status);
1719     return result_proto;
1720   }
1721 
1722   int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1723   optimize_stats->set_storage_size_before(
1724       Filesystem::SanitizeFileSize(before_size));
1725 
1726   // TODO(b/143646633): figure out if we need to optimize index and doc store
1727   // at the same time.
1728   std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer();
1729   libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
1730       optimize_result_or = OptimizeDocumentStore(optimize_stats);
1731   optimize_stats->set_document_store_optimize_latency_ms(
1732       optimize_doc_store_timer->GetElapsedMilliseconds());
1733 
1734   if (!optimize_result_or.ok() &&
1735       !absl_ports::IsDataLoss(optimize_result_or.status())) {
1736     // The status now is either ABORTED_ERROR or INTERNAL_ERROR.
1737     // If ABORTED_ERROR, Icing should still be working.
1738     // If INTERNAL_ERROR, we're having IO errors or other errors that we can't
1739     // recover from.
1740     TransformStatus(optimize_result_or.status(), result_status);
1741     return result_proto;
1742   }
1743 
1744   // The status is either OK or DATA_LOSS. The optimized document store is
1745   // guaranteed to work, so we update index according to the new document store.
1746   std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer();
1747   auto doc_store_optimize_result_status = optimize_result_or.status();
1748   bool should_rebuild_index =
1749       !optimize_result_or.ok() ||
1750       optimize_result_or.ValueOrDie().should_rebuild_index ||
1751       ShouldRebuildIndex(*optimize_stats,
1752                          options_.optimize_rebuild_index_threshold());
1753   if (!should_rebuild_index) {
1754     // At this point should_rebuild_index is false, so it means
1755     // optimize_result_or.ok() is true and therefore it is safe to call
1756     // ValueOrDie.
1757     DocumentStore::OptimizeResult optimize_result =
1758         std::move(optimize_result_or).ValueOrDie();
1759 
1760     optimize_stats->set_index_restoration_mode(
1761         OptimizeStatsProto::INDEX_TRANSLATION);
1762     libtextclassifier3::Status index_optimize_status =
1763         index_->Optimize(optimize_result.document_id_old_to_new,
1764                          document_store_->last_added_document_id());
1765     if (!index_optimize_status.ok()) {
1766       ICING_LOG(WARNING) << "Failed to optimize index. Error: "
1767                          << index_optimize_status.error_message();
1768       should_rebuild_index = true;
1769     }
1770 
1771     libtextclassifier3::Status integer_index_optimize_status =
1772         integer_index_->Optimize(optimize_result.document_id_old_to_new,
1773                                  document_store_->last_added_document_id());
1774     if (!integer_index_optimize_status.ok()) {
1775       ICING_LOG(WARNING) << "Failed to optimize integer index. Error: "
1776                          << integer_index_optimize_status.error_message();
1777       should_rebuild_index = true;
1778     }
1779 
1780     libtextclassifier3::Status qualified_id_join_index_optimize_status =
1781         qualified_id_join_index_->Optimize(
1782             optimize_result.document_id_old_to_new,
1783             optimize_result.namespace_id_old_to_new,
1784             document_store_->last_added_document_id());
1785     if (!qualified_id_join_index_optimize_status.ok()) {
1786       ICING_LOG(WARNING)
1787           << "Failed to optimize qualified id join index. Error: "
1788           << qualified_id_join_index_optimize_status.error_message();
1789       should_rebuild_index = true;
1790     }
1791 
1792     libtextclassifier3::Status embedding_index_optimize_status =
1793         embedding_index_->Optimize(optimize_result.document_id_old_to_new,
1794                                    document_store_->last_added_document_id());
1795     if (!embedding_index_optimize_status.ok()) {
1796       ICING_LOG(WARNING) << "Failed to optimize embedding index. Error: "
1797                          << embedding_index_optimize_status.error_message();
1798       should_rebuild_index = true;
1799     }
1800   }
1801   // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a
1802   // valid document store, but it might be the old one or the new one. So throw
1803   // out the index data and rebuild from scratch.
1804   // Also rebuild index if DocumentStore::OptimizeInto hints to do so.
1805   // Likewise, if Index::Optimize failed, then attempt to recover the index by
1806   // rebuilding from scratch.
1807   // If ShouldRebuildIndex() returns true, we will also rebuild the index for
1808   // better performance.
1809   if (should_rebuild_index) {
1810     optimize_stats->set_index_restoration_mode(
1811         OptimizeStatsProto::FULL_INDEX_REBUILD);
1812     ICING_LOG(WARNING) << "Clearing the entire index!";
1813 
1814     libtextclassifier3::Status index_clear_status = ClearAllIndices();
1815     if (!index_clear_status.ok()) {
1816       status = absl_ports::Annotate(
1817           absl_ports::InternalError("Failed to clear index."),
1818           index_clear_status.error_message());
1819       TransformStatus(status, result_status);
1820       optimize_stats->set_index_restoration_latency_ms(
1821           optimize_index_timer->GetElapsedMilliseconds());
1822       return result_proto;
1823     }
1824 
1825     IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
1826     // DATA_LOSS means that we have successfully re-added content to the index.
1827     // Some indexed content was lost, but otherwise the index is in a valid
1828     // state and can be queried.
1829     if (!index_restoration_status.status.ok() &&
1830         !absl_ports::IsDataLoss(index_restoration_status.status)) {
1831       status = absl_ports::Annotate(
1832           absl_ports::InternalError(
1833               "Failed to reindex documents after optimization."),
1834           index_restoration_status.status.error_message());
1835 
1836       TransformStatus(status, result_status);
1837       optimize_stats->set_index_restoration_latency_ms(
1838           optimize_index_timer->GetElapsedMilliseconds());
1839       return result_proto;
1840     }
1841   }
1842   optimize_stats->set_index_restoration_latency_ms(
1843       optimize_index_timer->GetElapsedMilliseconds());
1844 
1845   // Read the optimize status to get the time that we last ran.
1846   std::string optimize_status_filename =
1847       absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
1848   FileBackedProto<OptimizeStatusProto> optimize_status_file(
1849       *filesystem_, optimize_status_filename);
1850   auto optimize_status_or = optimize_status_file.Read();
1851   int64_t current_time = clock_->GetSystemTimeMilliseconds();
1852   if (optimize_status_or.ok()) {
1853     // If we have trouble reading the status or this is the first time that
1854     // we've ever run, don't set this field.
1855     optimize_stats->set_time_since_last_optimize_ms(
1856         current_time - optimize_status_or.ValueOrDie()
1857                            ->last_successful_optimize_run_time_ms());
1858   }
1859 
1860   // Update the status for this run and write it.
1861   auto optimize_status = std::make_unique<OptimizeStatusProto>();
1862   optimize_status->set_last_successful_optimize_run_time_ms(current_time);
1863   auto write_status = optimize_status_file.Write(std::move(optimize_status));
1864   if (!write_status.ok()) {
1865     ICING_LOG(ERROR) << "Failed to write optimize status:\n"
1866                      << write_status.error_message();
1867   }
1868 
1869   // Flushes data to disk after doing optimization
1870   status = InternalPersistToDisk(PersistType::FULL);
1871   if (!status.ok()) {
1872     TransformStatus(status, result_status);
1873     return result_proto;
1874   }
1875 
1876   int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1877   optimize_stats->set_storage_size_after(
1878       Filesystem::SanitizeFileSize(after_size));
1879 
1880   TransformStatus(doc_store_optimize_result_status, result_status);
1881   return result_proto;
1882 }
1883 
GetOptimizeInfo()1884 GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
1885   ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine";
1886 
1887   GetOptimizeInfoResultProto result_proto;
1888   StatusProto* result_status = result_proto.mutable_status();
1889 
1890   absl_ports::shared_lock l(&mutex_);
1891   if (!initialized_) {
1892     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1893     result_status->set_message("IcingSearchEngine has not been initialized!");
1894     return result_proto;
1895   }
1896 
1897   // Read the optimize status to get the time that we last ran.
1898   std::string optimize_status_filename =
1899       absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
1900   FileBackedProto<OptimizeStatusProto> optimize_status_file(
1901       *filesystem_, optimize_status_filename);
1902   auto optimize_status_or = optimize_status_file.Read();
1903   int64_t current_time = clock_->GetSystemTimeMilliseconds();
1904 
1905   if (optimize_status_or.ok()) {
1906     // If we have trouble reading the status or this is the first time that
1907     // we've ever run, don't set this field.
1908     result_proto.set_time_since_last_optimize_ms(
1909         current_time - optimize_status_or.ValueOrDie()
1910                            ->last_successful_optimize_run_time_ms());
1911   }
1912 
1913   // Get stats from DocumentStore
1914   auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
1915   if (!doc_store_optimize_info_or.ok()) {
1916     TransformStatus(doc_store_optimize_info_or.status(), result_status);
1917     return result_proto;
1918   }
1919   DocumentStore::OptimizeInfo doc_store_optimize_info =
1920       doc_store_optimize_info_or.ValueOrDie();
1921   result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs);
1922 
1923   if (doc_store_optimize_info.optimizable_docs == 0) {
1924     // Can return early since there's nothing to calculate on the index side
1925     result_proto.set_estimated_optimizable_bytes(0);
1926     result_status->set_code(StatusProto::OK);
1927     return result_proto;
1928   }
1929 
1930   // Get stats from Index.
1931   auto index_elements_size_or = index_->GetElementsSize();
1932   if (!index_elements_size_or.ok()) {
1933     TransformStatus(index_elements_size_or.status(), result_status);
1934     return result_proto;
1935   }
1936   int64_t index_elements_size = index_elements_size_or.ValueOrDie();
1937 
1938   // TODO(b/259744228): add stats for integer index
1939 
1940   // Sum up the optimizable sizes from DocumentStore and Index
1941   result_proto.set_estimated_optimizable_bytes(
1942       index_elements_size * doc_store_optimize_info.optimizable_docs /
1943           doc_store_optimize_info.total_docs +
1944       doc_store_optimize_info.estimated_optimizable_bytes);
1945 
1946   result_status->set_code(StatusProto::OK);
1947   return result_proto;
1948 }
1949 
GetStorageInfo()1950 StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
1951   StorageInfoResultProto result;
1952   absl_ports::shared_lock l(&mutex_);
1953   if (!initialized_) {
1954     result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
1955     result.mutable_status()->set_message(
1956         "IcingSearchEngine has not been initialized!");
1957     return result;
1958   }
1959 
1960   int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1961   result.mutable_storage_info()->set_total_storage_size(
1962       Filesystem::SanitizeFileSize(index_size));
1963   *result.mutable_storage_info()->mutable_document_storage_info() =
1964       document_store_->GetStorageInfo();
1965   *result.mutable_storage_info()->mutable_schema_store_storage_info() =
1966       schema_store_->GetStorageInfo();
1967   *result.mutable_storage_info()->mutable_index_storage_info() =
1968       index_->GetStorageInfo();
1969   // TODO(b/259744228): add stats for integer index
1970   result.mutable_status()->set_code(StatusProto::OK);
1971   return result;
1972 }
1973 
GetDebugInfo(DebugInfoVerbosity::Code verbosity)1974 DebugInfoResultProto IcingSearchEngine::GetDebugInfo(
1975     DebugInfoVerbosity::Code verbosity) {
1976   DebugInfoResultProto debug_info;
1977   StatusProto* result_status = debug_info.mutable_status();
1978   absl_ports::shared_lock l(&mutex_);
1979   if (!initialized_) {
1980     debug_info.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
1981     debug_info.mutable_status()->set_message(
1982         "IcingSearchEngine has not been initialized!");
1983     return debug_info;
1984   }
1985 
1986   // Index
1987   *debug_info.mutable_debug_info()->mutable_index_info() =
1988       index_->GetDebugInfo(verbosity);
1989 
1990   // TODO(b/259744228): add debug info for integer index
1991 
1992   // Document Store
1993   libtextclassifier3::StatusOr<DocumentDebugInfoProto> document_debug_info =
1994       document_store_->GetDebugInfo(verbosity);
1995   if (!document_debug_info.ok()) {
1996     TransformStatus(document_debug_info.status(), result_status);
1997     return debug_info;
1998   }
1999   *debug_info.mutable_debug_info()->mutable_document_info() =
2000       std::move(document_debug_info).ValueOrDie();
2001 
2002   // Schema Store
2003   libtextclassifier3::StatusOr<SchemaDebugInfoProto> schema_debug_info =
2004       schema_store_->GetDebugInfo();
2005   if (!schema_debug_info.ok()) {
2006     TransformStatus(schema_debug_info.status(), result_status);
2007     return debug_info;
2008   }
2009   *debug_info.mutable_debug_info()->mutable_schema_info() =
2010       std::move(schema_debug_info).ValueOrDie();
2011 
2012   result_status->set_code(StatusProto::OK);
2013   return debug_info;
2014 }
2015 
InternalPersistToDisk(PersistType::Code persist_type)2016 libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk(
2017     PersistType::Code persist_type) {
2018   if (persist_type == PersistType::LITE) {
2019     return document_store_->PersistToDisk(persist_type);
2020   }
2021   ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
2022   ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(PersistType::FULL));
2023   ICING_RETURN_IF_ERROR(index_->PersistToDisk());
2024   ICING_RETURN_IF_ERROR(integer_index_->PersistToDisk());
2025   ICING_RETURN_IF_ERROR(qualified_id_join_index_->PersistToDisk());
2026   ICING_RETURN_IF_ERROR(embedding_index_->PersistToDisk());
2027 
2028   return libtextclassifier3::Status::OK;
2029 }
2030 
Search(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2031 SearchResultProto IcingSearchEngine::Search(
2032     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2033     const ResultSpecProto& result_spec) {
2034   if (search_spec.use_read_only_search()) {
2035     return SearchLockedShared(search_spec, scoring_spec, result_spec);
2036   } else {
2037     return SearchLockedExclusive(search_spec, scoring_spec, result_spec);
2038   }
2039 }
2040 
SearchLockedShared(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2041 SearchResultProto IcingSearchEngine::SearchLockedShared(
2042     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2043     const ResultSpecProto& result_spec) {
2044   std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2045 
2046   // Only acquire an overall read-lock for this implementation. Finer-grained
2047   // locks are implemented around code paths that write changes to Icing's data
2048   // members.
2049   absl_ports::shared_lock l(&mutex_);
2050   int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2051 
2052   SearchResultProto result_proto =
2053       InternalSearch(search_spec, scoring_spec, result_spec);
2054 
2055   result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2056       lock_acquisition_latency);
2057   result_proto.mutable_query_stats()->set_latency_ms(
2058       overall_timer->GetElapsedMilliseconds());
2059   return result_proto;
2060 }
2061 
SearchLockedExclusive(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2062 SearchResultProto IcingSearchEngine::SearchLockedExclusive(
2063     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2064     const ResultSpecProto& result_spec) {
2065   std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2066 
2067   // Acquire the overall write-lock for this locked implementation.
2068   absl_ports::unique_lock l(&mutex_);
2069   int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2070 
2071   SearchResultProto result_proto =
2072       InternalSearch(search_spec, scoring_spec, result_spec);
2073 
2074   result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2075       lock_acquisition_latency);
2076   result_proto.mutable_query_stats()->set_latency_ms(
2077       overall_timer->GetElapsedMilliseconds());
2078   return result_proto;
2079 }
2080 
InternalSearch(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2081 SearchResultProto IcingSearchEngine::InternalSearch(
2082     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2083     const ResultSpecProto& result_spec) {
2084   SearchResultProto result_proto;
2085   StatusProto* result_status = result_proto.mutable_status();
2086 
2087   QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2088   query_stats->set_is_first_page(true);
2089   query_stats->set_requested_page_size(result_spec.num_per_page());
2090 
2091   // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2092   query_stats->set_num_namespaces_filtered(
2093       search_spec.namespace_filters_size());
2094   query_stats->set_num_schema_types_filtered(
2095       search_spec.schema_type_filters_size());
2096   query_stats->set_query_length(search_spec.query().length());
2097   query_stats->set_ranking_strategy(scoring_spec.rank_by());
2098 
2099   if (!initialized_) {
2100     result_status->set_code(StatusProto::FAILED_PRECONDITION);
2101     result_status->set_message("IcingSearchEngine has not been initialized!");
2102     return result_proto;
2103   }
2104   index_->PublishQueryStats(query_stats);
2105 
2106   libtextclassifier3::Status status =
2107       ValidateResultSpec(document_store_.get(), result_spec);
2108   if (!status.ok()) {
2109     TransformStatus(status, result_status);
2110     return result_proto;
2111   }
2112   status = ValidateSearchSpec(search_spec, performance_configuration_);
2113   if (!status.ok()) {
2114     TransformStatus(status, result_status);
2115     return result_proto;
2116   }
2117 
2118   const JoinSpecProto& join_spec = search_spec.join_spec();
2119   std::unique_ptr<JoinChildrenFetcher> join_children_fetcher;
2120   std::unique_ptr<ResultAdjustmentInfo> child_result_adjustment_info;
2121   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2122   if (!join_spec.parent_property_expression().empty() &&
2123       !join_spec.child_property_expression().empty()) {
2124     query_stats->set_is_join_query(true);
2125     QueryStatsProto::SearchStats* child_search_stats =
2126         query_stats->mutable_child_search_stats();
2127 
2128     // Process child query
2129     QueryScoringResults nested_query_scoring_results = ProcessQueryAndScore(
2130         join_spec.nested_spec().search_spec(),
2131         join_spec.nested_spec().scoring_spec(),
2132         join_spec.nested_spec().result_spec(),
2133         /*join_children_fetcher=*/nullptr, current_time_ms, child_search_stats);
2134     if (!nested_query_scoring_results.status.ok()) {
2135       TransformStatus(nested_query_scoring_results.status, result_status);
2136       return result_proto;
2137     }
2138 
2139     JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2140                                  qualified_id_join_index_.get(),
2141                                  current_time_ms);
2142     // Building a JoinChildrenFetcher where child documents are grouped by
2143     // their joinable values.
2144     libtextclassifier3::StatusOr<JoinChildrenFetcher> join_children_fetcher_or =
2145         join_processor.GetChildrenFetcher(
2146             search_spec.join_spec(),
2147             std::move(nested_query_scoring_results.scored_document_hits));
2148     if (!join_children_fetcher_or.ok()) {
2149       TransformStatus(join_children_fetcher_or.status(), result_status);
2150       return result_proto;
2151     }
2152     join_children_fetcher = std::make_unique<JoinChildrenFetcher>(
2153         std::move(join_children_fetcher_or).ValueOrDie());
2154 
2155     // Assign child's ResultAdjustmentInfo.
2156     child_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2157         join_spec.nested_spec().search_spec(),
2158         join_spec.nested_spec().scoring_spec(),
2159         join_spec.nested_spec().result_spec(), schema_store_.get(),
2160         std::move(nested_query_scoring_results.query_terms));
2161   }
2162 
2163   // Process parent query
2164   QueryStatsProto::SearchStats* parent_search_stats =
2165       query_stats->mutable_parent_search_stats();
2166   QueryScoringResults query_scoring_results = ProcessQueryAndScore(
2167       search_spec, scoring_spec, result_spec, join_children_fetcher.get(),
2168       current_time_ms, parent_search_stats);
2169   // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2170   query_stats->set_num_terms(parent_search_stats->num_terms());
2171   query_stats->set_parse_query_latency_ms(
2172       parent_search_stats->parse_query_latency_ms());
2173   query_stats->set_scoring_latency_ms(
2174       parent_search_stats->scoring_latency_ms());
2175   query_stats->set_num_documents_scored(
2176       parent_search_stats->num_documents_scored());
2177   if (!query_scoring_results.status.ok()) {
2178     TransformStatus(query_scoring_results.status, result_status);
2179     return result_proto;
2180   }
2181 
2182   // Returns early for empty result
2183   if (query_scoring_results.scored_document_hits.empty()) {
2184     result_status->set_code(StatusProto::OK);
2185     return result_proto;
2186   }
2187 
2188   // Construct parent's result adjustment info.
2189   auto parent_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2190       search_spec, scoring_spec, result_spec, schema_store_.get(),
2191       std::move(query_scoring_results.query_terms));
2192 
2193   std::unique_ptr<ScoredDocumentHitsRanker> ranker;
2194   if (join_children_fetcher != nullptr) {
2195     std::unique_ptr<Timer> join_timer = clock_->GetNewTimer();
2196     // Join 2 scored document hits
2197     JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2198                                  qualified_id_join_index_.get(),
2199                                  current_time_ms);
2200     libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
2201         joined_result_document_hits_or = join_processor.Join(
2202             join_spec, std::move(query_scoring_results.scored_document_hits),
2203             *join_children_fetcher);
2204     if (!joined_result_document_hits_or.ok()) {
2205       TransformStatus(joined_result_document_hits_or.status(), result_status);
2206       return result_proto;
2207     }
2208     std::vector<JoinedScoredDocumentHit> joined_result_document_hits =
2209         std::move(joined_result_document_hits_or).ValueOrDie();
2210 
2211     query_stats->set_join_latency_ms(join_timer->GetElapsedMilliseconds());
2212 
2213     std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2214     // Ranks results
2215     ranker = std::make_unique<
2216         PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
2217         std::move(joined_result_document_hits),
2218         /*is_descending=*/scoring_spec.order_by() ==
2219             ScoringSpecProto::Order::DESC);
2220     query_stats->set_ranking_latency_ms(
2221         component_timer->GetElapsedMilliseconds());
2222   } else {
2223     // Non-join query
2224     std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2225     // Ranks results
2226     ranker = std::make_unique<
2227         PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
2228         std::move(query_scoring_results.scored_document_hits),
2229         /*is_descending=*/scoring_spec.order_by() ==
2230             ScoringSpecProto::Order::DESC);
2231     query_stats->set_ranking_latency_ms(
2232         component_timer->GetElapsedMilliseconds());
2233   }
2234 
2235   std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2236   // CacheAndRetrieveFirstPage and retrieves the document protos and snippets if
2237   // requested
2238   auto result_retriever_or =
2239       ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2240                                 language_segmenter_.get(), normalizer_.get());
2241   if (!result_retriever_or.ok()) {
2242     TransformStatus(result_retriever_or.status(), result_status);
2243     query_stats->set_document_retrieval_latency_ms(
2244         component_timer->GetElapsedMilliseconds());
2245     return result_proto;
2246   }
2247   std::unique_ptr<ResultRetrieverV2> result_retriever =
2248       std::move(result_retriever_or).ValueOrDie();
2249 
2250   libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2251       page_result_info_or = result_state_manager_->CacheAndRetrieveFirstPage(
2252           std::move(ranker), std::move(parent_result_adjustment_info),
2253           std::move(child_result_adjustment_info), result_spec,
2254           *document_store_, *result_retriever, current_time_ms);
2255   if (!page_result_info_or.ok()) {
2256     TransformStatus(page_result_info_or.status(), result_status);
2257     query_stats->set_document_retrieval_latency_ms(
2258         component_timer->GetElapsedMilliseconds());
2259     return result_proto;
2260   }
2261   std::pair<uint64_t, PageResult> page_result_info =
2262       std::move(page_result_info_or).ValueOrDie();
2263 
2264   // Assembles the final search result proto
2265   result_proto.mutable_results()->Reserve(
2266       page_result_info.second.results.size());
2267 
2268   int32_t child_count = 0;
2269   for (SearchResultProto::ResultProto& result :
2270        page_result_info.second.results) {
2271     child_count += result.joined_results_size();
2272     result_proto.mutable_results()->Add(std::move(result));
2273   }
2274 
2275   result_status->set_code(StatusProto::OK);
2276   if (page_result_info.first != kInvalidNextPageToken) {
2277     result_proto.set_next_page_token(page_result_info.first);
2278   }
2279 
2280   query_stats->set_document_retrieval_latency_ms(
2281       component_timer->GetElapsedMilliseconds());
2282   query_stats->set_num_results_returned_current_page(
2283       result_proto.results_size());
2284 
2285   query_stats->set_num_joined_results_returned_current_page(child_count);
2286 
2287   query_stats->set_num_results_with_snippets(
2288       page_result_info.second.num_results_with_snippets);
2289   return result_proto;
2290 }
2291 
ProcessQueryAndScore(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec,const JoinChildrenFetcher * join_children_fetcher,int64_t current_time_ms,QueryStatsProto::SearchStats * search_stats)2292 IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore(
2293     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2294     const ResultSpecProto& result_spec,
2295     const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
2296     QueryStatsProto::SearchStats* search_stats) {
2297   search_stats->set_num_namespaces_filtered(
2298       search_spec.namespace_filters_size());
2299   search_stats->set_num_schema_types_filtered(
2300       search_spec.schema_type_filters_size());
2301   search_stats->set_query_length(search_spec.query().length());
2302   search_stats->set_ranking_strategy(scoring_spec.rank_by());
2303 
2304   std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2305 
2306   // Gets unordered results from query processor
2307   auto query_processor_or = QueryProcessor::Create(
2308       index_.get(), integer_index_.get(), embedding_index_.get(),
2309       language_segmenter_.get(), normalizer_.get(), document_store_.get(),
2310       schema_store_.get(), clock_.get());
2311   if (!query_processor_or.ok()) {
2312     search_stats->set_parse_query_latency_ms(
2313         component_timer->GetElapsedMilliseconds());
2314     return QueryScoringResults(std::move(query_processor_or).status(),
2315                                /*query_terms_in=*/{},
2316                                /*scored_document_hits_in=*/{});
2317   }
2318   std::unique_ptr<QueryProcessor> query_processor =
2319       std::move(query_processor_or).ValueOrDie();
2320 
2321   auto ranking_strategy_or = GetRankingStrategyFromScoringSpec(scoring_spec);
2322   libtextclassifier3::StatusOr<QueryResults> query_results_or;
2323   if (ranking_strategy_or.ok()) {
2324     query_results_or = query_processor->ParseSearch(
2325         search_spec, ranking_strategy_or.ValueOrDie(), current_time_ms,
2326         search_stats);
2327   } else {
2328     query_results_or = ranking_strategy_or.status();
2329   }
2330   search_stats->set_parse_query_latency_ms(
2331       component_timer->GetElapsedMilliseconds());
2332   if (!query_results_or.ok()) {
2333     return QueryScoringResults(std::move(query_results_or).status(),
2334                                /*query_terms_in=*/{},
2335                                /*scored_document_hits_in=*/{});
2336   }
2337   QueryResults query_results = std::move(query_results_or).ValueOrDie();
2338 
2339   // Set SearchStats related to QueryResults.
2340   int term_count = 0;
2341   for (const auto& section_and_terms : query_results.query_terms) {
2342     term_count += section_and_terms.second.size();
2343   }
2344   search_stats->set_num_terms(term_count);
2345 
2346   if (query_results.features_in_use.count(kNumericSearchFeature)) {
2347     search_stats->set_is_numeric_query(true);
2348   }
2349 
2350   component_timer = clock_->GetNewTimer();
2351   // Scores but does not rank the results.
2352   libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
2353       scoring_processor_or = ScoringProcessor::Create(
2354           scoring_spec, /*default_semantic_metric_type=*/
2355           search_spec.embedding_query_metric_type(), document_store_.get(),
2356           schema_store_.get(), current_time_ms, join_children_fetcher,
2357           &query_results.embedding_query_results);
2358   if (!scoring_processor_or.ok()) {
2359     return QueryScoringResults(std::move(scoring_processor_or).status(),
2360                                std::move(query_results.query_terms),
2361                                /*scored_document_hits_in=*/{});
2362   }
2363   std::unique_ptr<ScoringProcessor> scoring_processor =
2364       std::move(scoring_processor_or).ValueOrDie();
2365   std::vector<ScoredDocumentHit> scored_document_hits =
2366       scoring_processor->Score(
2367           std::move(query_results.root_iterator), result_spec.num_to_score(),
2368           &query_results.query_term_iterators, search_stats);
2369   search_stats->set_scoring_latency_ms(
2370       component_timer->GetElapsedMilliseconds());
2371 
2372   return QueryScoringResults(libtextclassifier3::Status::OK,
2373                              std::move(query_results.query_terms),
2374                              std::move(scored_document_hits));
2375 }
2376 
GetNextPage(uint64_t next_page_token)2377 SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
2378   SearchResultProto result_proto;
2379   StatusProto* result_status = result_proto.mutable_status();
2380 
2381   QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2382   query_stats->set_is_first_page(false);
2383   std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2384   // ResultStateManager has its own writer lock, so here we only need a reader
2385   // lock for other components.
2386   absl_ports::shared_lock l(&mutex_);
2387   query_stats->set_lock_acquisition_latency_ms(
2388       overall_timer->GetElapsedMilliseconds());
2389   if (!initialized_) {
2390     result_status->set_code(StatusProto::FAILED_PRECONDITION);
2391     result_status->set_message("IcingSearchEngine has not been initialized!");
2392     return result_proto;
2393   }
2394 
2395   auto result_retriever_or =
2396       ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2397                                 language_segmenter_.get(), normalizer_.get());
2398   if (!result_retriever_or.ok()) {
2399     TransformStatus(result_retriever_or.status(), result_status);
2400     return result_proto;
2401   }
2402   std::unique_ptr<ResultRetrieverV2> result_retriever =
2403       std::move(result_retriever_or).ValueOrDie();
2404 
2405   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2406   libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2407       page_result_info_or = result_state_manager_->GetNextPage(
2408           next_page_token, *result_retriever, current_time_ms);
2409   if (!page_result_info_or.ok()) {
2410     if (absl_ports::IsNotFound(page_result_info_or.status())) {
2411       // NOT_FOUND means an empty result.
2412       result_status->set_code(StatusProto::OK);
2413     } else {
2414       // Real error, pass up.
2415       TransformStatus(page_result_info_or.status(), result_status);
2416     }
2417     return result_proto;
2418   }
2419 
2420   std::pair<uint64_t, PageResult> page_result_info =
2421       std::move(page_result_info_or).ValueOrDie();
2422   query_stats->set_requested_page_size(
2423       page_result_info.second.requested_page_size);
2424 
2425   // Assembles the final search result proto
2426   result_proto.mutable_results()->Reserve(
2427       page_result_info.second.results.size());
2428 
2429   int32_t child_count = 0;
2430   for (SearchResultProto::ResultProto& result :
2431        page_result_info.second.results) {
2432     child_count += result.joined_results_size();
2433     result_proto.mutable_results()->Add(std::move(result));
2434   }
2435 
2436   result_status->set_code(StatusProto::OK);
2437   if (page_result_info.first != kInvalidNextPageToken) {
2438     result_proto.set_next_page_token(page_result_info.first);
2439   }
2440 
2441   // The only thing that we're doing is document retrieval. So document
2442   // retrieval latency and overall latency are the same and can use the same
2443   // timer.
2444   query_stats->set_document_retrieval_latency_ms(
2445       overall_timer->GetElapsedMilliseconds());
2446   query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
2447   query_stats->set_num_results_returned_current_page(
2448       result_proto.results_size());
2449   query_stats->set_num_results_with_snippets(
2450       page_result_info.second.num_results_with_snippets);
2451   query_stats->set_num_joined_results_returned_current_page(child_count);
2452 
2453   return result_proto;
2454 }
2455 
InvalidateNextPageToken(uint64_t next_page_token)2456 void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
2457   absl_ports::shared_lock l(&mutex_);
2458   if (!initialized_) {
2459     ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
2460     return;
2461   }
2462   result_state_manager_->InvalidateResultState(next_page_token);
2463 }
2464 
2465 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
OptimizeDocumentStore(OptimizeStatsProto * optimize_stats)2466 IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) {
2467   // Gets the current directory path and an empty tmp directory path for
2468   // document store optimization.
2469   const std::string current_document_dir =
2470       MakeDocumentDirectoryPath(options_.base_dir());
2471   const std::string temporary_document_dir =
2472       MakeDocumentTemporaryDirectoryPath(options_.base_dir());
2473   if (!filesystem_->DeleteDirectoryRecursively(
2474           temporary_document_dir.c_str()) ||
2475       !filesystem_->CreateDirectoryRecursively(
2476           temporary_document_dir.c_str())) {
2477     return absl_ports::AbortedError(absl_ports::StrCat(
2478         "Failed to create a tmp directory: ", temporary_document_dir));
2479   }
2480 
2481   // Copies valid document data to tmp directory
2482   libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
2483       optimize_result_or = document_store_->OptimizeInto(
2484           temporary_document_dir, language_segmenter_.get(), optimize_stats);
2485 
2486   // Handles error if any
2487   if (!optimize_result_or.ok()) {
2488     filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str());
2489     return absl_ports::Annotate(
2490         absl_ports::AbortedError("Failed to optimize document store"),
2491         optimize_result_or.status().error_message());
2492   }
2493 
2494   // result_state_manager_ depends on document_store_. So we need to reset it at
2495   // the same time that we reset the document_store_.
2496   result_state_manager_.reset();
2497   document_store_.reset();
2498 
2499   // When swapping files, always put the current working directory at the
2500   // second place because it is renamed at the latter position so we're less
2501   // vulnerable to errors.
2502   if (!filesystem_->SwapFiles(temporary_document_dir.c_str(),
2503                               current_document_dir.c_str())) {
2504     ICING_LOG(ERROR) << "Failed to swap files";
2505 
2506     // Ensures that current directory is still present.
2507     if (!filesystem_->CreateDirectoryRecursively(
2508             current_document_dir.c_str())) {
2509       // Can't even create the old directory. Mark as uninitialized and return
2510       // INTERNAL.
2511       initialized_ = false;
2512       return absl_ports::InternalError(
2513           "Failed to create file directory for document store");
2514     }
2515 
2516     // Tries to rebuild document store if swapping fails, to avoid leaving the
2517     // system in the broken state for future operations.
2518     auto create_result_or = DocumentStore::Create(
2519         filesystem_.get(), current_document_dir, clock_.get(),
2520         schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
2521         /*document_store_namespace_id_fingerprint=*/true,
2522         /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2523         options_.compression_level(), /*initialize_stats=*/nullptr);
2524     // TODO(b/144458732): Implement a more robust version of
2525     // TC_ASSIGN_OR_RETURN that can support error logging.
2526     if (!create_result_or.ok()) {
2527       // Unable to create DocumentStore from the old file. Mark as uninitialized
2528       // and return INTERNAL.
2529       initialized_ = false;
2530       ICING_LOG(ERROR) << "Failed to create document store instance";
2531       return absl_ports::Annotate(
2532           absl_ports::InternalError("Failed to create document store instance"),
2533           create_result_or.status().error_message());
2534     }
2535     document_store_ = std::move(create_result_or.ValueOrDie().document_store);
2536     result_state_manager_ = std::make_unique<ResultStateManager>(
2537         performance_configuration_.max_num_total_hits, *document_store_);
2538 
2539     // Potential data loss
2540     // TODO(b/147373249): Find a way to detect true data loss error
2541     return absl_ports::DataLossError(
2542         "Failed to optimize document store, there might be data loss");
2543   }
2544 
2545   // Recreates the doc store instance
2546   auto create_result_or = DocumentStore::Create(
2547       filesystem_.get(), current_document_dir, clock_.get(),
2548       schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
2549       /*document_store_namespace_id_fingerprint=*/true,
2550       /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2551       options_.compression_level(), /*initialize_stats=*/nullptr);
2552   if (!create_result_or.ok()) {
2553     // Unable to create DocumentStore from the new file. Mark as uninitialized
2554     // and return INTERNAL.
2555     initialized_ = false;
2556     return absl_ports::InternalError(
2557         "Document store has been optimized, but a valid document store "
2558         "instance can't be created");
2559   }
2560   DocumentStore::CreateResult create_result =
2561       std::move(create_result_or).ValueOrDie();
2562   document_store_ = std::move(create_result.document_store);
2563   result_state_manager_ = std::make_unique<ResultStateManager>(
2564       performance_configuration_.max_num_total_hits, *document_store_);
2565 
2566   // Deletes tmp directory
2567   if (!filesystem_->DeleteDirectoryRecursively(
2568           temporary_document_dir.c_str())) {
2569     ICING_LOG(ERROR) << "Document store has been optimized, but it failed to "
2570                         "delete temporary file directory";
2571   }
2572 
2573   // Since we created new (optimized) document store with correct PersistToDisk
2574   // call, we shouldn't have data loss or regenerate derived files. Therefore,
2575   // if we really encounter any of these situations, then return DataLossError
2576   // to let the caller rebuild index.
2577   if (create_result.data_loss != DataLoss::NONE ||
2578       create_result.derived_files_regenerated) {
2579     return absl_ports::DataLossError(
2580         "Unexpected data loss or derived files regenerated for new document "
2581         "store");
2582   }
2583 
2584   return optimize_result_or;
2585 }
2586 
2587 IcingSearchEngine::IndexRestorationResult
RestoreIndexIfNeeded()2588 IcingSearchEngine::RestoreIndexIfNeeded() {
2589   DocumentId last_stored_document_id =
2590       document_store_->last_added_document_id();
2591   if (last_stored_document_id == index_->last_added_document_id() &&
2592       last_stored_document_id == integer_index_->last_added_document_id() &&
2593       last_stored_document_id ==
2594           qualified_id_join_index_->last_added_document_id() &&
2595       last_stored_document_id == embedding_index_->last_added_document_id()) {
2596     // No need to recover.
2597     return {libtextclassifier3::Status::OK, false, false, false, false};
2598   }
2599 
2600   if (last_stored_document_id == kInvalidDocumentId) {
2601     // Document store is empty but index is not. Clear the index.
2602     return {ClearAllIndices(), false, false, false, false};
2603   }
2604 
2605   // Truncate indices first.
2606   auto truncate_result_or = TruncateIndicesTo(last_stored_document_id);
2607   if (!truncate_result_or.ok()) {
2608     return {std::move(truncate_result_or).status(), false, false, false, false};
2609   }
2610   TruncateIndexResult truncate_result =
2611       std::move(truncate_result_or).ValueOrDie();
2612 
2613   if (truncate_result.first_document_to_reindex > last_stored_document_id) {
2614     // Nothing to restore. Just return.
2615     return {libtextclassifier3::Status::OK, false, false, false, false};
2616   }
2617 
2618   auto data_indexing_handlers_or = CreateDataIndexingHandlers();
2619   if (!data_indexing_handlers_or.ok()) {
2620     return {data_indexing_handlers_or.status(),
2621             truncate_result.index_needed_restoration,
2622             truncate_result.integer_index_needed_restoration,
2623             truncate_result.qualified_id_join_index_needed_restoration,
2624             truncate_result.embedding_index_needed_restoration};
2625   }
2626   // By using recovery_mode for IndexProcessor, we're able to replay documents
2627   // from smaller document id and it will skip documents that are already been
2628   // indexed.
2629   IndexProcessor index_processor(
2630       std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(),
2631       /*recovery_mode=*/true);
2632 
2633   ICING_VLOG(1) << "Restoring index by replaying documents from document id "
2634                 << truncate_result.first_document_to_reindex
2635                 << " to document id " << last_stored_document_id;
2636   libtextclassifier3::Status overall_status;
2637   for (DocumentId document_id = truncate_result.first_document_to_reindex;
2638        document_id <= last_stored_document_id; ++document_id) {
2639     libtextclassifier3::StatusOr<DocumentProto> document_or =
2640         document_store_->Get(document_id);
2641 
2642     if (!document_or.ok()) {
2643       if (absl_ports::IsInvalidArgument(document_or.status()) ||
2644           absl_ports::IsNotFound(document_or.status())) {
2645         // Skips invalid and non-existing documents.
2646         continue;
2647       } else {
2648         // Returns other errors
2649         return {document_or.status(), truncate_result.index_needed_restoration,
2650                 truncate_result.integer_index_needed_restoration,
2651                 truncate_result.qualified_id_join_index_needed_restoration,
2652                 truncate_result.embedding_index_needed_restoration};
2653       }
2654     }
2655     DocumentProto document(std::move(document_or).ValueOrDie());
2656 
2657     libtextclassifier3::StatusOr<TokenizedDocument> tokenized_document_or =
2658         TokenizedDocument::Create(schema_store_.get(),
2659                                   language_segmenter_.get(),
2660                                   std::move(document));
2661     if (!tokenized_document_or.ok()) {
2662       return {tokenized_document_or.status(),
2663               truncate_result.index_needed_restoration,
2664               truncate_result.integer_index_needed_restoration,
2665               truncate_result.qualified_id_join_index_needed_restoration,
2666               truncate_result.embedding_index_needed_restoration};
2667     }
2668     TokenizedDocument tokenized_document(
2669         std::move(tokenized_document_or).ValueOrDie());
2670 
2671     libtextclassifier3::Status status =
2672         index_processor.IndexDocument(tokenized_document, document_id);
2673     if (!status.ok()) {
2674       if (!absl_ports::IsDataLoss(status)) {
2675         // Real error. Stop recovering and pass it up.
2676         return {status, truncate_result.index_needed_restoration,
2677                 truncate_result.integer_index_needed_restoration,
2678                 truncate_result.qualified_id_join_index_needed_restoration,
2679                 truncate_result.embedding_index_needed_restoration};
2680       }
2681       // FIXME: why can we skip data loss error here?
2682       // Just a data loss. Keep trying to add the remaining docs, but report the
2683       // data loss when we're done.
2684       overall_status = status;
2685     }
2686   }
2687 
2688   return {overall_status, truncate_result.index_needed_restoration,
2689           truncate_result.integer_index_needed_restoration,
2690           truncate_result.qualified_id_join_index_needed_restoration,
2691           truncate_result.embedding_index_needed_restoration};
2692 }
2693 
LostPreviousSchema()2694 libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
2695   auto status_or = schema_store_->GetSchema();
2696   if (status_or.ok()) {
2697     // Found a schema.
2698     return false;
2699   }
2700 
2701   if (!absl_ports::IsNotFound(status_or.status())) {
2702     // Any other type of error
2703     return status_or.status();
2704   }
2705 
2706   // We know: We don't have a schema now.
2707   //
2708   // We know: If no documents have been added, then the last_added_document_id
2709   // will be invalid.
2710   //
2711   // So: If documents have been added before and we don't have a schema now,
2712   // then that means we must have had a schema at some point. Since we wouldn't
2713   // accept documents without a schema to validate them against.
2714   return document_store_->last_added_document_id() != kInvalidDocumentId;
2715 }
2716 
2717 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
CreateDataIndexingHandlers()2718 IcingSearchEngine::CreateDataIndexingHandlers() {
2719   std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
2720 
2721   // Term index handler
2722   ICING_ASSIGN_OR_RETURN(
2723       std::unique_ptr<TermIndexingHandler> term_indexing_handler,
2724       TermIndexingHandler::Create(
2725           clock_.get(), normalizer_.get(), index_.get(),
2726           options_.build_property_existence_metadata_hits()));
2727   handlers.push_back(std::move(term_indexing_handler));
2728 
2729   // Integer index handler
2730   ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler>
2731                              integer_section_indexing_handler,
2732                          IntegerSectionIndexingHandler::Create(
2733                              clock_.get(), integer_index_.get()));
2734   handlers.push_back(std::move(integer_section_indexing_handler));
2735 
2736   // Qualified id join index handler
2737   ICING_ASSIGN_OR_RETURN(
2738       std::unique_ptr<QualifiedIdJoinIndexingHandler>
2739           qualified_id_join_indexing_handler,
2740       QualifiedIdJoinIndexingHandler::Create(
2741           clock_.get(), document_store_.get(), qualified_id_join_index_.get()));
2742   handlers.push_back(std::move(qualified_id_join_indexing_handler));
2743 
2744   // Embedding index handler
2745   ICING_ASSIGN_OR_RETURN(
2746       std::unique_ptr<EmbeddingIndexingHandler> embedding_indexing_handler,
2747       EmbeddingIndexingHandler::Create(clock_.get(), embedding_index_.get()));
2748   handlers.push_back(std::move(embedding_indexing_handler));
2749   return handlers;
2750 }
2751 
2752 libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult>
TruncateIndicesTo(DocumentId last_stored_document_id)2753 IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) {
2754   // Attempt to truncate term index.
2755   // TruncateTo ensures that the index does not hold any data that is not
2756   // present in the ground truth. If the document store lost some documents,
2757   // TruncateTo will ensure that the index does not contain any hits from those
2758   // lost documents. If the index does not contain any hits for documents with
2759   // document id greater than last_stored_document_id, then TruncateTo will have
2760   // no effect.
2761   ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id));
2762 
2763   // Get last indexed document id for term index after truncating.
2764   DocumentId term_index_last_added_document_id =
2765       index_->last_added_document_id();
2766   DocumentId first_document_to_reindex =
2767       (term_index_last_added_document_id != kInvalidDocumentId)
2768           ? term_index_last_added_document_id + 1
2769           : kMinDocumentId;
2770   bool index_needed_restoration =
2771       (last_stored_document_id != term_index_last_added_document_id);
2772 
2773   // Attempt to truncate integer index.
2774   bool integer_index_needed_restoration = false;
2775   DocumentId integer_index_last_added_document_id =
2776       integer_index_->last_added_document_id();
2777   if (integer_index_last_added_document_id == kInvalidDocumentId ||
2778       last_stored_document_id > integer_index_last_added_document_id) {
2779     // If last_stored_document_id is greater than
2780     // integer_index_last_added_document_id, then we only have to replay docs
2781     // starting from integer_index_last_added_document_id + 1. Also use std::min
2782     // since we might need to replay even smaller doc ids for term index.
2783     integer_index_needed_restoration = true;
2784     if (integer_index_last_added_document_id != kInvalidDocumentId) {
2785       first_document_to_reindex = std::min(
2786           first_document_to_reindex, integer_index_last_added_document_id + 1);
2787     } else {
2788       first_document_to_reindex = kMinDocumentId;
2789     }
2790   } else if (last_stored_document_id < integer_index_last_added_document_id) {
2791     // Clear the entire integer index if last_stored_document_id is smaller than
2792     // integer_index_last_added_document_id, because there is no way to remove
2793     // data with doc_id > last_stored_document_id from integer index and we have
2794     // to rebuild.
2795     ICING_RETURN_IF_ERROR(integer_index_->Clear());
2796 
2797     // Since the entire integer index is discarded, we start to rebuild it by
2798     // setting first_document_to_reindex to kMinDocumentId.
2799     integer_index_needed_restoration = true;
2800     first_document_to_reindex = kMinDocumentId;
2801   }
2802 
2803   // Attempt to truncate qualified id join index
2804   bool qualified_id_join_index_needed_restoration = false;
2805   DocumentId qualified_id_join_index_last_added_document_id =
2806       qualified_id_join_index_->last_added_document_id();
2807   if (qualified_id_join_index_last_added_document_id == kInvalidDocumentId ||
2808       last_stored_document_id >
2809           qualified_id_join_index_last_added_document_id) {
2810     // If last_stored_document_id is greater than
2811     // qualified_id_join_index_last_added_document_id, then we only have to
2812     // replay docs starting from (qualified_id_join_index_last_added_document_id
2813     // + 1). Also use std::min since we might need to replay even smaller doc
2814     // ids for other components.
2815     qualified_id_join_index_needed_restoration = true;
2816     if (qualified_id_join_index_last_added_document_id != kInvalidDocumentId) {
2817       first_document_to_reindex =
2818           std::min(first_document_to_reindex,
2819                    qualified_id_join_index_last_added_document_id + 1);
2820     } else {
2821       first_document_to_reindex = kMinDocumentId;
2822     }
2823   } else if (last_stored_document_id <
2824              qualified_id_join_index_last_added_document_id) {
2825     // Clear the entire qualified id join index if last_stored_document_id is
2826     // smaller than qualified_id_join_index_last_added_document_id, because
2827     // there is no way to remove data with doc_id > last_stored_document_id from
2828     // join index efficiently and we have to rebuild.
2829     ICING_RETURN_IF_ERROR(qualified_id_join_index_->Clear());
2830 
2831     // Since the entire qualified id join index is discarded, we start to
2832     // rebuild it by setting first_document_to_reindex to kMinDocumentId.
2833     qualified_id_join_index_needed_restoration = true;
2834     first_document_to_reindex = kMinDocumentId;
2835   }
2836 
2837   // Attempt to truncate embedding index
2838   bool embedding_index_needed_restoration = false;
2839   DocumentId embedding_index_last_added_document_id =
2840       embedding_index_->last_added_document_id();
2841   if (embedding_index_last_added_document_id == kInvalidDocumentId ||
2842       last_stored_document_id > embedding_index_last_added_document_id) {
2843     // If last_stored_document_id is greater than
2844     // embedding_index_last_added_document_id, then we only have to replay docs
2845     // starting from (embedding_index_last_added_document_id + 1). Also use
2846     // std::min since we might need to replay even smaller doc ids for other
2847     // components.
2848     embedding_index_needed_restoration = true;
2849     if (embedding_index_last_added_document_id != kInvalidDocumentId) {
2850       first_document_to_reindex =
2851           std::min(first_document_to_reindex,
2852                    embedding_index_last_added_document_id + 1);
2853     } else {
2854       first_document_to_reindex = kMinDocumentId;
2855     }
2856   } else if (last_stored_document_id < embedding_index_last_added_document_id) {
2857     // Clear the entire embedding index if last_stored_document_id is
2858     // smaller than embedding_index_last_added_document_id, because
2859     // there is no way to remove data with doc_id > last_stored_document_id from
2860     // embedding index efficiently and we have to rebuild.
2861     ICING_RETURN_IF_ERROR(embedding_index_->Clear());
2862 
2863     // Since the entire embedding index is discarded, we start to
2864     // rebuild it by setting first_document_to_reindex to kMinDocumentId.
2865     embedding_index_needed_restoration = true;
2866     first_document_to_reindex = kMinDocumentId;
2867   }
2868 
2869   return TruncateIndexResult(first_document_to_reindex,
2870                              index_needed_restoration,
2871                              integer_index_needed_restoration,
2872                              qualified_id_join_index_needed_restoration,
2873                              embedding_index_needed_restoration);
2874 }
2875 
DiscardDerivedFiles(const version_util::DerivedFilesRebuildResult & rebuild_result)2876 libtextclassifier3::Status IcingSearchEngine::DiscardDerivedFiles(
2877     const version_util::DerivedFilesRebuildResult& rebuild_result) {
2878   if (!rebuild_result.IsRebuildNeeded()) {
2879     return libtextclassifier3::Status::OK;
2880   }
2881 
2882   if (schema_store_ != nullptr || document_store_ != nullptr ||
2883       index_ != nullptr || integer_index_ != nullptr ||
2884       qualified_id_join_index_ != nullptr || embedding_index_ != nullptr) {
2885     return absl_ports::FailedPreconditionError(
2886         "Cannot discard derived files while having valid instances");
2887   }
2888 
2889   // Schema store
2890   if (rebuild_result.needs_schema_store_derived_files_rebuild) {
2891     ICING_RETURN_IF_ERROR(SchemaStore::DiscardDerivedFiles(
2892         filesystem_.get(), options_.base_dir()));
2893   }
2894 
2895   // Document store
2896   if (rebuild_result.needs_document_store_derived_files_rebuild) {
2897     ICING_RETURN_IF_ERROR(DocumentStore::DiscardDerivedFiles(
2898         filesystem_.get(), options_.base_dir()));
2899   }
2900 
2901   // Term index
2902   if (rebuild_result.needs_term_index_rebuild) {
2903     if (!filesystem_->DeleteDirectoryRecursively(
2904             MakeIndexDirectoryPath(options_.base_dir()).c_str())) {
2905       return absl_ports::InternalError("Failed to discard index");
2906     }
2907   }
2908 
2909   // Integer index
2910   if (rebuild_result.needs_integer_index_rebuild) {
2911     if (!filesystem_->DeleteDirectoryRecursively(
2912             MakeIntegerIndexWorkingPath(options_.base_dir()).c_str())) {
2913       return absl_ports::InternalError("Failed to discard integer index");
2914     }
2915   }
2916 
2917   // Qualified id join index
2918   if (rebuild_result.needs_qualified_id_join_index_rebuild) {
2919     if (!filesystem_->DeleteDirectoryRecursively(
2920             MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()).c_str())) {
2921       return absl_ports::InternalError(
2922           "Failed to discard qualified id join index");
2923     }
2924   }
2925 
2926   // TODO(b/326656531): Update version-util to consider embedding index.
2927 
2928   return libtextclassifier3::Status::OK;
2929 }
2930 
ClearSearchIndices()2931 libtextclassifier3::Status IcingSearchEngine::ClearSearchIndices() {
2932   ICING_RETURN_IF_ERROR(index_->Reset());
2933   ICING_RETURN_IF_ERROR(integer_index_->Clear());
2934   ICING_RETURN_IF_ERROR(embedding_index_->Clear());
2935   return libtextclassifier3::Status::OK;
2936 }
2937 
ClearJoinIndices()2938 libtextclassifier3::Status IcingSearchEngine::ClearJoinIndices() {
2939   return qualified_id_join_index_->Clear();
2940 }
2941 
ClearAllIndices()2942 libtextclassifier3::Status IcingSearchEngine::ClearAllIndices() {
2943   ICING_RETURN_IF_ERROR(ClearSearchIndices());
2944   ICING_RETURN_IF_ERROR(ClearJoinIndices());
2945   return libtextclassifier3::Status::OK;
2946 }
2947 
Reset()2948 ResetResultProto IcingSearchEngine::Reset() {
2949   absl_ports::unique_lock l(&mutex_);
2950   return ResetInternal();
2951 }
2952 
ResetInternal()2953 ResetResultProto IcingSearchEngine::ResetInternal() {
2954   ICING_VLOG(1) << "Resetting IcingSearchEngine";
2955 
2956   ResetResultProto result_proto;
2957   StatusProto* result_status = result_proto.mutable_status();
2958 
2959   initialized_ = false;
2960   ResetMembers();
2961   if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
2962     result_status->set_code(StatusProto::INTERNAL);
2963     return result_proto;
2964   }
2965 
2966   if (InternalInitialize().status().code() != StatusProto::OK) {
2967     // We shouldn't hit the following Initialize errors:
2968     //   NOT_FOUND: all data was cleared, we aren't expecting anything
2969     //   DATA_LOSS: all data was cleared, we aren't expecting anything
2970     //   RESOURCE_EXHAUSTED: just deleted files, shouldn't run out of space
2971     //
2972     // We can't tell if Initialize failed and left Icing in an inconsistent
2973     // state or if it was a temporary I/O error. Group everything under INTERNAL
2974     // to be safe.
2975     //
2976     // TODO(b/147699081): Once Initialize returns the proper ABORTED/INTERNAL
2977     // status code, we can just propagate it up from here.
2978     result_status->set_code(StatusProto::INTERNAL);
2979     return result_proto;
2980   }
2981 
2982   result_status->set_code(StatusProto::OK);
2983   return result_proto;
2984 }
2985 
SearchSuggestions(const SuggestionSpecProto & suggestion_spec)2986 SuggestionResponse IcingSearchEngine::SearchSuggestions(
2987     const SuggestionSpecProto& suggestion_spec) {
2988   // TODO(b/146008613) Explore ideas to make this function read-only.
2989   absl_ports::unique_lock l(&mutex_);
2990   SuggestionResponse response;
2991   StatusProto* response_status = response.mutable_status();
2992   if (!initialized_) {
2993     response_status->set_code(StatusProto::FAILED_PRECONDITION);
2994     response_status->set_message("IcingSearchEngine has not been initialized!");
2995     return response;
2996   }
2997 
2998   libtextclassifier3::Status status =
2999       ValidateSuggestionSpec(suggestion_spec, performance_configuration_);
3000   if (!status.ok()) {
3001     TransformStatus(status, response_status);
3002     return response;
3003   }
3004 
3005   // Create the suggestion processor.
3006   auto suggestion_processor_or = SuggestionProcessor::Create(
3007       index_.get(), integer_index_.get(), embedding_index_.get(),
3008       language_segmenter_.get(), normalizer_.get(), document_store_.get(),
3009       schema_store_.get(), clock_.get());
3010   if (!suggestion_processor_or.ok()) {
3011     TransformStatus(suggestion_processor_or.status(), response_status);
3012     return response;
3013   }
3014   std::unique_ptr<SuggestionProcessor> suggestion_processor =
3015       std::move(suggestion_processor_or).ValueOrDie();
3016 
3017   // Run suggestion based on given SuggestionSpec.
3018   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
3019   libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
3020       suggestion_processor->QuerySuggestions(suggestion_spec, current_time_ms);
3021   if (!terms_or.ok()) {
3022     TransformStatus(terms_or.status(), response_status);
3023     return response;
3024   }
3025 
3026   // Convert vector<TermMetaData> into final SuggestionResponse proto.
3027   for (TermMetadata& term : terms_or.ValueOrDie()) {
3028     SuggestionResponse::Suggestion suggestion;
3029     suggestion.set_query(std::move(term.content));
3030     response.mutable_suggestions()->Add(std::move(suggestion));
3031   }
3032   response_status->set_code(StatusProto::OK);
3033   return response;
3034 }
3035 
3036 }  // namespace lib
3037 }  // namespace icing
3038