• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_ICING_SEARCH_ENGINE_H_
16 #define ICING_ICING_SEARCH_ENGINE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/mutex.h"
27 #include "icing/absl_ports/thread_annotations.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/file/version-util.h"
30 #include "icing/index/data-indexing-handler.h"
31 #include "icing/index/embed/embedding-index.h"
32 #include "icing/index/index.h"
33 #include "icing/index/numeric/numeric-index.h"
34 #include "icing/jni/jni-cache.h"
35 #include "icing/join/join-children-fetcher.h"
36 #include "icing/join/qualified-id-join-index.h"
37 #include "icing/legacy/index/icing-filesystem.h"
38 #include "icing/performance-configuration.h"
39 #include "icing/proto/debug.pb.h"
40 #include "icing/proto/document.pb.h"
41 #include "icing/proto/initialize.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/optimize.pb.h"
44 #include "icing/proto/persist.pb.h"
45 #include "icing/proto/reset.pb.h"
46 #include "icing/proto/schema.pb.h"
47 #include "icing/proto/scoring.pb.h"
48 #include "icing/proto/search.pb.h"
49 #include "icing/proto/storage.pb.h"
50 #include "icing/proto/usage.pb.h"
51 #include "icing/query/query-terms.h"
52 #include "icing/result/result-state-manager.h"
53 #include "icing/schema/schema-store.h"
54 #include "icing/scoring/scored-document-hit.h"
55 #include "icing/store/document-id.h"
56 #include "icing/store/document-store.h"
57 #include "icing/tokenization/language-segmenter.h"
58 #include "icing/transform/normalizer.h"
59 #include "icing/util/clock.h"
60 
61 namespace icing {
62 namespace lib {
63 
64 // TODO(cassiewang) Top-level comments and links to design-doc.
65 class IcingSearchEngine {
66  public:
67   // Note: It is only required to provide a pointer to a valid instance of
68   // JniCache if this instance needs to perform reverse-jni calls. Users on
69   // Linux and iOS should always provide a nullptr.
70   explicit IcingSearchEngine(
71       const IcingSearchEngineOptions& options,
72       std::unique_ptr<const JniCache> jni_cache = nullptr);
73 
74   // Calculates integrity checks and persists files to disk.
75   ~IcingSearchEngine();
76 
77   // Loads & verifies the contents previously indexed from disk and gets ready
78   // to handle read/write requests.
79   //
80   // WARNING: This is expected to be fast if Icing had a clean shutdown.
81   // Otherwise, it can take longer as it runs integrity checks and attempts
82   // to bring the index to a consistent state. If the data on disk is not
83   // consistent, it restores the state when PersistToDisk() was last called.
84   //
85   // TODO(cassiewang): We shouldn't return NOT_FOUND here, this is a symptom
86   // of some other error. We should return a broader error group, i.e. data
87   // inconsistency or something
88   //
89   // Returns:
90   //   OK on success
91   //   DATA_LOSS if encountered any inconsistencies in data and had to restore
92   //     its state back to the last time PersistToDisk was called. Or if any
93   //     persisted data was lost and could not be recovered.
94   //   INTERNAL if any internal state was left in an inconsistent state. The
95   //     instance of IcingSearchEngine is unusable if this happens. It's
96   //     recommended to clear the underlying directory provided in
97   //     IcingSearchEngineOptions.base_dir and reinitialize.
98   //   RESOURCE_EXHAUSTED if not enough storage space
99   //   NOT_FOUND if missing some internal data
100   InitializeResultProto Initialize() ICING_LOCKS_EXCLUDED(mutex_);
101 
102   // Specifies the schema to be applied on all Documents that are already
103   // stored as well as future documents. A schema can be 'invalid' and/or
104   // 'incompatible'. These are two independent concepts.
105   //
106   // An 'invalid' schema is one that is not constructed properly. For example,
107   // a PropertyConfigProto is missing the property name field. A schema can be
108   // 'invalid' even if there is no previously existing schema.
109   //
110   // An 'incompatible' schema is one that is incompatible with a previously
111   // existing schema. If there is no previously existing schema, then a new
112   // schema cannot be incompatible. An incompatible schema is one that
113   // invalidates pre-existing data. For example, a previously OPTIONAL field is
114   // now REQUIRED in the new schema, and pre-existing data is considered invalid
115   // against the new schema now.
116   //
117   // Default behavior will not allow a new schema to be set if it is invalid or
118   // incompatible.
119   //
120   // The argument 'ignore_errors_and_delete_documents' can be set to true to
121   // force set an incompatible schema. In that case, documents that are
122   // invalidated by the new schema would be deleted from Icing. This cannot be
123   // used to force set an invalid schema.
124   //
125   // This schema is persisted to disk and used across multiple instances.
126   // So, callers should only have to call this if the schema changed.
127   // However, calling it multiple times with the same schema is a no-op.
128   //
129   // On some errors, Icing will keep using the older schema, but on
130   // INTERNAL_ERROR, it is undefined to continue using Icing.
131   //
132   // Returns:
133   //   OK on success
134   //   ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
135   //     type or contains a type that has multiple properties with the same
136   //     name.
137   //   INVALID_ARGUMENT if 'new_schema' is invalid
138   //   FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
139   //     has not been initialized yet.
140   //   INTERNAL_ERROR if Icing failed to store the new schema or upgrade
141   //     existing data based on the new schema. Using Icing beyond this error is
142   //     undefined and may cause crashes.
143   //   DATA_LOSS_ERROR if 'new_schema' requires the index to be rebuilt and an
144   //     IO error leads to some documents being excluded from the index. These
145   //     documents will still be retrievable via Get, but won't match queries.
146   //
147   // TODO(cassiewang) Figure out, document (and maybe even enforce) the best
148   // ordering of calls between Initialize() and SetSchema(), both when
149   // the caller is creating an instance of IcingSearchEngine for the first
150   // time and when the caller is reinitializing an existing index on disk.
151   SetSchemaResultProto SetSchema(
152       SchemaProto&& new_schema, bool ignore_errors_and_delete_documents = false)
153       ICING_LOCKS_EXCLUDED(mutex_);
154 
155   // This function makes a copy of the schema and calls SetSchema(SchemaProto&&
156   // new_schema, bool ignore_errors_and_delete_documents)
157   //
158   // NOTE: It's recommended to call SetSchema(SchemaProto&& new_schema, bool
159   // ignore_errors_and_delete_documents) directly to avoid a copy if the caller
160   // can make an rvalue SchemaProto.
161   SetSchemaResultProto SetSchema(const SchemaProto& new_schema,
162                                  bool ignore_errors_and_delete_documents =
163                                      false) ICING_LOCKS_EXCLUDED(mutex_);
164 
165   // Get Icing's current copy of the schema.
166   //
167   // Returns:
168   //   SchemaProto on success
169   //   NOT_FOUND if a schema has not been set yet
170   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet.
171   //   INTERNAL_ERROR on IO error
172   GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_);
173 
174   // Get Icing's copy of the SchemaTypeConfigProto of name schema_type
175   //
176   // Returns:
177   //   SchemaTypeConfigProto on success
178   //   FAILED_PRECONDITION if a schema has not been set yet, or
179   //     IcingSearchEngine has not been initialized yet.
180   //   NOT_FOUND if there is no SchemaTypeConfig of schema_type in the
181   //     SchemaProto
182   //   INTERNAL_ERROR on IO error
183   GetSchemaTypeResultProto GetSchemaType(std::string_view schema_type)
184       ICING_LOCKS_EXCLUDED(mutex_);
185 
186   // Puts the document into icing search engine so that it's stored and
187   // indexed. Documents are automatically written to disk, callers can also
188   // call PersistToDisk() to flush changes immediately.
189   //
190   // Returns:
191   //   OK on success
192   //   OUT_OF_SPACE if exceeds maximum number of allowed documents
193   //   FAILED_PRECONDITION if a schema has not been set yet, or
194   //     IcingSearchEngine has not been initialized yet.
195   //   NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches
196   //     the document's schema
197   //   DATA_LOSS if an IO error occurs while merging document into the index and
198   //     the index is lost. These documents will still be retrievable via Get,
199   //     but won't match queries.
200   //   INTERNAL_ERROR on IO error
201   PutResultProto Put(DocumentProto&& document) ICING_LOCKS_EXCLUDED(mutex_);
202 
203   // This function makes a copy of document and calls Put(DocumentProto&&
204   // document).
205   //
206   // NOTE: It's recommended to call Put(DocumentProto&& document) directly to
207   // avoid a copy if the caller can make an rvalue DocumentProto.
208   PutResultProto Put(const DocumentProto& document)
209       ICING_LOCKS_EXCLUDED(mutex_);
210 
211   // Finds and returns the document identified by the given key (namespace +
212   // uri)
213   //
214   // Returns:
215   //   The document found on success
216   //   NOT_FOUND if the key doesn't exist or doc has been deleted
217   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
218   //   INTERNAL_ERROR on IO error
219   GetResultProto Get(std::string_view name_space, std::string_view uri,
220                      const GetResultSpecProto& result_spec);
221 
222   // Reports usage. The corresponding usage scores of the specified document in
223   // the report will be updated.
224   //
225   // Returns:
226   //   OK on success
227   //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
228   //   INTERNAL_ERROR on I/O errors.
229   ReportUsageResultProto ReportUsage(const UsageReport& usage_report);
230 
231   // Returns all the namespaces that have at least one valid document in them.
232   //
233   // Returns:
234   //   All namespaces on success
235   GetAllNamespacesResultProto GetAllNamespaces();
236 
237   // Deletes the Document specified by the given namespace / uri pair from the
238   // search engine. Delete changes are automatically applied to disk, callers
239   // can also call PersistToDisk() to flush changes immediately.
240   //
241   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
242   // called.
243   //
244   // Returns:
245   //   OK on success
246   //   NOT_FOUND if no document exists with namespace, uri
247   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
248   //   INTERNAL_ERROR on IO error
249   DeleteResultProto Delete(std::string_view name_space, std::string_view uri)
250       ICING_LOCKS_EXCLUDED(mutex_);
251 
252   // Deletes all Documents belonging to the specified namespace from the search
253   // engine. Delete changes are automatically applied to disk, callers can also
254   // call PersistToDisk() to flush changes immediately.
255   //
256   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
257   // called.
258   //
259   // Returns:
260   //   OK on success
261   //   NOT_FOUND if namespace doesn't exist
262   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
263   //   INTERNAL_ERROR on IO error
264   DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space)
265       ICING_LOCKS_EXCLUDED(mutex_);
266 
267   // Deletes all Documents belonging to the specified type from the search
268   // engine. Delete changes are automatically applied to disk, callers can also
269   // call PersistToDisk() to flush changes immediately.
270   //
271   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
272   // called.
273   //
274   // Returns:
275   //   OK on success
276   //   NOT_FOUND if schema type doesn't exist
277   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
278   //   INTERNAL_ERROR on IO error
279   DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
280       ICING_LOCKS_EXCLUDED(mutex_);
281 
282   // Deletes all Documents that match the query specified in search_spec. Delete
283   // changes are automatically applied to disk, callers can also call
284   // PersistToDisk() to flush changes immediately.
285   //
286   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
287   // called.
288   //
289   // Returns:
290   //   OK on success
291   //   NOT_FOUND if the query doesn't match any documents
292   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
293   //   INTERNAL_ERROR on IO error
294   DeleteByQueryResultProto DeleteByQuery(
295       const SearchSpecProto& search_spec,
296       bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
297 
298   // Retrieves, scores, ranks, and returns the results according to the specs.
299   // Results can be empty. If there're multiple pages of results,
300   // SearchResultProto.next_page_token will be set to a non-zero token and can
301   // be used to fetch more pages via GetNextPage() method. Clients should call
302   // InvalidateNextPageToken() after they get the pages they need to release
303   // result cache in memory. Please refer to each proto file for spec
304   // definitions.
305   //
306   // Returns a SearchResultProto with status:
307   //   OK with results on success
308   //   INVALID_ARGUMENT if any of specs is invalid
309   //   ABORTED if failed to perform search but existing data is not affected
310   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
311   //   INTERNAL_ERROR on any other errors
312   SearchResultProto Search(const SearchSpecProto& search_spec,
313                            const ScoringSpecProto& scoring_spec,
314                            const ResultSpecProto& result_spec)
315       ICING_LOCKS_EXCLUDED(mutex_);
316 
317   // Retrieves, scores, ranks and returns the suggested query string according
318   // to the specs. Results can be empty.
319   //
320   // Returns a SuggestionResponse with status:
321   //   OK with results on success
322   //   INVALID_ARGUMENT if any of specs is invalid
323   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
324   //   INTERNAL_ERROR on any other errors
325   SuggestionResponse SearchSuggestions(
326       const SuggestionSpecProto& suggestion_spec) ICING_LOCKS_EXCLUDED(mutex_);
327 
328   // Fetches the next page of results of a previously executed query. Results
329   // can be empty if next-page token is invalid. Invalid next page tokens are
330   // tokens that are either zero or were previously passed to
331   // InvalidateNextPageToken. If there are pages of results remaining after the
332   // one retrieved by this call, SearchResultProto.next_page_token will be
333   // set to a non-zero token and can be used to fetch more pages via
334   // GetNextPage() method.
335   //
336   // Returns a SearchResultProto with status:
337   //   OK with results on success
338   //   ABORTED if failed to get results but existing data is not affected
339   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
340   //   INTERNAL_ERROR on any other errors
341   SearchResultProto GetNextPage(uint64_t next_page_token)
342       ICING_LOCKS_EXCLUDED(mutex_);
343 
344   // Invalidates the next-page token so that no more results of the related
345   // query can be returned.
346   void InvalidateNextPageToken(uint64_t next_page_token)
347       ICING_LOCKS_EXCLUDED(mutex_);
348 
349   // Makes sure that every update/delete received till this point is flushed
350   // to disk. If the app crashes after a call to PersistToDisk(), Icing
351   // would be able to fully recover all data written up to this point.
352   //
353   // If persist_type is PersistType::LITE, then only the ground truth will be
354   // synced. This should be relatively lightweight to do (order of microseconds)
355   // and ensures that there will be no data loss. At worst, Icing may need to
356   // recover internal data structures by replaying the document log upon the
357   // next startup. Clients should call PersistToDisk(LITE) after each batch of
358   // mutations.
359   //
360   // If persist_type is PersistType::FULL, then all internal data structures in
361   // Icing will be synced. This is a heavier operation (order of milliseconds).
362   // It ensures that Icing will not need to recover internal data structures
363   // upon the next startup. Clients should call PersistToDisk(FULL) before their
364   // process dies.
365   //
366   // NOTE: It is not necessary to call PersistToDisk() to read back data
367   // that was recently written. All read APIs will include the most recent
368   // updates/deletes regardless of the data being flushed to disk.
369   //
370   // Returns:
371   //   OK on success
372   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
373   //   INTERNAL on I/O error
374   PersistToDiskResultProto PersistToDisk(PersistType::Code persist_type)
375       ICING_LOCKS_EXCLUDED(mutex_);
376 
377   // Allows Icing to run tasks that are too expensive and/or unnecessary to be
378   // executed in real-time, but are useful to keep it fast and be
379   // resource-efficient. This method purely optimizes the internal files and
380   // has no functional impact on what gets accepted/returned.
381   //
382   // WARNING: This method is CPU and IO intensive and depending on the
383   // contents stored, it can take from a few seconds to a few minutes.
384   // This call also blocks all read/write operations on Icing.
385   //
386   // SUGGESTION: Assuming the client has no restrictions on their side, it's
387   // recommended to call this method about once every 24 hours when the
388   // device is idle and charging. It can also be called when the system needs
389   // to free up extra disk-space.
390   //
391   // Returns:
392   //   OK on success
393   //   ABORTED_ERROR if optimization is aborted due to non-fatal errors before
394   //                 actual modifications are made.
395   //   DATA_LOSS_ERROR on errors that could potentially cause data loss,
396   //                   IcingSearchEngine is still functioning.
397   //   INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued
398   //                  use of Icing is undefined.
399   //                  Clients could clear and reinitialize IcingSearchEngine.
400   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
401   OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_);
402 
403   // Returns potential size and document savings if Optimize were called.
404   //
405   // Returns:
406   //   OK on success
407   //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
408   //   INTERNAL_ERROR on IO error
409   GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_);
410 
411   // Calculates the StorageInfo for Icing.
412   //
413   // If an IO error occurs while trying to calculate the value for a field, then
414   // that field will be set to -1.
415   StorageInfoResultProto GetStorageInfo() ICING_LOCKS_EXCLUDED(mutex_);
416 
417   // Get debug information for Icing.
418   DebugInfoResultProto GetDebugInfo(DebugInfoVerbosity::Code verbosity)
419       ICING_LOCKS_EXCLUDED(mutex_);
420 
421   // Clears all data from Icing and re-initializes. Clients DO NOT need to call
422   // Initialize again.
423   //
424   // Returns:
425   //   OK on success
426   //   ABORTED_ERROR if failed to delete underlying files
427   //   INTERNAL_ERROR if internal state is no longer consistent
428   ResetResultProto Reset() ICING_LOCKS_EXCLUDED(mutex_);
429 
430   // Disallow copy and move.
431   IcingSearchEngine(const IcingSearchEngine&) = delete;
432   IcingSearchEngine& operator=(const IcingSearchEngine&) = delete;
433 
434  protected:
435   IcingSearchEngine(IcingSearchEngineOptions options,
436                     std::unique_ptr<const Filesystem> filesystem,
437                     std::unique_ptr<const IcingFilesystem> icing_filesystem,
438                     std::unique_ptr<Clock> clock,
439                     std::unique_ptr<const JniCache> jni_cache = nullptr);
440 
441  private:
442   const IcingSearchEngineOptions options_;
443   const std::unique_ptr<const Filesystem> filesystem_;
444   const std::unique_ptr<const IcingFilesystem> icing_filesystem_;
445   bool initialized_ ICING_GUARDED_BY(mutex_) = false;
446 
447   // Abstraction for accessing time values.
448   const std::unique_ptr<const Clock> clock_;
449 
450   // Provides key thresholds that affect the running time and memory of major
451   // components in Icing search engine.
452   const PerformanceConfiguration performance_configuration_;
453 
454   // Used to manage pagination state of query results. Even though
455   // ResultStateManager has its own reader-writer lock, mutex_ must still be
456   // acquired first in order to adhere to the global lock ordering:
457   //   1. mutex_
458   //   2. result_state_manager_.lock_
459   std::unique_ptr<ResultStateManager> result_state_manager_
460       ICING_GUARDED_BY(mutex_);
461 
462   // Used to provide reader and writer locks
463   absl_ports::shared_mutex mutex_;
464 
465   // Stores and processes the schema
466   std::unique_ptr<SchemaStore> schema_store_ ICING_GUARDED_BY(mutex_);
467 
468   // Used to store all valid documents
469   std::unique_ptr<DocumentStore> document_store_ ICING_GUARDED_BY(mutex_);
470 
471   std::unique_ptr<const LanguageSegmenter> language_segmenter_
472       ICING_GUARDED_BY(mutex_);
473 
474   std::unique_ptr<const Normalizer> normalizer_ ICING_GUARDED_BY(mutex_);
475 
476   // Storage for all hits of string contents from the document store.
477   std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_);
478 
479   // Storage for all hits of numeric contents from the document store.
480   std::unique_ptr<NumericIndex<int64_t>> integer_index_
481       ICING_GUARDED_BY(mutex_);
482 
483   // Storage for all join qualified ids from the document store.
484   std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_
485       ICING_GUARDED_BY(mutex_);
486 
487   // Storage for all hits of embedding contents from the document store.
488   std::unique_ptr<EmbeddingIndex> embedding_index_ ICING_GUARDED_BY(mutex_);
489 
490   // Pointer to JNI class references
491   const std::unique_ptr<const JniCache> jni_cache_;
492 
493   // Resets all members that are created during Initialize.
494   void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
495 
496   // Resets all members that are created during Initialize, deletes all
497   // underlying files and initializes a fresh index.
498   ResetResultProto ResetInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
499 
500   // Checks for the existence of the init marker file. If the failed init count
501   // exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is
502   // initialized from scratch. The updated count (original failed init
503   // count + 1) is written to the marker file.
504   //
505   // RETURNS
506   //   OK on success
507   //   INTERNAL if an IO error occurs while trying to update the marker file.
508   libtextclassifier3::Status CheckInitMarkerFile(
509       InitializeStatsProto* initialize_stats)
510       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
511 
512   // Helper method to do the actual work to persist data to disk. We need this
513   // separate method so that other public methods don't need to call
514   // PersistToDisk(). Public methods calling each other may cause deadlock
515   // issues.
516   libtextclassifier3::Status InternalPersistToDisk(
517       PersistType::Code persist_type) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
518 
519   // Helper method to do the actual work to Initialize. We need this separate
520   // method so that other public methods don't need to call Initialize(). Public
521   // methods calling each other may cause deadlock issues.
522   InitializeResultProto InternalInitialize()
523       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
524 
525   // Helper method to initialize member variables.
526   //
527   // Returns:
528   //   OK on success
529   //   FAILED_PRECONDITION if initialize_stats is null
530   //   RESOURCE_EXHAUSTED if the index runs out of storage
531   //   NOT_FOUND if some Document's schema type is not in the SchemaStore
532   //   INTERNAL on any I/O errors
533   libtextclassifier3::Status InitializeMembers(
534       InitializeStatsProto* initialize_stats)
535       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
536 
537   // Do any initialization/recovery necessary to create a SchemaStore instance.
538   //
539   // Returns:
540   //   OK on success
541   //   FAILED_PRECONDITION if initialize_stats is null
542   //   INTERNAL on I/O error
543   libtextclassifier3::Status InitializeSchemaStore(
544       InitializeStatsProto* initialize_stats)
545       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
546 
547   // Do any initialization/recovery necessary to create a DocumentStore
548   // instance.
549   //
550   // See comments on DocumentStore::Create for explanation of
551   // force_recovery_and_revalidate_documents.
552   //
553   // Returns:
554   //   On success, a boolean flag indicating whether derived files of the
555   //     document store have been regenerated or not. If true, any other
556   //     components depending on them should also be rebuilt.
557   //   FAILED_PRECONDITION if initialize_stats is null
558   //   INTERNAL on I/O error
559   libtextclassifier3::StatusOr<bool> InitializeDocumentStore(
560       bool force_recovery_and_revalidate_documents,
561       InitializeStatsProto* initialize_stats)
562       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
563 
564   // Do any initialization/recovery necessary to create term index, integer
565   // index, and qualified id join index instances.
566   //
567   // If document_store_derived_files_regenerated is true, then we have to
568   // rebuild qualified id join index since NamespaceIds were reassigned.
569   //
570   // Returns:
571   //   OK on success
572   //   FAILED_PRECONDITION if initialize_stats is null
573   //   RESOURCE_EXHAUSTED if the index runs out of storage
574   //   NOT_FOUND if some Document's schema type is not in the SchemaStore
575   //   INTERNAL on I/O error
576   libtextclassifier3::Status InitializeIndex(
577       bool document_store_derived_files_regenerated,
578       InitializeStatsProto* initialize_stats)
579       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
580 
581   // Implementation of IcingSearchEngine::Search that only grabs the overall
582   // read-lock, allowing for parallel non-exclusive operations.
583   // This implementation is used if search_spec.use_read_only_search is true.
584   SearchResultProto SearchLockedShared(const SearchSpecProto& search_spec,
585                                        const ScoringSpecProto& scoring_spec,
586                                        const ResultSpecProto& result_spec)
587       ICING_LOCKS_EXCLUDED(mutex_);
588 
589   // Implementation of IcingSearchEngine::Search that requires the overall
590   // write lock. No other operations of any kind can be executed in parallel if
591   // this version is used.
592   // This implementation is used if search_spec.use_read_only_search is false.
593   SearchResultProto SearchLockedExclusive(const SearchSpecProto& search_spec,
594                                           const ScoringSpecProto& scoring_spec,
595                                           const ResultSpecProto& result_spec)
596       ICING_LOCKS_EXCLUDED(mutex_);
597 
598   // Helper method for the actual work to Search. We need this separate
599   // method to manage locking for Search.
600   SearchResultProto InternalSearch(const SearchSpecProto& search_spec,
601                                    const ScoringSpecProto& scoring_spec,
602                                    const ResultSpecProto& result_spec)
603       ICING_SHARED_LOCKS_REQUIRED(mutex_);
604 
605   // Processes query and scores according to the specs. It is a helper function
606   // (called by Search) to process and score normal query and the nested child
607   // query for join search.
608   //
609   // Returns a QueryScoringResults
610   //   OK on success with a vector of ScoredDocumentHits,
611   //      SectionRestrictQueryTermsMap, and other stats fields for logging.
612   //   Any other errors when processing the query or scoring
613   struct QueryScoringResults {  // Value returned by ProcessQueryAndScore.
614     libtextclassifier3::Status status;  // OK, or the query/scoring error.
615     SectionRestrictQueryTermsMap query_terms;
616     std::vector<ScoredDocumentHit> scored_document_hits;
617 
618     explicit QueryScoringResults(  // Moves each argument into its member.
619         libtextclassifier3::Status status_in,
620         SectionRestrictQueryTermsMap&& query_terms_in,
621         std::vector<ScoredDocumentHit>&& scored_document_hits_in)
622         : status(std::move(status_in)),
623           query_terms(std::move(query_terms_in)),
624           scored_document_hits(std::move(scored_document_hits_in)) {}
625   };
626   QueryScoringResults ProcessQueryAndScore(
627       const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
628       const ResultSpecProto& result_spec,
629       const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
630       QueryStatsProto::SearchStats* search_stats)
631       ICING_SHARED_LOCKS_REQUIRED(mutex_);
632 
633   // Many of the internal components rely on other components' derived data.
634   // Check that everything is consistent with each other so that we're not
635   // using outdated derived data in some parts of our system.
636   //
637   // NOTE: this method can be called only at startup time or after
638   // PersistToDisk(), otherwise the check could fail due to any changes that are
639   // not persisted.
640   //
641   // Returns:
642   //   OK on success
643   //   NOT_FOUND if missing header file
644   //   INTERNAL_ERROR on any IO errors or if header is inconsistent
645   libtextclassifier3::Status CheckConsistency()
646       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
647 
648   // Discards the derived data that rebuild_result marks as requiring a rebuild.
649   // NOTE(review): "those instances" below is not defined here - please clarify.
650   // Returns:
651   //   OK on success
652   //   FAILED_PRECONDITION_ERROR if those instances are valid (non nullptr)
653   //   INTERNAL_ERROR on any I/O error
654   libtextclassifier3::Status DiscardDerivedFiles(
655       const version_util::DerivedFilesRebuildResult& rebuild_result)
656       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
657 
658   // Repopulates the derived data from our ground truths.
659   //
660   // Returns:
661   //   OK on success
662   //   INTERNAL_ERROR on any IO error
663   libtextclassifier3::Status RegenerateDerivedFiles(
664       InitializeStatsProto* initialize_stats = nullptr,  // may be omitted
665       bool log_document_store_stats = false)  // off unless requested
666       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
667 
668   // Optimizes the DocumentStore by removing any unneeded documents (e.g.
669   // deleted or expired ones) from the filesystem storage.
670   //
671   // NOTE: This may leave the DocumentStore in an invalid/uncreated state. Users
672   // would then need to call Initialize() to get back into a valid state.
673   //
674   // Returns:
675   //   On success, OptimizeResult which contains a vector mapping from old
676   //   document id to new document id and another vector mapping from old
677   //   namespace id to new namespace id. A value of kInvalidDocumentId indicates
678   //   that the old document id has been deleted.
679   //   ABORTED_ERROR if any error happens before the actual optimization; the
680   //                 original document store should still be available
681   //   DATA_LOSS_ERROR on errors that could potentially cause data loss; the
682   //                   document store is still available
683   //   INTERNAL_ERROR on any IO errors or other errors that we can't recover
684   //                  from
685   libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
686   OptimizeDocumentStore(OptimizeStatsProto* optimize_stats)
687       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
688 
689   // Helper method to restore missing document data in index_, integer_index_,
690   // qualified_id_join_index_ and (judging by the result flags) the embedding
691   // index. All documents will be reindexed. The indices are not cleared first;
692   // call ClearAllIndices, ClearSearchIndices, or ClearJoinIndices if needed.
693   //
694   // Returns an IndexRestorationResult whose status is:
695   //   OK on success; one bool per index (term, integer, qualified id join,
696   //     embedding) indicates whether or not restoration was needed.
697   //   DATA_LOSS, if an error during index merging caused us to lose indexed
698   //     data in the main index. Despite the data loss, this is still considered
699   //     a successful run; the matching *_needed_restoration is presumably true.
700   //   RESOURCE_EXHAUSTED if the index fills up before finishing indexing
701   //   NOT_FOUND if some Document's schema type is not in the SchemaStore
702   //   INTERNAL_ERROR on any IO errors
703   struct IndexRestorationResult {
704     libtextclassifier3::Status status;
705     bool index_needed_restoration;
706     bool integer_index_needed_restoration;
707     bool qualified_id_join_index_needed_restoration;
708     bool embedding_index_needed_restoration;
709   };
710   IndexRestorationResult RestoreIndexIfNeeded()
711       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
712 
713   // If we lost the schema during a previous failure, it may "look" the same as
714   // never having set a schema: in both cases there is no schema proto file. So
715   // do some extra checks to differentiate between having lost the schema and
716   // never having had a schema before. This may determine whether we need to do
717   // extra recovery steps.
718   //
719   // Returns:
720   //   bool indicating whether we had a schema and unintentionally lost it
721   //   INTERNAL_ERROR on I/O error
722   libtextclassifier3::StatusOr<bool> LostPreviousSchema()
723       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
724 
725   // Helper method to create every type of data indexing handler (for indexing
726   // terms, integers, and join qualified ids); returns them as owning pointers.
727   libtextclassifier3::StatusOr<
728       std::vector<std::unique_ptr<DataIndexingHandler>>>
729   CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
730 
731   // Helper method to discard parts of the (term, integer, qualified id join;
732   // presumably also embedding) indices if they contain data for document ids
733   // greater than last_stored_document_id.
734   //
735   // REQUIRES: last_stored_document_id is valid (!= kInvalidDocumentId). Note:
736   //   if we want to truncate everything in the index, then please call
737   //   ClearSearchIndices/ClearJoinIndices/ClearAllIndices instead.
738   //
739   // Returns:
740   //   On success, a DocumentId indicating the first document to start
741   //     reindexing from and one bool flag per index (term, integer, qualified
742   //     id join, embedding) indicating whether that index needs restoration.
743   //   INTERNAL on any I/O errors
744   struct TruncateIndexResult {
745     DocumentId first_document_to_reindex;
746     bool index_needed_restoration;
747     bool integer_index_needed_restoration;
748     bool qualified_id_join_index_needed_restoration;
749     bool embedding_index_needed_restoration;
750 
751     explicit TruncateIndexResult(
752         DocumentId first_document_to_reindex_in,
753         bool index_needed_restoration_in,
754         bool integer_index_needed_restoration_in,
755         bool qualified_id_join_index_needed_restoration_in,
756         bool embedding_index_needed_restoration_in)
757         : first_document_to_reindex(first_document_to_reindex_in),
758           index_needed_restoration(index_needed_restoration_in),
759           integer_index_needed_restoration(integer_index_needed_restoration_in),
760           qualified_id_join_index_needed_restoration(
761               qualified_id_join_index_needed_restoration_in),
762           embedding_index_needed_restoration(
763               embedding_index_needed_restoration_in) {}
764   };
765   libtextclassifier3::StatusOr<TruncateIndexResult> TruncateIndicesTo(
766       DocumentId last_stored_document_id)
767       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
768 
769   // Helper method to discard the search (term, integer) indices.
770   //
771   // Returns:
772   //   OK on success
773   //   INTERNAL_ERROR on any I/O error
774   libtextclassifier3::Status ClearSearchIndices()
775       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
776 
777   // Helper method to discard the join (qualified id) indices.
778   //
779   // Returns:
780   //   OK on success
781   //   INTERNAL_ERROR on any I/O error
782   libtextclassifier3::Status ClearJoinIndices()
783       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
784 
785   // Helper method to discard all of the search and join indices.
786   //
787   // Returns:
788   //   OK on success
789   //   INTERNAL_ERROR on any I/O error
790   libtextclassifier3::Status ClearAllIndices()
791       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
792 };
793 
794 }  // namespace lib
795 }  // namespace icing
796 
797 #endif  // ICING_ICING_SEARCH_ENGINE_H_
798