// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_ICING_SEARCH_ENGINE_H_
#define ICING_ICING_SEARCH_ENGINE_H_

#include <cstdint>
#include <memory>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/thread_annotations.h"
#include "icing/file/filesystem.h"
#include "icing/file/version-util.h"
#include "icing/index/data-indexing-handler.h"
#include "icing/index/embed/embedding-index.h"
#include "icing/index/index.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/jni/jni-cache.h"
#include "icing/join/join-children-fetcher.h"
#include "icing/join/qualified-id-join-index.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/performance-configuration.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/usage.pb.h"
#include "icing/query/query-terms.h"
#include "icing/result/result-state-manager.h"
#include "icing/schema/schema-store.h"
#include "icing/scoring/scored-document-hit.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"

namespace icing {
namespace lib {

// Public entry point declared by this header. An instance owns the schema
// store, the document store and the term / integer / qualified-id-join /
// embedding indices, and exposes the operations declared below. The members
// created during Initialize() are guarded by mutex_.
//
// TODO(cassiewang) Top-level comments and links to design-doc.
class IcingSearchEngine {
 public:
  // Note: It is only required to provide a pointer to a valid instance of
  // JniCache if this instance needs to perform reverse-jni calls. Users on
  // Linux and iOS should always provide a nullptr.
  explicit IcingSearchEngine(
      const IcingSearchEngineOptions& options,
      std::unique_ptr<const JniCache> jni_cache = nullptr);

  // Calculates integrity checks and persists files to disk.
  ~IcingSearchEngine();

  // Loads & verifies the contents previously indexed from disk and gets ready
  // to handle read/write requests.
  //
  // WARNING: This is expected to be fast if Icing had a clean shutdown.
  // Otherwise, it can take longer as it runs integrity checks and attempts
  // to bring the index to a consistent state. If the data on disk is not
  // consistent, it restores the state when PersistToDisk() was last called.
  //
  // TODO(cassiewang): We shouldn't return NOT_FOUND here, this is a symptom
  // of some other error. We should return a broader error group, i.e. data
  // inconsistency or something
  //
  // Returns:
  //   OK on success
  //   DATA_LOSS if encountered any inconsistencies in data and had to restore
  //     its state back to the last time PersistToDisk was called. Or if any
  //     persisted data was lost and could not be recovered.
  //   INTERNAL if any internal state was left in an inconsistent state. The
  //     instance of IcingSearchEngine is unusable if this happens. It's
  //     recommended to clear the underlying directory provided in
  //     IcingSearchEngineOptions.base_dir and reinitialize.
  //   RESOURCE_EXHAUSTED if not enough storage space
  //   NOT_FOUND if missing some internal data
  InitializeResultProto Initialize() ICING_LOCKS_EXCLUDED(mutex_);

  // Specifies the schema to be applied on all Documents that are already
  // stored as well as future documents. A schema can be 'invalid' and/or
  // 'incompatible'. These are two independent concepts.
  //
  // An 'invalid' schema is one that is not constructed properly. For example,
  // a PropertyConfigProto is missing the property name field. A schema can be
  // 'invalid' even if there is no previously existing schema.
  //
  // An 'incompatible' schema is one that is incompatible with a previously
  // existing schema. If there is no previously existing schema, then a new
  // schema cannot be incompatible. An incompatible schema is one that
  // invalidates pre-existing data. For example, a previously OPTIONAL field is
  // now REQUIRED in the new schema, and pre-existing data is considered invalid
  // against the new schema now.
  //
  // Default behavior will not allow a new schema to be set if it is invalid or
  // incompatible.
  //
  // The argument 'ignore_errors_and_delete_documents' can be set to true to
  // force set an incompatible schema. In that case, documents that are
  // invalidated by the new schema would be deleted from Icing. This cannot be
  // used to force set an invalid schema.
  //
  // This schema is persisted to disk and used across multiple instances.
  // So, callers should only have to call this if the schema changed.
  // However, calling it multiple times with the same schema is a no-op.
  //
  // On some errors, Icing will keep using the older schema, but on
  // INTERNAL_ERROR, it is undefined to continue using Icing.
  //
  // Returns:
  //   OK on success
  //   ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
  //     type or contains a type that has multiple properties with the same
  //     name.
  //   INVALID_ARGUMENT if 'new_schema' is invalid
  //   FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
  //     has not been initialized yet.
  //   INTERNAL_ERROR if Icing failed to store the new schema or upgrade
  //     existing data based on the new schema. Using Icing beyond this error is
  //     undefined and may cause crashes.
  //   DATA_LOSS_ERROR if 'new_schema' requires the index to be rebuilt and an
  //     IO error leads to some documents being excluded from the index. These
  //     documents will still be retrievable via Get, but won't match queries.
  //
  // TODO(cassiewang) Figure out, document (and maybe even enforce) the best
  // ordering of calls between Initialize() and SetSchema(), both when
  // the caller is creating an instance of IcingSearchEngine for the first
  // time and when the caller is reinitializing an existing index on disk.
  SetSchemaResultProto SetSchema(
      SchemaProto&& new_schema, bool ignore_errors_and_delete_documents = false)
      ICING_LOCKS_EXCLUDED(mutex_);

  // This function makes a copy of the schema and calls SetSchema(SchemaProto&&
  // new_schema, bool ignore_errors_and_delete_documents)
  //
  // NOTE: It's recommended to call SetSchema(SchemaProto&& new_schema, bool
  // ignore_errors_and_delete_documents) directly to avoid a copy if the caller
  // can make an rvalue SchemaProto.
  SetSchemaResultProto SetSchema(const SchemaProto& new_schema,
                                 bool ignore_errors_and_delete_documents =
                                     false) ICING_LOCKS_EXCLUDED(mutex_);

  // Get Icing's current copy of the schema.
  //
  // Returns:
  //   SchemaProto on success
  //   NOT_FOUND if a schema has not been set yet
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet.
  //   INTERNAL_ERROR on IO error
  GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_);

  // Get Icing's copy of the SchemaTypeConfigProto of name schema_type
  //
  // Returns:
  //   SchemaTypeConfigProto on success
  //   FAILED_PRECONDITION if a schema has not been set yet, or
  //     IcingSearchEngine has not been initialized yet.
  //   NOT_FOUND if there is no SchemaTypeConfig of schema_type in the
  //     SchemaProto
  //   INTERNAL_ERROR on IO error
  GetSchemaTypeResultProto GetSchemaType(std::string_view schema_type)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Puts the document into icing search engine so that it's stored and
  // indexed. Documents are automatically written to disk, callers can also
  // call PersistToDisk() to flush changes immediately.
  //
  // Returns:
  //   OK on success
  //   OUT_OF_SPACE if exceeds maximum number of allowed documents
  //   FAILED_PRECONDITION if a schema has not been set yet, or
  //     IcingSearchEngine has not been initialized yet.
  //   NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches
  //     the document's schema
  //   DATA_LOSS if an IO error occurs while merging document into the index and
  //     the index is lost. These documents will still be retrievable via Get,
  //     but won't match queries.
  //   INTERNAL_ERROR on IO error
  PutResultProto Put(DocumentProto&& document) ICING_LOCKS_EXCLUDED(mutex_);

  // This function makes a copy of document and calls Put(DocumentProto&&
  // document).
  //
  // NOTE: It's recommended to call Put(DocumentProto&& document) directly to
  // avoid a copy if the caller can make an rvalue DocumentProto.
  PutResultProto Put(const DocumentProto& document)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Finds and returns the document identified by the given key (namespace +
  // uri)
  //
  // NOTE(review): unlike most other public methods in this class, this
  // declaration has no ICING_LOCKS_EXCLUDED(mutex_) annotation - confirm
  // whether that is intentional.
  //
  // Returns:
  //   The document found on success
  //   NOT_FOUND if the key doesn't exist or doc has been deleted
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on IO error
  GetResultProto Get(std::string_view name_space, std::string_view uri,
                     const GetResultSpecProto& result_spec);

  // Reports usage. The corresponding usage scores of the specified document in
  // the report will be updated.
  //
  // NOTE(review): no ICING_LOCKS_EXCLUDED(mutex_) annotation on this
  // declaration either - confirm whether that is intentional.
  //
  // Returns:
  //   OK on success
  //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
  //   INTERNAL_ERROR on I/O errors.
  ReportUsageResultProto ReportUsage(const UsageReport& usage_report);

  // Returns all the namespaces that have at least one valid document in it.
  //
  // NOTE(review): no ICING_LOCKS_EXCLUDED(mutex_) annotation on this
  // declaration either - confirm whether that is intentional.
  //
  // Returns:
  //   All namespaces on success
  GetAllNamespacesResultProto GetAllNamespaces();

  // Deletes the Document specified by the given namespace / uri pair from the
  // search engine. Delete changes are automatically applied to disk, callers
  // can also call PersistToDisk() to flush changes immediately.
  //
  // NOTE: Space is not reclaimed for deleted documents until Optimize() is
  // called.
  //
  // Returns:
  //   OK on success
  //   NOT_FOUND if no document exists with namespace, uri
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on IO error
  DeleteResultProto Delete(std::string_view name_space, std::string_view uri)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Deletes all Documents belonging to the specified namespace from the search
  // engine. Delete changes are automatically applied to disk, callers can also
  // call PersistToDisk() to flush changes immediately.
  //
  // NOTE: Space is not reclaimed for deleted documents until Optimize() is
  // called.
  //
  // Returns:
  //   OK on success
  //   NOT_FOUND if namespace doesn't exist
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on IO error
  DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Deletes all Documents belonging to the specified type from the search
  // engine. Delete changes are automatically applied to disk, callers can also
  // call PersistToDisk() to flush changes immediately.
  //
  // NOTE: Space is not reclaimed for deleted documents until Optimize() is
  // called.
  //
  // Returns:
  //   OK on success
  //   NOT_FOUND if schema type doesn't exist
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on IO error
  DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Deletes all Documents that match the query specified in search_spec. Delete
  // changes are automatically applied to disk, callers can also call
  // PersistToDisk() to flush changes immediately.
  //
  // NOTE: Space is not reclaimed for deleted documents until Optimize() is
  // called.
  //
  // NOTE(review): return_deleted_document_info presumably controls whether
  // information about the deleted documents is included in the returned
  // proto - confirm against the implementation.
  //
  // Returns:
  //   OK on success
  //   NOT_FOUND if the query doesn't match any documents
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on IO error
  DeleteByQueryResultProto DeleteByQuery(
      const SearchSpecProto& search_spec,
      bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);

  // Retrieves, scores, ranks, and returns the results according to the specs.
  // Results can be empty. If there are multiple pages of results,
  // SearchResultProto.next_page_token will be set to a non-zero token and can
  // be used to fetch more pages via GetNextPage() method. Clients should call
  // InvalidateNextPageToken() after they get the pages they need to release
  // result cache in memory. Please refer to each proto file for spec
  // definitions.
  //
  // Returns a SearchResultProto with status:
  //   OK with results on success
  //   INVALID_ARGUMENT if any of specs is invalid
  //   ABORTED if failed to perform search but existing data is not affected
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on any other errors
  SearchResultProto Search(const SearchSpecProto& search_spec,
                           const ScoringSpecProto& scoring_spec,
                           const ResultSpecProto& result_spec)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Retrieves, scores, ranks and returns the suggested query string according
  // to the specs. Results can be empty.
  //
  // Returns a SuggestionResponse with status:
  //   OK with results on success
  //   INVALID_ARGUMENT if any of specs is invalid
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on any other errors
  SuggestionResponse SearchSuggestions(
      const SuggestionSpecProto& suggestion_spec) ICING_LOCKS_EXCLUDED(mutex_);

  // Fetches the next page of results of a previously executed query. Results
  // can be empty if next-page token is invalid. Invalid next page tokens are
  // tokens that are either zero or were previously passed to
  // InvalidateNextPageToken. If there are pages of results remaining after the
  // one retrieved by this call, SearchResultProto.next_page_token will be
  // set to a non-zero token and can be used to fetch more pages via
  // GetNextPage() method.
  //
  // Returns a SearchResultProto with status:
  //   OK with results on success
  //   ABORTED if failed to get results but existing data is not affected
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on any other errors
  SearchResultProto GetNextPage(uint64_t next_page_token)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Invalidates the next-page token so that no more results of the related
  // query can be returned.
  void InvalidateNextPageToken(uint64_t next_page_token)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Makes sure that every update/delete received till this point is flushed
  // to disk. If the app crashes after a call to PersistToDisk(), Icing
  // would be able to fully recover all data written up to this point.
  //
  // If persist_type is PersistType::LITE, then only the ground truth will be
  // synced. This should be relatively lightweight to do (order of microseconds)
  // and ensures that there will be no data loss. At worst, Icing may need to
  // recover internal data structures by replaying the document log upon the
  // next startup. Clients should call PersistToDisk(LITE) after each batch of
  // mutations.
  //
  // If persist_type is PersistType::FULL, then all internal data structures in
  // Icing will be synced. This is a heavier operation (order of milliseconds).
  // It ensures that Icing will not need to recover internal data structures
  // upon the next startup. Clients should call PersistToDisk(FULL) before their
  // process dies.
  //
  // NOTE: It is not necessary to call PersistToDisk() to read back data
  // that was recently written. All read APIs will include the most recent
  // updates/deletes regardless of the data being flushed to disk.
  //
  // Returns:
  //   OK on success
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL on I/O error
  PersistToDiskResultProto PersistToDisk(PersistType::Code persist_type)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Allows Icing to run tasks that are too expensive and/or unnecessary to be
  // executed in real-time, but are useful to keep it fast and be
  // resource-efficient. This method purely optimizes the internal files and
  // has no functional impact on what gets accepted/returned.
  //
  // WARNING: This method is CPU and IO intensive and depending on the
  // contents stored, it can take from a few seconds to a few minutes.
  // This call also blocks all read/write operations on Icing.
  //
  // SUGGESTION: Assuming the client has no restrictions on their side, it's
  // recommended to call this method about once every 24 hours when the
  // device is idle and charging. It can also be called when the system needs
  // to free up extra disk-space.
  //
  // Returns:
  //   OK on success
  //   ABORTED_ERROR if optimization is aborted due to non-fatal errors before
  //     actual modifications are made.
  //   DATA_LOSS_ERROR on errors that could potentially cause data loss,
  //     IcingSearchEngine is still functioning.
  //   INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued
  //     use of Icing is undefined.
  //     Clients could clear and reinitialize IcingSearchEngine.
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_);

  // Returns potential size and document savings if Optimize were called.
  //
  // Returns:
  //   OK on success
  //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
  //   INTERNAL_ERROR on IO error
  GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_);

  // Calculates the StorageInfo for Icing.
  //
  // If an IO error occurs while trying to calculate the value for a field, then
  // that field will be set to -1.
  StorageInfoResultProto GetStorageInfo() ICING_LOCKS_EXCLUDED(mutex_);

  // Get debug information for Icing.
  DebugInfoResultProto GetDebugInfo(DebugInfoVerbosity::Code verbosity)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Clears all data from Icing and re-initializes. Clients DO NOT need to call
  // Initialize again.
  //
  // Returns:
  //   OK on success
  //   ABORTED_ERROR if failed to delete underlying files
  //   INTERNAL_ERROR if internal state is no longer consistent
  ResetResultProto Reset() ICING_LOCKS_EXCLUDED(mutex_);

  // Disallow copy and move. (Declaring the copy operations as deleted also
  // suppresses the implicitly-declared move operations.)
  IcingSearchEngine(const IcingSearchEngine&) = delete;
  IcingSearchEngine& operator=(const IcingSearchEngine&) = delete;

 protected:
  // Constructor that additionally takes ownership of the Filesystem,
  // IcingFilesystem and Clock instances to use.
  // NOTE(review): presumably exists so that subclasses (e.g. test fixtures)
  // can inject fakes for these dependencies - confirm against callers.
  IcingSearchEngine(IcingSearchEngineOptions options,
                    std::unique_ptr<const Filesystem> filesystem,
                    std::unique_ptr<const IcingFilesystem> icing_filesystem,
                    std::unique_ptr<Clock> clock,
                    std::unique_ptr<const JniCache> jni_cache = nullptr);

 private:
  const IcingSearchEngineOptions options_;
  const std::unique_ptr<const Filesystem> filesystem_;
  const std::unique_ptr<const IcingFilesystem> icing_filesystem_;
  bool initialized_ ICING_GUARDED_BY(mutex_) = false;

  // Abstraction for accessing time values.
  const std::unique_ptr<const Clock> clock_;

  // Provides key thresholds that affect the running time and memory of major
  // components in Icing search engine.
  const PerformanceConfiguration performance_configuration_;

  // Used to manage pagination state of query results. Even though
  // ResultStateManager has its own reader-writer lock, mutex_ must still be
  // acquired first in order to adhere to the global lock ordering:
  //   1. mutex_
  //   2. result_state_manager_.lock_
  std::unique_ptr<ResultStateManager> result_state_manager_
      ICING_GUARDED_BY(mutex_);

  // Used to provide reader and writer locks
  absl_ports::shared_mutex mutex_;

  // Stores and processes the schema
  std::unique_ptr<SchemaStore> schema_store_ ICING_GUARDED_BY(mutex_);

  // Used to store all valid documents
  std::unique_ptr<DocumentStore> document_store_ ICING_GUARDED_BY(mutex_);

  std::unique_ptr<const LanguageSegmenter> language_segmenter_
      ICING_GUARDED_BY(mutex_);

  std::unique_ptr<const Normalizer> normalizer_ ICING_GUARDED_BY(mutex_);

  // Storage for all hits of string contents from the document store.
  std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_);

  // Storage for all hits of numeric contents from the document store.
  std::unique_ptr<NumericIndex<int64_t>> integer_index_
      ICING_GUARDED_BY(mutex_);

  // Storage for all join qualified ids from the document store.
  std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_
      ICING_GUARDED_BY(mutex_);

  // Storage for all hits of embedding contents from the document store.
  std::unique_ptr<EmbeddingIndex> embedding_index_ ICING_GUARDED_BY(mutex_);

  // Pointer to JNI class references
  const std::unique_ptr<const JniCache> jni_cache_;

  // Resets all members that are created during Initialize.
  void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Resets all members that are created during Initialize, deletes all
  // underlying files and initializes a fresh index.
  ResetResultProto ResetInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Checks for the existence of the init marker file. If the failed init count
  // exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is
  // initialized from scratch. The updated count (original failed init count +
  // 1) is written to the marker file.
  //
  // Returns:
  //   OK on success
  //   INTERNAL if an IO error occurs while trying to update the marker file.
  libtextclassifier3::Status CheckInitMarkerFile(
      InitializeStatsProto* initialize_stats)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to do the actual work to persist data to disk. We need this
  // separate method so that other public methods don't need to call
  // PersistToDisk(). Public methods calling each other may cause deadlock
  // issues.
  libtextclassifier3::Status InternalPersistToDisk(
      PersistType::Code persist_type) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to do the actual work to Initialize. We need this separate
  // method so that other public methods don't need to call Initialize(). Public
  // methods calling each other may cause deadlock issues.
  InitializeResultProto InternalInitialize()
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to initialize member variables.
  //
  // Returns:
  //   OK on success
  //   FAILED_PRECONDITION if initialize_stats is null
  //   RESOURCE_EXHAUSTED if the index runs out of storage
  //   NOT_FOUND if some Document's schema type is not in the SchemaStore
  //   INTERNAL on any I/O errors
  libtextclassifier3::Status InitializeMembers(
      InitializeStatsProto* initialize_stats)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Do any initialization/recovery necessary to create a SchemaStore instance.
  //
  // Returns:
  //   OK on success
  //   FAILED_PRECONDITION if initialize_stats is null
  //   INTERNAL on I/O error
  libtextclassifier3::Status InitializeSchemaStore(
      InitializeStatsProto* initialize_stats)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Do any initialization/recovery necessary to create a DocumentStore
  // instance.
  //
  // See comments on DocumentStore::Create for explanation of
  // force_recovery_and_revalidate_documents.
  //
  // Returns:
  //   On success, a boolean flag indicating whether derived files of the
  //     document store have been regenerated or not. If true, any other
  //     components depending on them should also be rebuilt.
  //   FAILED_PRECONDITION if initialize_stats is null
  //   INTERNAL on I/O error
  libtextclassifier3::StatusOr<bool> InitializeDocumentStore(
      bool force_recovery_and_revalidate_documents,
      InitializeStatsProto* initialize_stats)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Do any initialization/recovery necessary to create term index, integer
  // index, and qualified id join index instances.
  //
  // If document_store_derived_files_regenerated is true, then we have to
  // rebuild qualified id join index since NamespaceIds were reassigned.
  //
  // Returns:
  //   OK on success
  //   FAILED_PRECONDITION if initialize_stats is null
  //   RESOURCE_EXHAUSTED if the index runs out of storage
  //   NOT_FOUND if some Document's schema type is not in the SchemaStore
  //   INTERNAL on I/O error
  libtextclassifier3::Status InitializeIndex(
      bool document_store_derived_files_regenerated,
      InitializeStatsProto* initialize_stats)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Implementation of IcingSearchEngine::Search that only grabs the overall
  // read-lock, allowing for parallel non-exclusive operations.
  // This implementation is used if search_spec.use_read_only_search is true.
  SearchResultProto SearchLockedShared(const SearchSpecProto& search_spec,
                                       const ScoringSpecProto& scoring_spec,
                                       const ResultSpecProto& result_spec)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Implementation of IcingSearchEngine::Search that requires the overall
  // write lock. No other operations of any kind can be executed in parallel if
  // this version is used.
  // This implementation is used if search_spec.use_read_only_search is false.
  SearchResultProto SearchLockedExclusive(const SearchSpecProto& search_spec,
                                          const ScoringSpecProto& scoring_spec,
                                          const ResultSpecProto& result_spec)
      ICING_LOCKS_EXCLUDED(mutex_);

  // Helper method for the actual work to Search. We need this separate
  // method to manage locking for Search.
  SearchResultProto InternalSearch(const SearchSpecProto& search_spec,
                                   const ScoringSpecProto& scoring_spec,
                                   const ResultSpecProto& result_spec)
      ICING_SHARED_LOCKS_REQUIRED(mutex_);

  // Processes query and scores according to the specs. It is a helper function
  // (called by Search) to process and score normal query and the nested child
  // query for join search.
  //
  // Returns a QueryScoringResults
  //   OK on success with a vector of ScoredDocumentHits,
  //     SectionRestrictQueryTermsMap, and other stats fields for logging.
  //   Any other errors when processing the query or scoring
  struct QueryScoringResults {
    // Outcome of processing and scoring the query.
    libtextclassifier3::Status status;
    SectionRestrictQueryTermsMap query_terms;
    std::vector<ScoredDocumentHit> scored_document_hits;

    // Takes the status by value and the two containers by rvalue reference;
    // all three are moved into the corresponding members.
    explicit QueryScoringResults(
        libtextclassifier3::Status status_in,
        SectionRestrictQueryTermsMap&& query_terms_in,
        std::vector<ScoredDocumentHit>&& scored_document_hits_in)
        : status(std::move(status_in)),
          query_terms(std::move(query_terms_in)),
          scored_document_hits(std::move(scored_document_hits_in)) {}
  };
  QueryScoringResults ProcessQueryAndScore(
      const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
      const ResultSpecProto& result_spec,
      const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
      QueryStatsProto::SearchStats* search_stats)
      ICING_SHARED_LOCKS_REQUIRED(mutex_);

  // Many of the internal components rely on other components' derived data.
  // Check that everything is consistent with each other so that we're not
  // using outdated derived data in some parts of our system.
  //
  // NOTE: this method can be called only at startup time or after
  // PersistToDisk(), otherwise the check could fail due to any changes that are
  // not persisted.
  //
  // Returns:
  //   OK on success
  //   NOT_FOUND if missing header file
  //   INTERNAL_ERROR on any IO errors or if header is inconsistent
  libtextclassifier3::Status CheckConsistency()
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Discards derived data that requires rebuild based on rebuild_result.
  //
  // Returns:
  //   OK on success
  //   FAILED_PRECONDITION_ERROR if those instances are valid (non nullptr)
  //   INTERNAL_ERROR on any I/O errors
  libtextclassifier3::Status DiscardDerivedFiles(
      const version_util::DerivedFilesRebuildResult& rebuild_result)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Repopulates derived data off our ground truths.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on any IO errors
  libtextclassifier3::Status RegenerateDerivedFiles(
      InitializeStatsProto* initialize_stats = nullptr,
      bool log_document_store_stats = false)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Optimizes the DocumentStore by removing any unneeded documents (i.e.
  // deleted, expired, etc.) from the filesystem storage.
  //
  // NOTE: This may leave the DocumentStore in an invalid/uncreated state. Users
  // would need to call Initialize() to reinitialize everything into a valid
  // state.
  //
  // Returns:
  //   On success, OptimizeResult which contains a vector mapping from old
  //   document id to new document id and another vector mapping from old
  //   namespace id to new namespace id. A value of kInvalidDocumentId indicates
  //   that the old document id has been deleted.
  //   ABORTED_ERROR if any error happens before the actual optimization, the
  //                 original document store should be still available
  //   DATA_LOSS_ERROR on errors that could potentially cause data loss,
  //                   document store is still available
  //   INTERNAL_ERROR on any IO errors or other errors that we can't recover
  //                  from
  libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
  OptimizeDocumentStore(OptimizeStatsProto* optimize_stats)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to restore missing document data in index_, integer_index_,
  // and qualified_id_join_index_ (the result also carries a flag for the
  // embedding index). All documents will be reindexed. This does
  // not clear the index, so it is recommended to call ClearAllIndices,
  // ClearSearchIndices, or ClearJoinIndices first if needed.
  //
  // Returns:
  //   On success, OK and a bool per index indicating whether or not
  //   restoration was needed.
  //   DATA_LOSS, if an error during index merging caused us to lose indexed
  //     data in the main index. Despite the data loss, this is still considered
  //     a successful run and needed_restoration will be set to true.
  //   RESOURCE_EXHAUSTED if the index fills up before finishing indexing
  //   NOT_FOUND if some Document's schema type is not in the SchemaStore
  //   INTERNAL_ERROR on any IO errors
  struct IndexRestorationResult {
    libtextclassifier3::Status status;
    bool index_needed_restoration;
    bool integer_index_needed_restoration;
    bool qualified_id_join_index_needed_restoration;
    bool embedding_index_needed_restoration;
  };
  IndexRestorationResult RestoreIndexIfNeeded()
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // If we lost the schema during a previous failure, it may "look" the same as
  // not having a schema set before: we don't have a schema proto file. So do
  // some extra checks to differentiate between having-lost the schema, and
  // never having a schema before. This may determine if we need to do extra
  // recovery steps.
  //
  // Returns:
  //   bool indicating if we had a schema and unintentionally lost it
  //   INTERNAL_ERROR on I/O error
  libtextclassifier3::StatusOr<bool> LostPreviousSchema()
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to create all types of data indexing handlers to index term,
  // integer, and join qualified ids.
  libtextclassifier3::StatusOr<
      std::vector<std::unique_ptr<DataIndexingHandler>>>
  CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to discard parts of (term, integer, qualified id join)
  // indices if they contain data for document ids greater than
  // last_stored_document_id.
  //
  // REQUIRES: last_stored_document_id is valid (!= kInvalidDocumentId). Note:
  //   if we want to truncate everything in the index, then please call
  //   ClearSearchIndices/ClearJoinIndices/ClearAllIndices instead.
  //
  // Returns:
  //   On success, a DocumentId indicating the first document to start for
  //   reindexing and bool flags indicating whether the term, integer,
  //   qualified id join, or embedding index needs restoration.
  //   INTERNAL on any I/O errors
  struct TruncateIndexResult {
    DocumentId first_document_to_reindex;
    bool index_needed_restoration;
    bool integer_index_needed_restoration;
    bool qualified_id_join_index_needed_restoration;
    bool embedding_index_needed_restoration;

    // Copies each argument into the member of the same name (minus the "_in"
    // suffix).
    explicit TruncateIndexResult(
        DocumentId first_document_to_reindex_in,
        bool index_needed_restoration_in,
        bool integer_index_needed_restoration_in,
        bool qualified_id_join_index_needed_restoration_in,
        bool embedding_index_needed_restoration_in)
        : first_document_to_reindex(first_document_to_reindex_in),
          index_needed_restoration(index_needed_restoration_in),
          integer_index_needed_restoration(integer_index_needed_restoration_in),
          qualified_id_join_index_needed_restoration(
              qualified_id_join_index_needed_restoration_in),
          embedding_index_needed_restoration(
              embedding_index_needed_restoration_in) {}
  };
  libtextclassifier3::StatusOr<TruncateIndexResult> TruncateIndicesTo(
      DocumentId last_stored_document_id)
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to discard search (term, integer) indices.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on any I/O errors
  libtextclassifier3::Status ClearSearchIndices()
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to discard join (qualified id) indices.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on any I/O errors
  libtextclassifier3::Status ClearJoinIndices()
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Helper method to discard all search and join indices.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on any I/O errors
  libtextclassifier3::Status ClearAllIndices()
      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_ICING_SEARCH_ENGINE_H_