1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_ 16 #define ICING_SCHEMA_SCHEMA_STORE_H_ 17 18 #include <cstdint> 19 #include <cstring> 20 #include <limits> 21 #include <memory> 22 #include <optional> 23 #include <string> 24 #include <string_view> 25 #include <unordered_map> 26 #include <unordered_set> 27 #include <utility> 28 #include <vector> 29 30 #include "icing/text_classifier/lib3/utils/base/status.h" 31 #include "icing/text_classifier/lib3/utils/base/statusor.h" 32 #include "icing/absl_ports/canonical_errors.h" 33 #include "icing/feature-flags.h" 34 #include "icing/file/file-backed-proto.h" 35 #include "icing/file/filesystem.h" 36 #include "icing/file/version-util.h" 37 #include "icing/proto/debug.pb.h" 38 #include "icing/proto/document.pb.h" 39 #include "icing/proto/logging.pb.h" 40 #include "icing/proto/schema.pb.h" 41 #include "icing/proto/search.pb.h" 42 #include "icing/proto/storage.pb.h" 43 #include "icing/schema/joinable-property.h" 44 #include "icing/schema/schema-type-manager.h" 45 #include "icing/schema/schema-util.h" 46 #include "icing/schema/scorable_property_manager.h" 47 #include "icing/schema/section.h" 48 #include "icing/store/document-filter-data.h" 49 #include "icing/store/key-mapper.h" 50 #include "icing/util/clock.h" 51 #include "icing/util/crc32.h" 52 #include "icing/util/logging.h" 53 #include "icing/util/status-macros.h" 54 55 namespace 
icing {
namespace lib {

// Holds the ground truth schema proto. Tracks compatible changes to the schema
// and will update any derived data based on the schema proto, such as Sections,
// SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
// the most up-to-date data, callers should not save instances themselves and
// should always call Get* from the SchemaStore.
class SchemaStore {
 public:
  struct LegacyHeader {
    // Holds the magic as a quick sanity check against file corruption.
    int32_t magic;

    // Checksum of the SchemaStore's sub-component's checksums.
    uint32_t checksum;
  };

  // In-memory view of the schema store's header file. Wraps the fixed-size
  // SerializedHeader together with the path and file descriptor it is
  // associated with, and tracks whether the in-memory values have been
  // modified (dirty_). Move-only.
  class Header {
   public:
    static constexpr int32_t kMagic = 0x72650d0a;

    // Creates a header holding the default SerializedHeader values for the
    // file at `path`. `filesystem` is not owned.
    //
    // dirty_ is initialized explicitly: it used to be left indeterminate by
    // this constructor, so any read of it before a setter ran was undefined.
    // NOTE(review): `true` is chosen because the default values created here
    // have not been written anywhere yet - confirm against Write().
    explicit Header(const Filesystem* filesystem, std::string path)
        : path_(std::move(path)), filesystem_(filesystem), dirty_(true) {}

    // Takes over the serialized values, path, file descriptor and dirty state
    // of `other`.
    Header(Header&& other) noexcept
        : serialized_header_(std::move(other.serialized_header_)),
          path_(std::move(other.path_)),
          header_fd_(std::move(other.header_fd_)),
          filesystem_(other.filesystem_),
          dirty_(other.dirty_) {}

    Header& operator=(Header&& other) noexcept {
      // Self-move must be a no-op: move-assigning path_ / header_fd_ onto
      // themselves would leave them in an unspecified state.
      if (this != &other) {
        serialized_header_ = std::move(other.serialized_header_);
        path_ = std::move(other.path_);
        header_fd_ = std::move(other.header_fd_);
        filesystem_ = other.filesystem_;
        dirty_ = other.dirty_;
      }
      return *this;
    }

    // Fixed 1024-byte layout of the header (see the static_assert below).
    struct SerializedHeader {
      // Zero-fills both padding arrays so that every byte of the struct has a
      // defined value.
      explicit SerializedHeader()
          : magic(kMagic),
            checksum(0),
            overlay_created(false),
            min_overlay_version_compatibility(
                std::numeric_limits<int32_t>::max()) {
        std::memset(overlay_created_padding, 0, kOverlayCreatedPaddingSize);
        std::memset(padding, 0, kPaddingSize);
      }
      // Holds the magic as a quick sanity check against file corruption.
      int32_t magic;

      // Checksum of the SchemaStore's sub-component's checksums.
      uint32_t checksum;

      bool overlay_created;
      // Three bytes of padding due to the fact that
      // min_overlay_version_compatibility has an alignof() == 4 and the offset
      // of overlay_created_padding == 9.
      static constexpr int kOverlayCreatedPaddingSize = 3;
      uint8_t overlay_created_padding[kOverlayCreatedPaddingSize];

      int32_t min_overlay_version_compatibility;

      static constexpr int kPaddingSize = 1008;
      // Padding exists just to reserve space for additional values.
      uint8_t padding[kPaddingSize];
    };
    static_assert(sizeof(SerializedHeader) == 1024);

    // RETURNS:
    //   - On success, a valid Header instance
    //   - NOT_FOUND if header file doesn't exist
    //   - INTERNAL if unable to read header
    static libtextclassifier3::StatusOr<Header> Read(
        const Filesystem* filesystem, std::string path);

    // Defined out of line.
    libtextclassifier3::Status Write();

    // Defined out of line.
    libtextclassifier3::Status PersistToDisk();

    int32_t magic() const { return serialized_header_.magic; }

    uint32_t checksum() const { return serialized_header_.checksum; }
    // Stores `checksum` and marks the header as modified.
    void set_checksum(uint32_t checksum) {
      dirty_ = true;
      serialized_header_.checksum = checksum;
    }

    bool overlay_created() const { return serialized_header_.overlay_created; }

    int32_t min_overlay_version_compatibility() const {
      return serialized_header_.min_overlay_version_compatibility;
    }

    // Stores both overlay fields and marks the header as modified.
    void SetOverlayInfo(bool overlay_created,
                        int32_t min_overlay_version_compatibility) {
      dirty_ = true;
      serialized_header_.overlay_created = overlay_created;
      serialized_header_.min_overlay_version_compatibility =
          min_overlay_version_compatibility;
    }

   private:
    // Builds a Header from already-known contents; starts out not dirty.
    explicit Header(SerializedHeader serialized_header, std::string path,
                    ScopedFd header_fd, const Filesystem* filesystem)
        : serialized_header_(std::move(serialized_header)),
          path_(std::move(path)),
          header_fd_(std::move(header_fd)),
          filesystem_(filesystem),
          dirty_(false) {}

    SerializedHeader serialized_header_;
    std::string path_;
    ScopedFd header_fd_;
    const Filesystem* filesystem_;  // Not owned.
    // Set to true by every setter; false for headers built by the private
    // constructor.
    bool dirty_;
  };

  // Holds information on what may have been affected by the new schema. This is
  // generally data that other classes may depend on from the SchemaStore,
  // so that we can know if we should go update those classes as well.
  struct SetSchemaResult {
    // Whether we are able to write the schema as determined by SetSchema's
    // arguments. This boolean reflects SetSchema's logic, and does not reflect
    // any system level IO errors that may prevent the schema from being written
    // to file.
    bool success = false;

    // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
    //   1. Schema types are added in the middle of the SchemaProto
    //   2. Schema types are removed from the middle of the SchemaProto
    //   3. Schema types are reordered in the SchemaProto
    //
    // SchemaTypeIds are not changed if schema types are added/removed to the
    // end of the SchemaProto.
    std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;

    // Schema types that have been removed from the new schema. Represented by
    // the `schema_type` field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_deleted_by_name;

    // Schema types that have been removed from the new schema. Represented by
    // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old*
    // schema.
    std::unordered_set<SchemaTypeId> schema_types_deleted_by_id;

    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
    // manner in the new schema. Compatibility determined in
    // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type`
    // field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_incompatible_by_name;

    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
    // manner in the new schema. Compatibility determined in
    // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
    // assigned to this SchemaTypeConfigProto in the *old* schema.
    std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;

    // Schema types that were added in the new schema. Represented by the
    // `schema_type` field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_new_by_name;

    // Schema types that were changed in a way that was backwards compatible and
    // didn't invalidate the index. Represented by the `schema_type` field in
    // the SchemaTypeConfigProto.
    std::unordered_set<std::string>
        schema_types_changed_fully_compatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but invalidated the index. Represented by the `schema_type` field in the
    // SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_index_incompatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but invalidated the joinable cache. Represented by the `schema_type`
    // field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_join_incompatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but inconsistent with the old schema so that the scorable property cache
    // needs to be re-generated.
    std::unordered_set<SchemaTypeId>
        schema_types_scorable_property_inconsistent_by_id;

    // Schema types that were changed in a way that was backwards compatible,
    // but inconsistent with the old schema so that the scorable property cache
    // needs to be re-generated.
    std::unordered_set<std::string>
        schema_types_scorable_property_inconsistent_by_name;
  };

  struct ExpandedTypePropertyMask {
    std::string schema_type;
    std::unordered_set<std::string> paths;
  };

  static constexpr std::string_view kSchemaTypeWildcard = "*";

  static constexpr std::string_view kDefaultEmptySchemaDatabase = "";

  // Factory function to create a SchemaStore which does not take ownership
  // of any input components, and all pointers must refer to valid objects that
  // outlive the created SchemaStore instance. The base_dir must already exist.
  // There does not need to be an existing schema already.
  //
  // If initialize_stats is present, the fields related to SchemaStore will be
  // populated.
  //
  // Returns:
  //   A SchemaStore on success
  //   FAILED_PRECONDITION on any null pointer input
  //   INTERNAL_ERROR on any IO errors
  static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
      const Filesystem* filesystem, const std::string& base_dir,
      const Clock* clock, const FeatureFlags* feature_flags,
      InitializeStatsProto* initialize_stats = nullptr);

  // Migrates schema files (backup v.s. new schema) according to version state
  // change. Also performs schema database migration and populates the database
  // fields in the persisted schema file if necessary.
278 // 279 // Returns: 280 // OK on success or nothing to migrate 281 static libtextclassifier3::Status MigrateSchema( 282 const Filesystem* filesystem, const std::string& base_dir, 283 version_util::StateChange version_state_change, int32_t new_version, 284 bool perform_schema_database_migration); 285 286 // Discards all derived data in the schema store. 287 // 288 // Returns: 289 // OK on success or nothing to discard 290 // INTERNAL_ERROR on any I/O errors 291 static libtextclassifier3::Status DiscardDerivedFiles( 292 const Filesystem* filesystem, const std::string& base_dir); 293 294 SchemaStore(SchemaStore&&) = default; 295 SchemaStore& operator=(SchemaStore&&) = default; 296 297 SchemaStore(const SchemaStore&) = delete; 298 SchemaStore& operator=(const SchemaStore&) = delete; 299 300 // Persists and updates checksum of subcomponents. 301 ~SchemaStore(); 302 303 // Retrieve the current schema if it exists. 304 // 305 // Returns: 306 // - const SchemaProto* if exists 307 // - INTERNAL_ERROR on any IO errors 308 // - NOT_FOUND_ERROR if a schema hasn't been set before 309 libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const; 310 311 // Retrieve the current schema for a given database if it exists. 312 // 313 // This is an expensive operation. Use GetSchema() when retrieving the entire 314 // schema, or if there is only a single database in the schema store. 315 // 316 // Returns: 317 // - A SchemaProto (returned by value, not a pointer) containing only schema types from the database, if exists 318 // - INTERNAL_ERROR on any IO errors 319 // - NOT_FOUND_ERROR if the database doesn't exist in the schema, or if a 320 // schema hasn't been set before 321 libtextclassifier3::StatusOr<SchemaProto> GetSchema( 322 const std::string& database) const; 323 324 // Update our current schema if it's compatible. Does not accept incompatible 325 // schema or schema with types from multiple databases. Compatibility rules 326 // defined by SchemaUtil::ComputeCompatibilityDelta. 
327 // 328 // NOTE: This method is deprecated. Please use 329 // `SetSchema(SetSchemaRequestProto&& set_schema_request)` instead. 330 // 331 // TODO: b/337913932 - Remove this method once all callers (currently only 332 // used in tests) are migrated to the new SetSchema method. 333 libtextclassifier3::StatusOr<SetSchemaResult> SetSchema( 334 SchemaProto new_schema, bool ignore_errors_and_delete_documents); 335 336 // Update our current schema if it's compatible. Does not accept incompatible 337 // schema or schema with types from multiple databases. Compatibility rules 338 // defined by SchemaUtil::ComputeCompatibilityDelta. 339 // 340 // Does not support setting the schema across multiple databases if 341 // `feature_flags_->enable_schema_database()` is true. This means that: 342 // - All types within the new schema must have their `database` field matching 343 // `set_schema_request.database()`. 344 // 345 // If ignore_errors_and_delete_documents is set to true, then incompatible 346 // schema are allowed and we'll force set the schema, meaning 347 // SetSchemaResult.success will always be true. 348 // 349 // Returns: 350 // - SetSchemaResult that encapsulates the differences between the old and 351 // new schema, as well as if the new schema can be set. 352 // - INTERNAL_ERROR on any IO errors 353 // - ALREADY_EXISTS_ERROR if type names in the new schema are already in use 354 // by a different database. 355 // - INVALID_ARGUMENT_ERROR if the schema is invalid. This can happen if 356 // the schema is malformed, if the new schema contains types where the 357 // database field does not match the database field in the 358 // set_schema_request. 359 libtextclassifier3::StatusOr<SetSchemaResult> SetSchema( 360 SetSchemaRequestProto&& set_schema_request); 361 362 // Get the SchemaTypeConfigProto of schema_type name. 
363 // 364 // Returns: 365 // Pointer to the SchemaTypeConfigProto on success 366 // FAILED_PRECONDITION if schema hasn't been set yet 367 // NOT_FOUND if schema type name doesn't exist 368 // INTERNAL on any I/O errors 369 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> 370 GetSchemaTypeConfig(std::string_view schema_type) const; 371 372 // Gets a map from each schema_type name to its blob property paths. 373 // 374 // Returns: 375 // A map from each schema_type name to its blob property paths on success 376 // FAILED_PRECONDITION if schema hasn't been set yet 377 // INTERNAL on any I/O errors 378 libtextclassifier3::StatusOr< 379 std::unordered_map<std::string, std::vector<std::string>>> 380 ConstructBlobPropertyMap() const; 381 382 // Returns the schema type of the passed in SchemaTypeId 383 // 384 // Returns: 385 // Pointer to the schema type name on success 386 // FAILED_PRECONDITION if schema hasn't been set yet 387 // INVALID_ARGUMENT if schema type id is invalid 388 libtextclassifier3::StatusOr<const std::string*> GetSchemaType( 389 SchemaTypeId schema_type_id) const; 390 391 // Returns the SchemaTypeId of the passed in schema type 392 // 393 // Returns: 394 // SchemaTypeId on success 395 // FAILED_PRECONDITION if schema hasn't been set yet 396 // NOT_FOUND_ERROR if we don't know about the schema type 397 // INTERNAL_ERROR on IO error 398 libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId( 399 std::string_view schema_type) const; 400 401 // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also 402 // include child types. 
403 // 404 // Returns: 405 // A set of SchemaTypeId on success 406 // FAILED_PRECONDITION if schema hasn't been set yet 407 // NOT_FOUND_ERROR if we don't know about the schema type 408 // INTERNAL_ERROR on IO error 409 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*> 410 GetSchemaTypeIdsWithChildren(std::string_view schema_type) const; 411 412 // Returns the SectionMetadata associated with the SectionId that's in the 413 // SchemaTypeId. 414 // 415 // Returns: 416 // Valid pointer to SectionMetadata on success 417 // FAILED_PRECONDITION if schema hasn't been set yet 418 // INVALID_ARGUMENT if schema type id or section id is invalid 419 libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( 420 SchemaTypeId schema_type_id, SectionId section_id) const; 421 422 // Returns true if a property is defined in the said schema, regardless of 423 // whether it is indexed or not. 424 bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id, 425 const std::string& property) const; 426 427 // Extracts all sections of different types from the given document and group 428 // them by type. 429 // - Each Section vector is sorted by section Id in ascending order. The 430 // sorted section ids may not be continuous, since not all sections are 431 // present in the document. 432 // - Sections with empty content won't be returned. 433 // - For example, we may extract: 434 // string_sections: [2, 7, 10] 435 // integer_sections: [3, 5, 8] 436 // 437 // Returns: 438 // A SectionGroup instance on success 439 // FAILED_PRECONDITION if schema hasn't been set yet 440 // NOT_FOUND if type config name of document not found 441 libtextclassifier3::StatusOr<SectionGroup> ExtractSections( 442 const DocumentProto& document) const; 443 444 // Returns the JoinablePropertyMetadata associated with property_path that's 445 // in the SchemaTypeId. 
446 // 447 // Returns: 448 // Valid pointer to JoinablePropertyMetadata on success 449 // nullptr if property_path doesn't exist (or is not joinable) in the 450 // joinable metadata list of the schema 451 // FAILED_PRECONDITION if schema hasn't been set yet 452 // INVALID_ARGUMENT if schema type id is invalid 453 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> 454 GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, 455 const std::string& property_path) const; 456 457 // Returns the JoinablePropertyMetadata associated with joinable_property_id 458 // that's in the SchemaTypeId. 459 // 460 // Returns: 461 // Valid pointer to JoinablePropertyMetadata on success 462 // FAILED_PRECONDITION if schema hasn't been set yet 463 // INVALID_ARGUMENT if schema type id or joinable property id is invalid 464 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> 465 GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, 466 JoinablePropertyId joinable_property_id) const; 467 468 // Extracts all joinable property contents of different types from the given 469 // document and group them by joinable value type. 470 // - Joinable properties are sorted by joinable property id in ascending 471 // order. The sorted joinable property ids may not be continuous, since not 472 // all joinable properties are present in the document. 473 // - Joinable property ids start from 0. 474 // - Joinable properties with empty content won't be returned. 475 // 476 // Returns: 477 // A JoinablePropertyGroup instance on success 478 // FAILED_PRECONDITION if schema hasn't been set yet 479 // NOT_FOUND if the type config name of document not found 480 libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties( 481 const DocumentProto& document) const; 482 483 // Returns the quantization type for the given schema_type_id and section_id. 484 // 485 // Returns: 486 // - The quantization type on success. 
487 // - INVALID_ARGUMENT_ERROR if schema_type_id or section_id is invalid. 488 // - Any error from schema store. 489 libtextclassifier3::StatusOr<EmbeddingIndexingConfig::QuantizationType::Code> GetQuantizationType(SchemaTypeId schema_type_id,SectionId section_id)490 GetQuantizationType(SchemaTypeId schema_type_id, SectionId section_id) const { 491 ICING_ASSIGN_OR_RETURN(const SectionMetadata* section_metadata, 492 GetSectionMetadata(schema_type_id, section_id)); 493 return section_metadata->quantization_type; 494 } 495 496 // Syncs all the data changes to disk. 497 // 498 // Returns: 499 // OK on success 500 // INTERNAL on I/O errors. 501 libtextclassifier3::Status PersistToDisk(); 502 503 // Recomputes the combined checksum of components of the schema store and 504 // updates the header. 505 // 506 // Returns: 507 // - the checksum on success 508 // - INTERNAL on I/O errors. 509 libtextclassifier3::StatusOr<Crc32> UpdateChecksum(); 510 511 // Recomputes the combined checksum of components of the schema store. Does 512 // NOT update the header. 513 // 514 // Returns: 515 // - the checksum on success 516 // - INTERNAL on I/O errors. 517 libtextclassifier3::StatusOr<Crc32> GetChecksum() const; 518 519 // Returns: 520 // - On success, the section metadata list for the specified schema type 521 // - NOT_FOUND if the schema type is not present in the schema 522 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> 523 GetSectionMetadata(const std::string& schema_type) const; 524 525 // Gets the index of the given |property_path|, where the index N means that 526 // it is the Nth scorable property path in the schema config of the given 527 // |schema_type_id|, in lexicographical order. 
528 // 529 // Returns: 530 // - Index on success 531 // - std::nullopt if the |property_path| doesn't point to a scorable 532 // property under the |schema_type_id| 533 // - FAILED_PRECONDITION if the schema hasn't been set yet 534 // - INVALID_ARGUMENT if |schema_type_id| is invalid 535 libtextclassifier3::StatusOr<std::optional<int>> GetScorablePropertyIndex( 536 SchemaTypeId schema_type_id, std::string_view property_path) const; 537 538 // Returns the list of ScorablePropertyInfo for the given |schema_type_id|, 539 // in lexicographical order of its property path. 540 // 541 // Returns: 542 // - Vector of scorable property info on success. The vector can be empty 543 // if no scorable property is found under the schema config of 544 // |schema_type_id|. 545 // - FAILED_PRECONDITION if the schema hasn't been set yet 546 // - INVALID_ARGUMENT if |schema_type_id| is invalid 547 libtextclassifier3::StatusOr< 548 const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*> 549 GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const; 550 551 // Calculates the StorageInfo for the Schema Store. 552 // 553 // If an IO error occurs while trying to calculate the value for a field, then 554 // that field will be set to -1. 555 SchemaStoreStorageInfoProto GetStorageInfo() const; 556 557 // Get debug information for the schema store. 558 // 559 // Returns: 560 // SchemaDebugInfoProto on success 561 // INTERNAL_ERROR on IO errors, crc compute error 562 libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const; 563 564 // Expands the provided type_property_masks into a vector of 565 // ExpandedTypePropertyMasks to account for polymorphism. If both a parent 566 // type and one of its child type appears in the masks, the parent type's 567 // paths will be merged into the child's. 
568 // 569 // For example, assume that we have two schema types A and B, and we have 570 // - A is the parent type of B 571 // - Paths of A: {P1, P2} 572 // - Paths of B: {P3} 573 // 574 // Then, we will have the following in the result. 575 // - Expanded paths of A: {P1, P2} 576 // - Expanded paths of B: {P1, P2, P3} 577 std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks( 578 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks) 579 const; 580 581 private: 582 // Factory function to create a SchemaStore and set its schema. The created 583 // instance does not take ownership of any input components and all pointers 584 // must refer to valid objects that outlive the created SchemaStore instance. 585 // The base_dir must already exist. No schema may have been set in base_dir prior 586 // to this. 587 // 588 // Returns: 589 // A SchemaStore on success 590 // FAILED_PRECONDITION on any null pointer input or if there has already 591 // been a schema set for this path. 592 // INTERNAL_ERROR on any IO errors 593 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 594 const Filesystem* filesystem, const std::string& base_dir, 595 const Clock* clock, const FeatureFlags* feature_flags, 596 SchemaProto schema); 597 598 // Use SchemaStore::Create instead. 599 explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, 600 const Clock* clock, const FeatureFlags* feature_flags); 601 602 // Deletes the overlay schema and ensures that the Header is correctly set. 603 // 604 // RETURNS: 605 // OK on success 606 // INTERNAL_ERROR on any IO errors 607 static libtextclassifier3::Status DiscardOverlaySchema( 608 const Filesystem* filesystem, const std::string& base_dir, 609 Header& header); 610 611 // Handles the overlay schema after a version change by deleting it if it is 612 // no longer compatible with the new version. 613 // 614 // Requires: base_dir exists. 
615 // 616 // Returns: 617 // OK on success 618 // INTERNAL_ERROR on any IO errors 619 static libtextclassifier3::Status HandleOverlaySchemaForVersionChange( 620 const Filesystem* filesystem, const std::string& base_dir, 621 version_util::StateChange version_state_change, int32_t new_version); 622 623 // Populates the schema database field in the schema proto that is stored in 624 // the input schema file. 625 // 626 // Returns: 627 // OK on success or nothing to migrate 628 // INTERNAL_ERROR on IO error 629 static libtextclassifier3::Status PopulateSchemaDatabaseFieldForSchemaFile( 630 const Filesystem* filesystem, const std::string& schema_filename); 631 632 // Verifies that there is no error retrieving a previously set schema. Then 633 // initializes like normal. 634 // 635 // Returns: 636 // OK on success 637 // INTERNAL_ERROR on IO error 638 libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); 639 640 // First, blindly writes new_schema to the schema_file. Then initializes like 641 // normal. 642 // 643 // Returns: 644 // OK on success 645 // INTERNAL_ERROR on IO error 646 // FAILED_PRECONDITION if there is already a schema set for the schema_file. 647 libtextclassifier3::Status Initialize(SchemaProto new_schema); 648 649 // Handles initializing the SchemaStore and regenerating any data if needed. 650 // 651 // Returns: 652 // OK on success 653 // INTERNAL_ERROR on IO error 654 libtextclassifier3::Status InitializeInternal( 655 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats); 656 657 // Creates sub-components and verifies the integrity of each sub-component. 658 // 659 // Returns: 660 // OK on success 661 // INTERNAL_ERROR on IO error 662 libtextclassifier3::Status InitializeDerivedFiles(); 663 664 // Populates any derived data structures off of the schema. 
665 // 666 // Returns: 667 // OK on success 668 // NOT_FOUND_ERROR if a schema proto has not been set 669 // INTERNAL_ERROR on any IO errors 670 libtextclassifier3::Status RegenerateDerivedFiles( 671 bool create_overlay_if_necessary); 672 673 // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_. 674 // 675 // Returns: 676 // OK on success 677 // NOT_FOUND_ERROR if a schema proto has not been set 678 // INTERNAL_ERROR on any IO errors 679 libtextclassifier3::Status BuildInMemoryCache(); 680 681 // Update and replace the header file. Creates the header file if it doesn't 682 // exist. 683 // 684 // Returns: 685 // OK on success 686 // INTERNAL on I/O error 687 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 688 689 // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying 690 // file, and re-creates a new instance of the schema_type_mapper_. Does not 691 // populate the schema_type_mapper_. 692 // 693 // Returns any IO errors. 694 libtextclassifier3::Status ResetSchemaTypeMapper(); 695 696 // Creates a new schema store with new_schema and then swaps that new schema 697 // store with the existing one. This function guarantees that either: this 698 // instance will be fully updated to the new schema or no changes will take 699 // effect. 700 // 701 // Returns: 702 // OK on success 703 // INTERNAL on I/O error. 704 libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema); 705 CheckSchemaSet()706 libtextclassifier3::Status CheckSchemaSet() const { 707 return has_schema_successfully_set_ 708 ? libtextclassifier3::Status::OK 709 : absl_ports::FailedPreconditionError("Schema not set yet."); 710 } 711 712 // Correctly loads the Header, schema_file_ and (if present) the 713 // overlay_schema_file_. 714 // 715 // If feature_flags_->release_backup_schema_file_after_initialization() is 716 // true, then schema_file_ will be released if the overlay_schema_file_ is 717 // present. 
718 // 719 // RETURNS: 720 // - OK on success 721 // - INTERNAL if an IO error is encountered when reading the Header or 722 // schemas. 723 // Or an invalid schema configuration is present. 724 libtextclassifier3::Status LoadSchema(); 725 726 // Resets the schema_file_'s cached FileBackedProto instance if needed. 727 // 728 // This is the case if the overlay_schema_file_ is present and 729 // feature_flags_->release_backup_schema_file_if_overlay_present is true. ResetSchemaFileIfNeeded()730 void ResetSchemaFileIfNeeded() { 731 if (feature_flags_->release_backup_schema_file_if_overlay_present() && 732 overlay_schema_file_ != nullptr) { 733 ICING_VLOG(2) 734 << "Freeing schema store's base schema file's " 735 "FileBackedProto instance since overlay_schema_file_ is present."; 736 schema_file_.ReleaseCachedSchemaFile(); 737 } 738 } 739 740 // Sets the schema for a database for the first time. 741 // 742 // Note that when schema database is disabled, this function sets the entire 743 // schema, with all types under the default empty database. 744 // 745 // Requires: 746 // - `new_schema` is valid according to `ValidateSchemaDatabase' 747 // 748 // Returns: 749 // - SetSchemaResult that indicates if the new schema can be set. 750 // - INTERNAL_ERROR on any IO errors. 751 // - INVALID_ARGUMENT_ERROR if the new schema is invalid. 752 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult> 753 SetInitialSchemaForDatabase(SchemaProto new_schema, 754 const std::string& database, 755 bool ignore_errors_and_delete_documents); 756 757 // Sets the schema for a database, overriding any existing schema for that 758 // database. 759 // 760 // Note that when schema database is disabled, this function sets and 761 // overrides the entire schema. 762 // 763 // Requires: 764 // - `new_schema` and `database` are valid according to 765 // `ValidateSchemaDatabase(new_schema, database)` 766 // - Types in `new_schema` and `old_schema` all belong to the provided 767 // database. 
//   - The old schema is guaranteed to contain types from exactly one
//     database when schema database is enabled, because it was obtained
//     using `GetSchema(database)`.
//
// Returns:
//   - SetSchemaResult that encapsulates the differences between the old and
//     new schema, as well as if the new schema can be set.
//   - INTERNAL_ERROR on any IO errors.
//   - INVALID_ARGUMENT_ERROR if the schema is invalid, or if there are
//     mismatches between the schema databases.
libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchemaWithDatabaseOverride(SchemaProto new_schema,
                              const SchemaProto& old_schema,
                              const std::string& database,
                              bool ignore_errors_and_delete_documents);

// Initial validation on the SchemaProto for SetSchema. This is intended as a
// preliminary check before any expensive operations are performed during
// `SetSchema::Validate`. Returns the schema's database if it's valid.
//
// Note that when schema database is disabled, any schema input is valid and
// an empty string is returned as the database.
//
// NOTE(review): the declaration of ValidateSchemaDatabase returns a plain
// Status, not a database string, so the two "returns the database" sentences
// above look stale -- confirm and update.
//
// Checks that:
// - The new schema only contains types from a single database, which matches
//   the provided database.
// - The schema's type names are not already in use in other databases. This
//   is done outside of `SchemaUtil::Validate` because we need to know all
//   existing type names, which are stored in the SchemaStore and not known to
//   SchemaUtil.
//
// Returns:
//   - OK on success
//   - INVALID_ARGUMENT_ERROR if new_schema.types's databases do not match the
//     provided database.
//   - ALREADY_EXISTS_ERROR if new_schema's type names are not unique
libtextclassifier3::Status ValidateSchemaDatabase(
    const SchemaProto& new_schema, const std::string& database) const;

// Returns a SchemaProto representing the full schema, which is a combination
// of the existing schema and the input database schema. Deletes all types
// belonging to the specified database if input_database_schema is an empty
// proto.
//
// For the database being updated by the input database schema:
//   - If the existing schema does not contain the database, the input types
//     are appended to the end of the SchemaProto, without changing the order
//     of the existing schema types.
//   - Otherwise, the existing schema types are replaced with types from the
//     input database schema in their original position in the existing
//     SchemaProto.
//     - Types from input_database_schema are added in the order in which they
//       appear.
//     - If more types are added to the database, the additional types are
//       appended at the end of the SchemaProto, without changing the order of
//       existing types from unaffected databases.
//
// Requires:
//   - input_database_schema is valid according to `ValidateSchemaDatabase`
//     and `SchemaUtil::Validate`.
//
// Returns:
//   - SchemaProto on success
//   - INTERNAL_ERROR on any IO errors, or if the schema store was not
//     previously initialized properly.
//   - INVALID_ARGUMENT_ERROR if the input schema contains types from multiple
//     databases.
libtextclassifier3::StatusOr<SchemaProto> GetFullSchemaProtoWithUpdatedDb(
    SchemaProto input_database_schema,
    const std::string& database_to_update) const;

const Filesystem* filesystem_;
std::string base_dir_;
const Clock* clock_;
const FeatureFlags* feature_flags_;  // Does not own.
843 844 // Used internally to indicate whether the class has been successfully 845 // initialized with a valid schema. Will be false if Initialize failed or no 846 // schema has ever been set. 847 bool has_schema_successfully_set_ = false; 848 849 // Wrapper class to store a cached schema file FileBackedProto instance and 850 // its checksum. 851 class SchemaFileCache { 852 public: SchemaFileCache(const Filesystem * filesystem,const std::string & schema_file_path)853 explicit SchemaFileCache(const Filesystem* filesystem, 854 const std::string& schema_file_path) 855 : filesystem_(filesystem), schema_file_path_(schema_file_path) {} 856 // Returns a reference to the proto read from the schema FileBackedProto. 857 // 858 // NOTE: The caller does NOT get ownership of the object returned and 859 // the returned object is only valid till a new version of the proto is 860 // written to the file. 861 // 862 // Returns NOT_FOUND if the file was empty or never written to. 863 // Returns INTERNAL_ERROR if an IO error or a corruption was encountered. Read()864 libtextclassifier3::StatusOr<const SchemaProto*> Read() { 865 return GetCachedSchemaFile().Read(); 866 } 867 868 // Writes the new schema_proto to schema_file_ and updates the cached 869 // checksum. 870 // 871 // Returns: INTERNAL_ERROR if any IO error is encountered. Write(std::unique_ptr<SchemaProto> schema_proto)872 libtextclassifier3::Status Write( 873 std::unique_ptr<SchemaProto> schema_proto) { 874 ICING_RETURN_IF_ERROR( 875 GetCachedSchemaFile().Write(std::move(schema_proto))); 876 ICING_ASSIGN_OR_RETURN(Crc32 checksum, 877 GetCachedSchemaFile().GetChecksum()); 878 checksum_ = std::make_unique<Crc32>(checksum); 879 return libtextclassifier3::Status::OK; 880 } 881 882 // Sets the swapped_to_file_path for the cached schema_file_ instance and 883 // the schema_file_path_. 
SetSwappedFilepath(std::string new_schema_file_path)884 void SetSwappedFilepath(std::string new_schema_file_path) { 885 if (schema_file_ != nullptr) { 886 schema_file_->SetSwappedFilepath(new_schema_file_path); 887 } 888 schema_file_path_ = std::move(new_schema_file_path); 889 } 890 891 // Releases the cached schema_file_ FileBackedProto instance. ReleaseCachedSchemaFile()892 void ReleaseCachedSchemaFile() { schema_file_.reset(); } 893 GetChecksum()894 libtextclassifier3::StatusOr<Crc32> GetChecksum() { 895 if (checksum_ == nullptr) { 896 ICING_ASSIGN_OR_RETURN(Crc32 checksum, 897 GetCachedSchemaFile().GetChecksum()); 898 checksum_ = std::make_unique<Crc32>(std::move(checksum)); 899 } 900 return *checksum_; 901 } 902 903 private: GetCachedSchemaFile()904 FileBackedProto<SchemaProto>& GetCachedSchemaFile() { 905 if (schema_file_ == nullptr) { 906 schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>( 907 *filesystem_, schema_file_path_); 908 } 909 return *schema_file_; 910 } 911 912 const Filesystem* filesystem_; 913 std::string schema_file_path_; 914 std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_; 915 std::unique_ptr<Crc32> checksum_; 916 }; 917 918 // Caches a FileBackedProto instance and the checksum for the schema file. 919 // 920 // If the overlay_schema_file_ is present and 921 // feature_flags_->release_backup_schema_file_if_overlay_present is true, then 922 // the cached schema FileBackedProto instance should be released and reloaded 923 // only during mutating SetSchema operations. 924 mutable SchemaFileCache schema_file_; 925 926 // This schema holds the definition of any schema types that are not 927 // compatible with older versions of Icing code. 928 std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_; 929 930 // Maps schema types to a densely-assigned unique id. 931 std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_; 932 933 // Maps schema type ids to the corresponding schema type. 
// This is an inverse map of schema_type_mapper_.
std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_;

// A hash map of (database -> vector of type config names in the database).
//
// We use a vector instead of a set because we need to preserve the order of
// the types (i.e. the order in which they appear in the input SchemaProto
// during SetSchema), so that we can return the correct SchemaProto for
// GetSchema.
//
// This keeps track of the type configs defined in each database, which allows
// schema operations to be performed on a per-database basis.
std::unordered_map<std::string, std::vector<std::string>> database_type_map_;

// A hash map of (type config name -> type config), allows faster lookup of
// type config in schema. The O(1) type config access makes schema-related and
// section-related operations faster.
SchemaUtil::TypeConfigMap type_config_map_;

// Maps from each type id to all of its subtype ids.
// T2 is a subtype of T1, if and only if one of the following conditions is
// met:
// - T2 is T1
// - T2 extends T1
// - There exists a type U, such that T2 is a subtype of U, and U is a subtype
//   of T1
std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>>
    schema_subtype_id_map_;

// Manager of section (indexable property) and joinable property related
// metadata for all Schemas.
std::unique_ptr<const SchemaTypeManager> schema_type_manager_;

// Used to cache and manage the schema's scorable properties.
std::unique_ptr<ScorablePropertyManager> scorable_property_manager_;

// NOTE(review): undocumented in the original; presumably the in-memory copy
// of the header file that UpdateHeader() replaces -- confirm against the
// implementation.
std::unique_ptr<Header> header_;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_SCHEMA_SCHEMA_STORE_H_