1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_ 16 #define ICING_SCHEMA_SCHEMA_STORE_H_ 17 18 #include <cstdint> 19 #include <cstring> 20 #include <limits> 21 #include <memory> 22 #include <string> 23 #include <string_view> 24 #include <unordered_map> 25 #include <unordered_set> 26 #include <vector> 27 28 #include "icing/text_classifier/lib3/utils/base/status.h" 29 #include "icing/text_classifier/lib3/utils/base/statusor.h" 30 #include "icing/absl_ports/canonical_errors.h" 31 #include "icing/file/file-backed-proto.h" 32 #include "icing/file/filesystem.h" 33 #include "icing/file/version-util.h" 34 #include "icing/proto/debug.pb.h" 35 #include "icing/proto/document.pb.h" 36 #include "icing/proto/logging.pb.h" 37 #include "icing/proto/schema.pb.h" 38 #include "icing/proto/search.pb.h" 39 #include "icing/proto/storage.pb.h" 40 #include "icing/schema/joinable-property.h" 41 #include "icing/schema/schema-type-manager.h" 42 #include "icing/schema/schema-util.h" 43 #include "icing/schema/section.h" 44 #include "icing/store/document-filter-data.h" 45 #include "icing/store/key-mapper.h" 46 #include "icing/util/clock.h" 47 #include "icing/util/crc32.h" 48 49 namespace icing { 50 namespace lib { 51 52 // Holds the ground truth schema proto. Tracks compatible changes to the schema 53 // and will update any derived data based on the schema proto, such as Sections, 54 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have 55 // the most up-to-date data, callers should not save instances themselves and 56 // should always call Get* from the SchemaStore. 57 class SchemaStore { 58 public: 59 struct LegacyHeader { 60 // Holds the magic as a quick sanity check against file corruption. 61 int32_t magic; 62 63 // Checksum of the SchemaStore's sub-component's checksums. 64 uint32_t checksum; 65 }; 66 67 class Header { 68 public: 69 static constexpr int32_t kMagic = 0x72650d0a; 70 Header()71 explicit Header() 72 : magic_(kMagic), 73 checksum_(0), 74 overlay_created_(false), 75 min_overlay_version_compatibility_( 76 std::numeric_limits<int32_t>::max()) { 77 memset(overlay_created_padding_, 0, kOverlayCreatedPaddingSize); 78 memset(padding_, 0, kPaddingSize); 79 } 80 81 // RETURNS: 82 // - On success, a valid Header instance 83 // - NOT_FOUND if header file doesn't exist 84 // - INTERNAL if unable to read header 85 static libtextclassifier3::StatusOr<Header> Read( 86 const Filesystem* filesystem, const std::string& path); 87 88 libtextclassifier3::Status Write(const Filesystem* filesystem, 89 const std::string& path); 90 magic()91 int32_t magic() const { return magic_; } 92 checksum()93 uint32_t checksum() const { return checksum_; } set_checksum(uint32_t checksum)94 void set_checksum(uint32_t checksum) { checksum_ = checksum; } 95 overlay_created()96 bool overlay_created() const { return overlay_created_; } 97 min_overlay_version_compatibility()98 int32_t min_overlay_version_compatibility() const { 99 return min_overlay_version_compatibility_; 100 } 101 SetOverlayInfo(bool overlay_created,int32_t min_overlay_version_compatibility)102 void SetOverlayInfo(bool overlay_created, 103 int32_t min_overlay_version_compatibility) { 104 overlay_created_ = overlay_created; 105 min_overlay_version_compatibility_ = min_overlay_version_compatibility; 106 } 107 108 private: 109 // Holds the magic as a quick sanity check against file corruption. 110 int32_t magic_; 111 112 // Checksum of the SchemaStore's sub-component's checksums. 113 uint32_t checksum_; 114 115 bool overlay_created_; 116 // Three bytes of padding due to the fact that 117 // min_overlay_version_compatibility_ has an alignof() == 4 and the offset 118 // of overlay_created_padding_ == 9. 119 static constexpr int kOverlayCreatedPaddingSize = 3; 120 uint8_t overlay_created_padding_[kOverlayCreatedPaddingSize]; 121 122 int32_t min_overlay_version_compatibility_; 123 124 static constexpr int kPaddingSize = 1008; 125 // Padding exists just to reserve space for additional values. 126 uint8_t padding_[kPaddingSize]; 127 }; 128 static_assert(sizeof(Header) == 1024); 129 130 // Holds information on what may have been affected by the new schema. This is 131 // generally data that other classes may depend on from the SchemaStore, 132 // so that we can know if we should go update those classes as well. 133 struct SetSchemaResult { 134 // Whether we are able to write the schema as determined by SetSchema's 135 // arguments. This boolean reflects SetSchema's logic, and does not reflect 136 // any system level IO errors that may prevent the schema from being written 137 // to file. 138 bool success = false; 139 140 // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if: 141 // 1. Schema types are added in the middle of the SchemaProto 142 // 2. Schema types are removed from the middle of the SchemaProto 143 // 3. Schema types are reordered in the SchemaProto 144 // 145 // SchemaTypeIds are not changed if schema types are added/removed to the 146 // end of the SchemaProto. 147 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed; 148 149 // Schema types that have been removed from the new schema. Represented by 150 // the `schema_type` field in the SchemaTypeConfigProto. 151 std::unordered_set<std::string> schema_types_deleted_by_name; 152 153 // Schema types that have been removed from the new schema. Represented by 154 // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old* 155 // schema. 156 std::unordered_set<SchemaTypeId> schema_types_deleted_by_id; 157 158 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 159 // manner in the new schema. Compatibility determined in 160 // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type` 161 // field in the SchemaTypeConfigProto. 162 std::unordered_set<std::string> schema_types_incompatible_by_name; 163 164 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 165 // manner in the new schema. Compatibility determined in 166 // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId 167 // assigned to this SchemaTypeConfigProto in the *old* schema. 168 std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id; 169 170 // Schema types that were added in the new schema. Represented by the 171 // `schema_type` field in the SchemaTypeConfigProto. 172 std::unordered_set<std::string> schema_types_new_by_name; 173 174 // Schema types that were changed in a way that was backwards compatible and 175 // didn't invalidate the index. Represented by the `schema_type` field in 176 // the SchemaTypeConfigProto. 177 std::unordered_set<std::string> 178 schema_types_changed_fully_compatible_by_name; 179 180 // Schema types that were changed in a way that was backwards compatible, 181 // but invalidated the index. Represented by the `schema_type` field in the 182 // SchemaTypeConfigProto. 183 std::unordered_set<std::string> schema_types_index_incompatible_by_name; 184 185 // Schema types that were changed in a way that was backwards compatible, 186 // but invalidated the joinable cache. Represented by the `schema_type` 187 // field in the SchemaTypeConfigProto. 188 std::unordered_set<std::string> schema_types_join_incompatible_by_name; 189 }; 190 191 struct ExpandedTypePropertyMask { 192 std::string schema_type; 193 std::unordered_set<std::string> paths; 194 }; 195 196 static constexpr std::string_view kSchemaTypeWildcard = "*"; 197 198 // Factory function to create a SchemaStore which does not take ownership 199 // of any input components, and all pointers must refer to valid objects that 200 // outlive the created SchemaStore instance. The base_dir must already exist. 201 // There does not need to be an existing schema already. 202 // 203 // If initialize_stats is present, the fields related to SchemaStore will be 204 // populated. 205 // 206 // Returns: 207 // A SchemaStore on success 208 // FAILED_PRECONDITION on any null pointer input 209 // INTERNAL_ERROR on any IO errors 210 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 211 const Filesystem* filesystem, const std::string& base_dir, 212 const Clock* clock, InitializeStatsProto* initialize_stats = nullptr); 213 214 // Migrates schema files (backup v.s. new schema) according to version state 215 // change. 216 // 217 // Returns: 218 // OK on success or nothing to migrate 219 static libtextclassifier3::Status MigrateSchema( 220 const Filesystem* filesystem, const std::string& base_dir, 221 version_util::StateChange version_state_change, int32_t new_version); 222 223 // Discards all derived data in the schema store. 224 // 225 // Returns: 226 // OK on success or nothing to discard 227 // INTERNAL_ERROR on any I/O errors 228 static libtextclassifier3::Status DiscardDerivedFiles( 229 const Filesystem* filesystem, const std::string& base_dir); 230 231 SchemaStore(SchemaStore&&) = default; 232 SchemaStore& operator=(SchemaStore&&) = default; 233 234 SchemaStore(const SchemaStore&) = delete; 235 SchemaStore& operator=(const SchemaStore&) = delete; 236 237 // Persists and updates checksum of subcomponents. 238 ~SchemaStore(); 239 240 // Retrieve the current schema if it exists. 241 // 242 // Returns: 243 // SchemaProto* if exists 244 // INTERNAL_ERROR on any IO errors 245 // NOT_FOUND_ERROR if a schema hasn't been set before 246 libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const; 247 248 // Update our current schema if it's compatible. Does not accept incompatible 249 // schema. Compatibility rules defined by 250 // SchemaUtil::ComputeCompatibilityDelta. 251 // 252 // If ignore_errors_and_delete_documents is set to true, then incompatible 253 // schema are allowed and we'll force set the schema, meaning 254 // SetSchemaResult.success will always be true. 255 // 256 // Returns: 257 // SetSchemaResult that encapsulates the differences between the old and new 258 // schema, as well as if the new schema can be set. 259 // INTERNAL_ERROR on any IO errors 260 libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema( 261 const SchemaProto& new_schema, 262 bool ignore_errors_and_delete_documents, 263 bool allow_circular_schema_definitions); 264 libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema( 265 SchemaProto&& new_schema, 266 bool ignore_errors_and_delete_documents, 267 bool allow_circular_schema_definitions); 268 269 // Get the SchemaTypeConfigProto of schema_type name. 270 // 271 // Returns: 272 // SchemaTypeConfigProto on success 273 // FAILED_PRECONDITION if schema hasn't been set yet 274 // NOT_FOUND if schema type name doesn't exist 275 // INTERNAL on any I/O errors 276 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> 277 GetSchemaTypeConfig(std::string_view schema_type) const; 278 279 // Returns the schema type of the passed in SchemaTypeId 280 // 281 // Returns: 282 // schema type on success 283 // FAILED_PRECONDITION if schema hasn't been set yet 284 // INVALID_ARGUMENT if schema type id is invalid 285 libtextclassifier3::StatusOr<const std::string*> GetSchemaType( 286 SchemaTypeId schema_type_id) const; 287 288 // Returns the SchemaTypeId of the passed in schema type 289 // 290 // Returns: 291 // SchemaTypeId on success 292 // FAILED_PRECONDITION if schema hasn't been set yet 293 // NOT_FOUND_ERROR if we don't know about the schema type 294 // INTERNAL_ERROR on IO error 295 libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId( 296 std::string_view schema_type) const; 297 298 // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also 299 // include child types. 300 // 301 // Returns: 302 // A set of SchemaTypeId on success 303 // FAILED_PRECONDITION if schema hasn't been set yet 304 // NOT_FOUND_ERROR if we don't know about the schema type 305 // INTERNAL_ERROR on IO error 306 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*> 307 GetSchemaTypeIdsWithChildren(std::string_view schema_type) const; 308 309 // Returns the SectionMetadata associated with the SectionId that's in the 310 // SchemaTypeId. 311 // 312 // Returns: 313 // Valid pointer to SectionMetadata on success 314 // FAILED_PRECONDITION if schema hasn't been set yet 315 // INVALID_ARGUMENT if schema type id or section id is invalid 316 libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( 317 SchemaTypeId schema_type_id, SectionId section_id) const; 318 319 // Returns true if a property is defined in the said schema, regardless of 320 // whether it is indexed or not. 321 bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id, 322 const std::string& property) const; 323 324 // Extracts all sections of different types from the given document and group 325 // them by type. 326 // - Each Section vector is sorted by section Id in ascending order. The 327 // sorted section ids may not be continuous, since not all sections are 328 // present in the document. 329 // - Sections with empty content won't be returned. 330 // - For example, we may extract: 331 // string_sections: [2, 7, 10] 332 // integer_sections: [3, 5, 8] 333 // 334 // Returns: 335 // A SectionGroup instance on success 336 // FAILED_PRECONDITION if schema hasn't been set yet 337 // NOT_FOUND if type config name of document not found 338 libtextclassifier3::StatusOr<SectionGroup> ExtractSections( 339 const DocumentProto& document) const; 340 341 // Returns the JoinablePropertyMetadata associated with property_path that's 342 // in the SchemaTypeId. 343 // 344 // Returns: 345 // Valid pointer to JoinablePropertyMetadata on success 346 // nullptr if property_path doesn't exist (or is not joinable) in the 347 // joinable metadata list of the schema 348 // FAILED_PRECONDITION if schema hasn't been set yet 349 // INVALID_ARGUMENT if schema type id is invalid 350 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> 351 GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, 352 const std::string& property_path) const; 353 354 // Extracts all joinable property contents of different types from the given 355 // document and group them by joinable value type. 356 // - Joinable properties are sorted by joinable property id in ascending 357 // order. The sorted joinable property ids may not be continuous, since not 358 // all joinable properties are present in the document. 359 // - Joinable property ids start from 0. 360 // - Joinable properties with empty content won't be returned. 361 // 362 // Returns: 363 // A JoinablePropertyGroup instance on success 364 // FAILED_PRECONDITION if schema hasn't been set yet 365 // NOT_FOUND if the type config name of document not found 366 libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties( 367 const DocumentProto& document) const; 368 369 // Syncs all the data changes to disk. 370 // 371 // Returns: 372 // OK on success 373 // INTERNAL on I/O errors. 374 libtextclassifier3::Status PersistToDisk(); 375 376 // Computes the combined checksum of the schema store - includes the ground 377 // truth and all derived files. 378 // 379 // Returns: 380 // Combined checksum on success 381 // INTERNAL_ERROR on compute error 382 libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; 383 384 // Returns: 385 // - On success, the section metadata list for the specified schema type 386 // - NOT_FOUND if the schema type is not present in the schema 387 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> 388 GetSectionMetadata(const std::string& schema_type) const; 389 390 // Calculates the StorageInfo for the Schema Store. 391 // 392 // If an IO error occurs while trying to calculate the value for a field, then 393 // that field will be set to -1. 394 SchemaStoreStorageInfoProto GetStorageInfo() const; 395 396 // Get debug information for the schema store. 397 // 398 // Returns: 399 // SchemaDebugInfoProto on success 400 // INTERNAL_ERROR on IO errors, crc compute error 401 libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const; 402 403 // Expands the provided type_property_masks into a vector of 404 // ExpandedTypePropertyMasks to account for polymorphism. If both a parent 405 // type and one of its child type appears in the masks, the parent type's 406 // paths will be merged into the child's. 407 // 408 // For example, assume that we have two schema types A and B, and we have 409 // - A is the parent type of B 410 // - Paths of A: {P1, P2} 411 // - Paths of B: {P3} 412 // 413 // Then, we will have the following in the result. 414 // - Expanded paths of A: {P1, P2} 415 // - Expanded paths of B: {P1, P2, P3} 416 std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks( 417 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks) 418 const; 419 420 private: 421 // Factory function to create a SchemaStore and set its schema. The created 422 // instance does not take ownership of any input components and all pointers 423 // must refer to valid objects that outlive the created SchemaStore instance. 424 // The base_dir must already exist. No schema must have set in base_dir prior 425 // to this. 426 // 427 // Returns: 428 // A SchemaStore on success 429 // FAILED_PRECONDITION on any null pointer input or if there has already 430 // been a schema set for this path. 431 // INTERNAL_ERROR on any IO errors 432 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 433 const Filesystem* filesystem, const std::string& base_dir, 434 const Clock* clock, SchemaProto schema); 435 436 // Use SchemaStore::Create instead. 437 explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, 438 const Clock* clock); 439 440 // Deletes the overlay schema and ensures that the Header is correctly set. 441 // 442 // RETURNS: 443 // OK on success 444 // INTERNAL_ERROR on any IO errors 445 static libtextclassifier3::Status DiscardOverlaySchema( 446 const Filesystem* filesystem, const std::string& base_dir, 447 Header& header); 448 449 // Verifies that there is no error retrieving a previously set schema. Then 450 // initializes like normal. 451 // 452 // Returns: 453 // OK on success 454 // INTERNAL_ERROR on IO error 455 libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); 456 457 // First, blindly writes new_schema to the schema_file. Then initializes like 458 // normal. 459 // 460 // Returns: 461 // OK on success 462 // INTERNAL_ERROR on IO error 463 // FAILED_PRECONDITION if there is already a schema set for the schema_file. 464 libtextclassifier3::Status Initialize(SchemaProto new_schema); 465 466 // Handles initializing the SchemaStore and regenerating any data if needed. 467 // 468 // Returns: 469 // OK on success 470 // INTERNAL_ERROR on IO error 471 libtextclassifier3::Status InitializeInternal( 472 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats); 473 474 // Creates sub-components and verifies the integrity of each sub-component. 475 // 476 // Returns: 477 // OK on success 478 // INTERNAL_ERROR on IO error 479 libtextclassifier3::Status InitializeDerivedFiles(); 480 481 // Populates any derived data structures off of the schema. 482 // 483 // Returns: 484 // OK on success 485 // NOT_FOUND_ERROR if a schema proto has not been set 486 // INTERNAL_ERROR on any IO errors 487 libtextclassifier3::Status RegenerateDerivedFiles( 488 bool create_overlay_if_necessary); 489 490 // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_. 491 // 492 // Returns: 493 // OK on success 494 // NOT_FOUND_ERROR if a schema proto has not been set 495 // INTERNAL_ERROR on any IO errors 496 libtextclassifier3::Status BuildInMemoryCache(); 497 498 // Update and replace the header file. Creates the header file if it doesn't 499 // exist. 500 // 501 // Returns: 502 // OK on success 503 // INTERNAL on I/O error 504 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 505 506 // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying 507 // file, and re-creates a new instance of the schema_type_mapper_. Does not 508 // populate the schema_type_mapper_. 509 // 510 // Returns any IO errors. 511 libtextclassifier3::Status ResetSchemaTypeMapper(); 512 513 // Creates a new schema store with new_schema and then swaps that new schema 514 // store with the existing one. This function guarantees that either: this 515 // instance will be fully updated to the new schema or no changes will take 516 // effect. 517 // 518 // Returns: 519 // OK on success 520 // INTERNAL on I/O error. 521 libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema); 522 CheckSchemaSet()523 libtextclassifier3::Status CheckSchemaSet() const { 524 return has_schema_successfully_set_ 525 ? libtextclassifier3::Status::OK 526 : absl_ports::FailedPreconditionError("Schema not set yet."); 527 } 528 529 // Correctly loads the Header, schema_file_ and (if present) the 530 // overlay_schema_file_. 531 // RETURNS: 532 // - OK on success 533 // - INTERNAL if an IO error is encountered when reading the Header or 534 // schemas. 535 // Or an invalid schema configuration is present. 536 libtextclassifier3::Status LoadSchema(); 537 538 const Filesystem* filesystem_; 539 std::string base_dir_; 540 const Clock* clock_; 541 542 // Used internally to indicate whether the class has been successfully 543 // initialized with a valid schema. Will be false if Initialize failed or no 544 // schema has ever been set. 545 bool has_schema_successfully_set_ = false; 546 547 // Cached schema 548 std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_; 549 550 // This schema holds the definition of any schema types that are not 551 // compatible with older versions of Icing code. 552 std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_; 553 554 // Maps schema types to a densely-assigned unique id. 555 std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_; 556 557 // Maps schema type ids to the corresponding schema type. This is an inverse 558 // map of schema_type_mapper_. 559 std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_; 560 561 // A hash map of (type config name -> type config), allows faster lookup of 562 // type config in schema. The O(1) type config access makes schema-related and 563 // section-related operations faster. 564 SchemaUtil::TypeConfigMap type_config_map_; 565 566 // Maps from each type id to all of its subtype ids. 567 // T2 is a subtype of T1, if and only if one of the following conditions is 568 // met: 569 // - T2 is T1 570 // - T2 extends T1 571 // - There exists a type U, such that T2 is a subtype of U, and U is a subtype 572 // of T1 573 std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>> 574 schema_subtype_id_map_; 575 576 // Manager of section (indexable property) and joinable property related 577 // metadata for all Schemas. 578 std::unique_ptr<const SchemaTypeManager> schema_type_manager_; 579 580 std::unique_ptr<Header> header_; 581 }; 582 583 } // namespace lib 584 } // namespace icing 585 586 #endif // ICING_SCHEMA_SCHEMA_STORE_H_ 587