1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_ 16 #define ICING_SCHEMA_SCHEMA_STORE_H_ 17 18 #include <cstdint> 19 #include <cstring> 20 #include <limits> 21 #include <memory> 22 #include <string> 23 #include <string_view> 24 #include <unordered_map> 25 #include <unordered_set> 26 #include <vector> 27 28 #include "icing/text_classifier/lib3/utils/base/status.h" 29 #include "icing/text_classifier/lib3/utils/base/statusor.h" 30 #include "icing/absl_ports/canonical_errors.h" 31 #include "icing/file/file-backed-proto.h" 32 #include "icing/file/filesystem.h" 33 #include "icing/file/version-util.h" 34 #include "icing/proto/debug.pb.h" 35 #include "icing/proto/document.pb.h" 36 #include "icing/proto/logging.pb.h" 37 #include "icing/proto/schema.pb.h" 38 #include "icing/proto/search.pb.h" 39 #include "icing/proto/storage.pb.h" 40 #include "icing/schema/joinable-property.h" 41 #include "icing/schema/schema-type-manager.h" 42 #include "icing/schema/schema-util.h" 43 #include "icing/schema/section.h" 44 #include "icing/store/document-filter-data.h" 45 #include "icing/store/key-mapper.h" 46 #include "icing/util/clock.h" 47 #include "icing/util/crc32.h" 48 49 namespace icing { 50 namespace lib { 51 52 // Holds the ground truth schema proto. Tracks compatible changes to the schema 53 // and will update any derived data based on the schema proto, such as Sections, 54 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have 55 // the most up-to-date data, callers should not save instances themselves and 56 // should always call Get* from the SchemaStore. 57 class SchemaStore { 58 public: 59 struct LegacyHeader { 60 // Holds the magic as a quick sanity check against file corruption. 61 int32_t magic; 62 63 // Checksum of the SchemaStore's sub-component's checksums. 64 uint32_t checksum; 65 }; 66 67 class Header { 68 public: 69 static constexpr int32_t kMagic = 0x72650d0a; 70 Header()71 explicit Header() 72 : magic_(kMagic), 73 checksum_(0), 74 overlay_created_(false), 75 min_overlay_version_compatibility_( 76 std::numeric_limits<int32_t>::max()) { 77 memset(overlay_created_padding_, 0, kOverlayCreatedPaddingSize); 78 memset(padding_, 0, kPaddingSize); 79 } 80 81 // RETURNS: 82 // - On success, a valid Header instance 83 // - NOT_FOUND if header file doesn't exist 84 // - INTERNAL if unable to read header 85 static libtextclassifier3::StatusOr<Header> Read( 86 const Filesystem* filesystem, const std::string& path); 87 88 libtextclassifier3::Status Write(const Filesystem* filesystem, 89 const std::string& path); 90 magic()91 int32_t magic() const { return magic_; } 92 checksum()93 uint32_t checksum() const { return checksum_; } set_checksum(uint32_t checksum)94 void set_checksum(uint32_t checksum) { checksum_ = checksum; } 95 overlay_created()96 bool overlay_created() const { return overlay_created_; } 97 min_overlay_version_compatibility()98 int32_t min_overlay_version_compatibility() const { 99 return min_overlay_version_compatibility_; 100 } 101 SetOverlayInfo(bool overlay_created,int32_t min_overlay_version_compatibility)102 void SetOverlayInfo(bool overlay_created, 103 int32_t min_overlay_version_compatibility) { 104 overlay_created_ = overlay_created; 105 min_overlay_version_compatibility_ = min_overlay_version_compatibility; 106 } 107 108 private: 109 // Holds the magic as a quick sanity check against file corruption. 110 int32_t magic_; 111 112 // Checksum of the SchemaStore's sub-component's checksums. 113 uint32_t checksum_; 114 115 bool overlay_created_; 116 // Three bytes of padding due to the fact that 117 // min_overlay_version_compatibility_ has an alignof() == 4 and the offset 118 // of overlay_created_padding_ == 9. 119 static constexpr int kOverlayCreatedPaddingSize = 3; 120 uint8_t overlay_created_padding_[kOverlayCreatedPaddingSize]; 121 122 int32_t min_overlay_version_compatibility_; 123 124 static constexpr int kPaddingSize = 1008; 125 // Padding exists just to reserve space for additional values. 126 uint8_t padding_[kPaddingSize]; 127 }; 128 static_assert(sizeof(Header) == 1024); 129 130 // Holds information on what may have been affected by the new schema. This is 131 // generally data that other classes may depend on from the SchemaStore, 132 // so that we can know if we should go update those classes as well. 133 struct SetSchemaResult { 134 // Whether we are able to write the schema as determined by SetSchema's 135 // arguments. This boolean reflects SetSchema's logic, and does not reflect 136 // any system level IO errors that may prevent the schema from being written 137 // to file. 138 bool success = false; 139 140 // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if: 141 // 1. Schema types are added in the middle of the SchemaProto 142 // 2. Schema types are removed from the middle of the SchemaProto 143 // 3. Schema types are reordered in the SchemaProto 144 // 145 // SchemaTypeIds are not changed if schema types are added/removed to the 146 // end of the SchemaProto. 147 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed; 148 149 // Schema types that have been removed from the new schema. Represented by 150 // the `schema_type` field in the SchemaTypeConfigProto. 151 std::unordered_set<std::string> schema_types_deleted_by_name; 152 153 // Schema types that have been removed from the new schema. Represented by 154 // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old* 155 // schema. 156 std::unordered_set<SchemaTypeId> schema_types_deleted_by_id; 157 158 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 159 // manner in the new schema. Compatibility determined in 160 // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type` 161 // field in the SchemaTypeConfigProto. 162 std::unordered_set<std::string> schema_types_incompatible_by_name; 163 164 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 165 // manner in the new schema. Compatibility determined in 166 // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId 167 // assigned to this SchemaTypeConfigProto in the *old* schema. 168 std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id; 169 170 // Schema types that were added in the new schema. Represented by the 171 // `schema_type` field in the SchemaTypeConfigProto. 172 std::unordered_set<std::string> schema_types_new_by_name; 173 174 // Schema types that were changed in a way that was backwards compatible and 175 // didn't invalidate the index. Represented by the `schema_type` field in 176 // the SchemaTypeConfigProto. 177 std::unordered_set<std::string> 178 schema_types_changed_fully_compatible_by_name; 179 180 // Schema types that were changed in a way that was backwards compatible, 181 // but invalidated the index. Represented by the `schema_type` field in the 182 // SchemaTypeConfigProto. 183 std::unordered_set<std::string> schema_types_index_incompatible_by_name; 184 185 // Schema types that were changed in a way that was backwards compatible, 186 // but invalidated the joinable cache. Represented by the `schema_type` 187 // field in the SchemaTypeConfigProto. 188 std::unordered_set<std::string> schema_types_join_incompatible_by_name; 189 }; 190 191 struct ExpandedTypePropertyMask { 192 std::string schema_type; 193 std::unordered_set<std::string> paths; 194 }; 195 196 static constexpr std::string_view kSchemaTypeWildcard = "*"; 197 198 // Factory function to create a SchemaStore which does not take ownership 199 // of any input components, and all pointers must refer to valid objects that 200 // outlive the created SchemaStore instance. The base_dir must already exist. 201 // There does not need to be an existing schema already. 202 // 203 // If initialize_stats is present, the fields related to SchemaStore will be 204 // populated. 205 // 206 // Returns: 207 // A SchemaStore on success 208 // FAILED_PRECONDITION on any null pointer input 209 // INTERNAL_ERROR on any IO errors 210 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 211 const Filesystem* filesystem, const std::string& base_dir, 212 const Clock* clock, InitializeStatsProto* initialize_stats = nullptr); 213 214 // Migrates schema files (backup v.s. new schema) according to version state 215 // change. 216 // 217 // Returns: 218 // OK on success or nothing to migrate 219 static libtextclassifier3::Status MigrateSchema( 220 const Filesystem* filesystem, const std::string& base_dir, 221 version_util::StateChange version_state_change, int32_t new_version); 222 223 // Discards all derived data in the schema store. 224 // 225 // Returns: 226 // OK on success or nothing to discard 227 // INTERNAL_ERROR on any I/O errors 228 static libtextclassifier3::Status DiscardDerivedFiles( 229 const Filesystem* filesystem, const std::string& base_dir); 230 231 SchemaStore(SchemaStore&&) = default; 232 SchemaStore& operator=(SchemaStore&&) = default; 233 234 SchemaStore(const SchemaStore&) = delete; 235 SchemaStore& operator=(const SchemaStore&) = delete; 236 237 // Persists and updates checksum of subcomponents. 238 ~SchemaStore(); 239 240 // Retrieve the current schema if it exists. 241 // 242 // Returns: 243 // SchemaProto* if exists 244 // INTERNAL_ERROR on any IO errors 245 // NOT_FOUND_ERROR if a schema hasn't been set before 246 libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const; 247 248 // Update our current schema if it's compatible. Does not accept incompatible 249 // schema. Compatibility rules defined by 250 // SchemaUtil::ComputeCompatibilityDelta. 251 // 252 // If ignore_errors_and_delete_documents is set to true, then incompatible 253 // schema are allowed and we'll force set the schema, meaning 254 // SetSchemaResult.success will always be true. 255 // 256 // Returns: 257 // SetSchemaResult that encapsulates the differences between the old and new 258 // schema, as well as if the new schema can be set. 259 // INTERNAL_ERROR on any IO errors 260 libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema( 261 const SchemaProto& new_schema, 262 bool ignore_errors_and_delete_documents, 263 bool allow_circular_schema_definitions); 264 libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema( 265 SchemaProto&& new_schema, 266 bool ignore_errors_and_delete_documents, 267 bool allow_circular_schema_definitions); 268 269 // Get the SchemaTypeConfigProto of schema_type name. 270 // 271 // Returns: 272 // SchemaTypeConfigProto on success 273 // FAILED_PRECONDITION if schema hasn't been set yet 274 // NOT_FOUND if schema type name doesn't exist 275 // INTERNAL on any I/O errors 276 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> 277 GetSchemaTypeConfig(std::string_view schema_type) const; 278 279 // Returns the SchemaTypeId of the passed in schema type 280 // 281 // Returns: 282 // SchemaTypeId on success 283 // FAILED_PRECONDITION if schema hasn't been set yet 284 // NOT_FOUND_ERROR if we don't know about the schema type 285 // INTERNAL_ERROR on IO error 286 libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId( 287 std::string_view schema_type) const; 288 289 // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also 290 // include child types. 291 // 292 // Returns: 293 // A set of SchemaTypeId on success 294 // FAILED_PRECONDITION if schema hasn't been set yet 295 // NOT_FOUND_ERROR if we don't know about the schema type 296 // INTERNAL_ERROR on IO error 297 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*> 298 GetSchemaTypeIdsWithChildren(std::string_view schema_type) const; 299 300 // Returns the SectionMetadata associated with the SectionId that's in the 301 // SchemaTypeId. 302 // 303 // Returns: 304 // Valid pointer to SectionMetadata on success 305 // FAILED_PRECONDITION if schema hasn't been set yet 306 // INVALID_ARGUMENT if schema type id or section id is invalid 307 libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( 308 SchemaTypeId schema_type_id, SectionId section_id) const; 309 310 // Returns true if a property is defined in the said schema, regardless of 311 // whether it is indexed or not. 312 bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id, 313 const std::string& property) const; 314 315 // Extracts all sections of different types from the given document and group 316 // them by type. 317 // - Each Section vector is sorted by section Id in ascending order. The 318 // sorted section ids may not be continuous, since not all sections are 319 // present in the document. 320 // - Sections with empty content won't be returned. 321 // - For example, we may extract: 322 // string_sections: [2, 7, 10] 323 // integer_sections: [3, 5, 8] 324 // 325 // Returns: 326 // A SectionGroup instance on success 327 // FAILED_PRECONDITION if schema hasn't been set yet 328 // NOT_FOUND if type config name of document not found 329 libtextclassifier3::StatusOr<SectionGroup> ExtractSections( 330 const DocumentProto& document) const; 331 332 // Returns the JoinablePropertyMetadata associated with property_path that's 333 // in the SchemaTypeId. 334 // 335 // Returns: 336 // Valid pointer to JoinablePropertyMetadata on success 337 // nullptr if property_path doesn't exist (or is not joinable) in the 338 // joinable metadata list of the schema 339 // FAILED_PRECONDITION if schema hasn't been set yet 340 // INVALID_ARGUMENT if schema type id is invalid 341 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> 342 GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, 343 const std::string& property_path) const; 344 345 // Extracts all joinable property contents of different types from the given 346 // document and group them by joinable value type. 347 // - Joinable properties are sorted by joinable property id in ascending 348 // order. The sorted joinable property ids may not be continuous, since not 349 // all joinable properties are present in the document. 350 // - Joinable property ids start from 0. 351 // - Joinable properties with empty content won't be returned. 352 // 353 // Returns: 354 // A JoinablePropertyGroup instance on success 355 // FAILED_PRECONDITION if schema hasn't been set yet 356 // NOT_FOUND if the type config name of document not found 357 libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties( 358 const DocumentProto& document) const; 359 360 // Syncs all the data changes to disk. 361 // 362 // Returns: 363 // OK on success 364 // INTERNAL on I/O errors. 365 libtextclassifier3::Status PersistToDisk(); 366 367 // Computes the combined checksum of the schema store - includes the ground 368 // truth and all derived files. 369 // 370 // Returns: 371 // Combined checksum on success 372 // INTERNAL_ERROR on compute error 373 libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; 374 375 // Returns: 376 // - On success, the section metadata list for the specified schema type 377 // - NOT_FOUND if the schema type is not present in the schema 378 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> 379 GetSectionMetadata(const std::string& schema_type) const; 380 381 // Calculates the StorageInfo for the Schema Store. 382 // 383 // If an IO error occurs while trying to calculate the value for a field, then 384 // that field will be set to -1. 385 SchemaStoreStorageInfoProto GetStorageInfo() const; 386 387 // Get debug information for the schema store. 388 // 389 // Returns: 390 // SchemaDebugInfoProto on success 391 // INTERNAL_ERROR on IO errors, crc compute error 392 libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const; 393 394 // Expands the provided type_property_masks into a vector of 395 // ExpandedTypePropertyMasks to account for polymorphism. If both a parent 396 // type and one of its child type appears in the masks, the parent type's 397 // paths will be merged into the child's. 398 // 399 // For example, assume that we have two schema types A and B, and we have 400 // - A is the parent type of B 401 // - Paths of A: {P1, P2} 402 // - Paths of B: {P3} 403 // 404 // Then, we will have the following in the result. 405 // - Expanded paths of A: {P1, P2} 406 // - Expanded paths of B: {P1, P2, P3} 407 std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks( 408 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks) 409 const; 410 411 private: 412 // Factory function to create a SchemaStore and set its schema. The created 413 // instance does not take ownership of any input components and all pointers 414 // must refer to valid objects that outlive the created SchemaStore instance. 415 // The base_dir must already exist. No schema must have set in base_dir prior 416 // to this. 417 // 418 // Returns: 419 // A SchemaStore on success 420 // FAILED_PRECONDITION on any null pointer input or if there has already 421 // been a schema set for this path. 422 // INTERNAL_ERROR on any IO errors 423 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 424 const Filesystem* filesystem, const std::string& base_dir, 425 const Clock* clock, SchemaProto schema); 426 427 // Use SchemaStore::Create instead. 428 explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, 429 const Clock* clock); 430 431 // Deletes the overlay schema and ensures that the Header is correctly set. 432 // 433 // RETURNS: 434 // OK on success 435 // INTERNAL_ERROR on any IO errors 436 static libtextclassifier3::Status DiscardOverlaySchema( 437 const Filesystem* filesystem, const std::string& base_dir, 438 Header& header); 439 440 // Verifies that there is no error retrieving a previously set schema. Then 441 // initializes like normal. 442 // 443 // Returns: 444 // OK on success 445 // INTERNAL_ERROR on IO error 446 libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); 447 448 // First, blindly writes new_schema to the schema_file. Then initializes like 449 // normal. 450 // 451 // Returns: 452 // OK on success 453 // INTERNAL_ERROR on IO error 454 // FAILED_PRECONDITION if there is already a schema set for the schema_file. 455 libtextclassifier3::Status Initialize(SchemaProto new_schema); 456 457 // Handles initializing the SchemaStore and regenerating any data if needed. 458 // 459 // Returns: 460 // OK on success 461 // INTERNAL_ERROR on IO error 462 libtextclassifier3::Status InitializeInternal( 463 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats); 464 465 // Creates sub-components and verifies the integrity of each sub-component. 466 // 467 // Returns: 468 // OK on success 469 // INTERNAL_ERROR on IO error 470 libtextclassifier3::Status InitializeDerivedFiles(); 471 472 // Populates any derived data structures off of the schema. 473 // 474 // Returns: 475 // OK on success 476 // NOT_FOUND_ERROR if a schema proto has not been set 477 // INTERNAL_ERROR on any IO errors 478 libtextclassifier3::Status RegenerateDerivedFiles( 479 bool create_overlay_if_necessary); 480 481 // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_. 482 // 483 // Returns: 484 // OK on success 485 // NOT_FOUND_ERROR if a schema proto has not been set 486 // INTERNAL_ERROR on any IO errors 487 libtextclassifier3::Status BuildInMemoryCache(); 488 489 // Update and replace the header file. Creates the header file if it doesn't 490 // exist. 491 // 492 // Returns: 493 // OK on success 494 // INTERNAL on I/O error 495 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 496 497 // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying 498 // file, and re-creates a new instance of the schema_type_mapper_. Does not 499 // populate the schema_type_mapper_. 500 // 501 // Returns any IO errors. 502 libtextclassifier3::Status ResetSchemaTypeMapper(); 503 504 // Creates a new schema store with new_schema and then swaps that new schema 505 // store with the existing one. This function guarantees that either: this 506 // instance will be fully updated to the new schema or no changes will take 507 // effect. 508 // 509 // Returns: 510 // OK on success 511 // INTERNAL on I/O error. 512 libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema); 513 CheckSchemaSet()514 libtextclassifier3::Status CheckSchemaSet() const { 515 return has_schema_successfully_set_ 516 ? libtextclassifier3::Status::OK 517 : absl_ports::FailedPreconditionError("Schema not set yet."); 518 } 519 520 // Correctly loads the Header, schema_file_ and (if present) the 521 // overlay_schema_file_. 522 // RETURNS: 523 // - OK on success 524 // - INTERNAL if an IO error is encountered when reading the Header or 525 // schemas. 526 // Or an invalid schema configuration is present. 527 libtextclassifier3::Status LoadSchema(); 528 529 const Filesystem* filesystem_; 530 std::string base_dir_; 531 const Clock* clock_; 532 533 // Used internally to indicate whether the class has been successfully 534 // initialized with a valid schema. Will be false if Initialize failed or no 535 // schema has ever been set. 536 bool has_schema_successfully_set_ = false; 537 538 // Cached schema 539 std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_; 540 541 // This schema holds the definition of any schema types that are not 542 // compatible with older versions of Icing code. 543 std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_; 544 545 // Maps schema types to a densely-assigned unique id. 546 std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_; 547 548 // Maps schema type ids to the corresponding schema type. This is an inverse 549 // map of schema_type_mapper_. 550 std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_; 551 552 // A hash map of (type config name -> type config), allows faster lookup of 553 // type config in schema. The O(1) type config access makes schema-related and 554 // section-related operations faster. 555 SchemaUtil::TypeConfigMap type_config_map_; 556 557 // Maps from each type id to all of its subtype ids. 558 // T2 is a subtype of T1, if and only if one of the following conditions is 559 // met: 560 // - T2 is T1 561 // - T2 extends T1 562 // - There exists a type U, such that T2 is a subtype of U, and U is a subtype 563 // of T1 564 std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>> 565 schema_subtype_id_map_; 566 567 // Manager of section (indexable property) and joinable property related 568 // metadata for all Schemas. 569 std::unique_ptr<const SchemaTypeManager> schema_type_manager_; 570 571 std::unique_ptr<Header> header_; 572 }; 573 574 } // namespace lib 575 } // namespace icing 576 577 #endif // ICING_SCHEMA_SCHEMA_STORE_H_ 578