// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_SCHEMA_SCHEMA_STORE_H_
#define ICING_SCHEMA_SCHEMA_STORE_H_

#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section-manager.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/key-mapper.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"

namespace icing {
namespace lib {

// Holds the ground truth schema proto. Tracks compatible changes to the schema
// and will update any derived data based on the schema proto, such as Sections,
// SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
// the most up-to-date data, callers should not save instances themselves and
// should always call Get* from the SchemaStore.
49 class SchemaStore { 50 public: 51 struct Header { 52 static constexpr int32_t kMagic = 0x72650d0a; 53 54 // Holds the magic as a quick sanity check against file corruption. 55 int32_t magic; 56 57 // Checksum of the SchemaStore's sub-component's checksums. 58 uint32_t checksum; 59 }; 60 61 // Holds information on what may have been affected by the new schema. This is 62 // generally data that other classes may depend on from the SchemaStore, 63 // so that we can know if we should go update those classes as well. 64 struct SetSchemaResult { 65 // Whether we are able to write the schema as determined by SetSchema's 66 // arguments. This boolean reflects SetSchema's logic, and does not reflect 67 // any system level IO errors that may prevent the schema from being written 68 // to file. 69 bool success = false; 70 71 // Whether the new schema changes invalidate the index. 72 bool index_incompatible = false; 73 74 // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if: 75 // 1. Schema types are added in the middle of the SchemaProto 76 // 2. Schema types are removed from the middle of the SchemaProto 77 // 3. Schema types are reordered in the SchemaProto 78 // 79 // SchemaTypeIds are not changed if schema types are added/removed to the 80 // end of the SchemaProto. 81 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed; 82 83 // Schema types that have been removed from the new schema. Represented by 84 // the `schema_type` field in the SchemaTypeConfigProto. 85 std::unordered_set<std::string> schema_types_deleted_by_name; 86 87 // Schema types that have been removed from the new schema. Represented by 88 // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old* 89 // schema. 90 std::unordered_set<SchemaTypeId> schema_types_deleted_by_id; 91 92 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 93 // manner in the new schema. Compatibility determined in 94 // SchemaUtil::ComputeCompatibilityDelta. 
Represented by the `schema_type` 95 // field in the SchemaTypeConfigProto. 96 std::unordered_set<std::string> schema_types_incompatible_by_name; 97 98 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 99 // manner in the new schema. Compatibility determined in 100 // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId 101 // assigned to this SchemaTypeConfigProto in the *old* schema. 102 std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id; 103 }; 104 105 // Factory function to create a SchemaStore which does not take ownership 106 // of any input components, and all pointers must refer to valid objects that 107 // outlive the created SchemaStore instance. The base_dir must already exist. 108 // There does not need to be an existing schema already. 109 // 110 // If initialize_stats is present, the fields related to SchemaStore will be 111 // populated. 112 // 113 // Returns: 114 // A SchemaStore on success 115 // FAILED_PRECONDITION on any null pointer input 116 // INTERNAL_ERROR on any IO errors 117 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 118 const Filesystem* filesystem, const std::string& base_dir, 119 const Clock* clock, InitializeStatsProto* initialize_stats = nullptr); 120 121 // Not copyable 122 SchemaStore(const SchemaStore&) = delete; 123 SchemaStore& operator=(const SchemaStore&) = delete; 124 125 // Persists and updates checksum of subcomponents. 126 ~SchemaStore(); 127 128 // Retrieve the current schema if it exists. Caller does not get ownership of 129 // the schema proto and modifying the returned pointer does not affect the 130 // underlying schema proto. 131 // 132 // Returns: 133 // SchemaProto* if exists 134 // INTERNAL_ERROR on any IO errors 135 // NOT_FOUND_ERROR if a schema hasn't been set before 136 libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const; 137 138 // Update our current schema if it's compatible. 
Does not accept incompatible 139 // schema. Compatibility rules defined by 140 // SchemaUtil::ComputeCompatibilityDelta. 141 // 142 // If ignore_errors_and_delete_documents is set to true, then incompatible 143 // schema are allowed and we'll force set the schema, meaning 144 // SetSchemaResult.success will always be true. 145 // 146 // Returns: 147 // SetSchemaResult that encapsulates the differences between the old and new 148 // schema, as well as if the new schema can be set. 149 // INTERNAL_ERROR on any IO errors 150 libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema( 151 const SchemaProto& new_schema, 152 bool ignore_errors_and_delete_documents = false); 153 libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema( 154 SchemaProto&& new_schema, 155 bool ignore_errors_and_delete_documents = false); 156 157 // Get the SchemaTypeConfigProto of schema_type name. 158 // 159 // Returns: 160 // SchemaTypeConfigProto on success 161 // FAILED_PRECONDITION if schema hasn't been set yet 162 // NOT_FOUND if schema type name doesn't exist 163 // INTERNAL on any I/O errors 164 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> 165 GetSchemaTypeConfig(std::string_view schema_type) const; 166 167 // Returns the SchemaTypeId of the passed in schema type 168 // 169 // Returns: 170 // SchemaTypeId on success 171 // FAILED_PRECONDITION if schema hasn't been set yet 172 // NOT_FOUND_ERROR if we don't know about the schema type 173 // INTERNAL_ERROR on IO error 174 libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId( 175 std::string_view schema_type) const; 176 177 // Finds content of a section by section path (e.g. property1.property2) 178 // 179 // Returns: 180 // A string of content on success 181 // FAILED_PRECONDITION if schema hasn't been set yet 182 // NOT_FOUND if: 183 // 1. Property is optional and not found in the document 184 // 2. section_path is invalid 185 // 3. 
Content is empty 186 libtextclassifier3::StatusOr<std::vector<std::string_view>> 187 GetStringSectionContent(const DocumentProto& document, 188 std::string_view section_path) const; 189 190 // Finds content of a section by id 191 // 192 // Returns: 193 // A string of content on success 194 // FAILED_PRECONDITION if schema hasn't been set yet 195 // INVALID_ARGUMENT if section id is invalid 196 // NOT_FOUND if type config name of document not found 197 libtextclassifier3::StatusOr<std::vector<std::string_view>> 198 GetStringSectionContent(const DocumentProto& document, 199 SectionId section_id) const; 200 201 // Returns the SectionMetadata associated with the SectionId that's in the 202 // SchemaTypeId. 203 // 204 // Returns: 205 // pointer to SectionMetadata on success 206 // FAILED_PRECONDITION if schema hasn't been set yet 207 // INVALID_ARGUMENT if schema type id or section is invalid 208 libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( 209 SchemaTypeId schema_type_id, SectionId section_id) const; 210 211 // Extracts all sections from the given document, sections are sorted by 212 // section id in increasing order. Section ids start from 0. Sections with 213 // empty content won't be returned. 214 // 215 // Returns: 216 // A list of sections on success 217 // FAILED_PRECONDITION if schema hasn't been set yet 218 // NOT_FOUND if type config name of document not found 219 libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections( 220 const DocumentProto& document) const; 221 222 // Syncs all the data changes to disk. 223 // 224 // Returns: 225 // OK on success 226 // INTERNAL on I/O errors. 227 libtextclassifier3::Status PersistToDisk(); 228 229 // Computes the combined checksum of the schema store - includes the ground 230 // truth and all derived files. 
231 // 232 // Returns: 233 // Combined checksum on success 234 // INTERNAL_ERROR on compute error 235 libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; 236 237 // Calculates the StorageInfo for the Schema Store. 238 // 239 // If an IO error occurs while trying to calculate the value for a field, then 240 // that field will be set to -1. 241 SchemaStoreStorageInfoProto GetStorageInfo() const; 242 243 private: 244 // Use SchemaStore::Create instead. 245 explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, 246 const Clock* clock); 247 248 // Handles initializing the SchemaStore and regenerating any data if needed. 249 // 250 // Returns: 251 // OK on success 252 // INTERNAL_ERROR on IO error 253 libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); 254 255 // Creates sub-components and verifies the integrity of each sub-component. 256 // 257 // Returns: 258 // OK on success 259 // INTERNAL_ERROR on IO error 260 libtextclassifier3::Status InitializeDerivedFiles(); 261 262 // Populates any derived data structures off of the schema. 263 // 264 // Returns: 265 // OK on success 266 // NOT_FOUND_ERROR if a schema proto has not been set 267 // INTERNAL_ERROR on any IO errors 268 libtextclassifier3::Status RegenerateDerivedFiles(); 269 270 // Checks if the header exists already. This does not create the header file 271 // if it doesn't exist. 272 bool HeaderExists(); 273 274 // Update and replace the header file. Creates the header file if it doesn't 275 // exist. 276 // 277 // Returns: 278 // OK on success 279 // INTERNAL on I/O error 280 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 281 282 // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying 283 // file, and re-creates a new instance of the schema_type_mapper_. Does not 284 // populate the schema_type_mapper_. 285 // 286 // Returns any IO errors. 
287 libtextclassifier3::Status ResetSchemaTypeMapper(); 288 CheckSchemaSet()289 libtextclassifier3::Status CheckSchemaSet() const { 290 return has_schema_successfully_set_ 291 ? libtextclassifier3::Status::OK 292 : absl_ports::FailedPreconditionError("Schema not set yet."); 293 } 294 295 const Filesystem& filesystem_; 296 const std::string base_dir_; 297 const Clock& clock_; 298 299 // Used internally to indicate whether the class has been successfully 300 // initialized with a valid schema. Will be false if Initialize failed or no 301 // schema has ever been set. 302 bool has_schema_successfully_set_ = false; 303 304 // Cached schema 305 FileBackedProto<SchemaProto> schema_file_; 306 307 // A hash map of (type config name -> type config), allows faster lookup of 308 // type config in schema. The O(1) type config access makes schema-related and 309 // section-related operations faster. 310 SchemaUtil::TypeConfigMap type_config_map_; 311 312 // Maps schema types to a densely-assigned unique id. 313 std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_; 314 315 // Manager of indexed section related metadata. 316 std::unique_ptr<const SectionManager> section_manager_; 317 }; 318 319 } // namespace lib 320 } // namespace icing 321 322 #endif // ICING_SCHEMA_SCHEMA_STORE_H_ 323