• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_
16 #define ICING_SCHEMA_SCHEMA_STORE_H_
17 
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 #include <memory>
22 #include <optional>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29 
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/feature-flags.h"
34 #include "icing/file/file-backed-proto.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/version-util.h"
37 #include "icing/proto/debug.pb.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/logging.pb.h"
40 #include "icing/proto/schema.pb.h"
41 #include "icing/proto/search.pb.h"
42 #include "icing/proto/storage.pb.h"
43 #include "icing/schema/joinable-property.h"
44 #include "icing/schema/schema-type-manager.h"
45 #include "icing/schema/schema-util.h"
46 #include "icing/schema/scorable_property_manager.h"
47 #include "icing/schema/section.h"
48 #include "icing/store/document-filter-data.h"
49 #include "icing/store/key-mapper.h"
50 #include "icing/util/clock.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/logging.h"
53 #include "icing/util/status-macros.h"
54 
55 namespace icing {
56 namespace lib {
57 
58 // Holds the ground truth schema proto. Tracks compatible changes to the schema
59 // and will update any derived data based on the schema proto, such as Sections,
60 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
61 // the most up-to-date data, callers should not save instances themselves and
62 // should always call Get* from the SchemaStore.
63 class SchemaStore {
64  public:
  // Two-field header layout: a magic value followed by a checksum.
  // NOTE(review): presumably the header format written by older versions,
  // kept so that such files can still be read - confirm against the
  // implementation file.
  struct LegacyHeader {
    // Holds the magic as a quick sanity check against file corruption.
    int32_t magic;

    // Checksum of the SchemaStore's sub-component's checksums.
    uint32_t checksum;
  };
72 
73   class Header {
74    public:
75     static constexpr int32_t kMagic = 0x72650d0a;
76 
Header(const Filesystem * filesystem,std::string path)77     explicit Header(const Filesystem* filesystem, std::string path)
78         : path_(std::move(path)), filesystem_(filesystem) {}
79 
Header(Header && other)80     Header(Header&& other)
81         : serialized_header_(std::move(other.serialized_header_)),
82           path_(std::move(other.path_)),
83           header_fd_(std::move(other.header_fd_)),
84           filesystem_(other.filesystem_),
85           dirty_(other.dirty_) {}
86 
87     Header& operator=(Header&& other) {
88       serialized_header_ = std::move(other.serialized_header_);
89       path_ = std::move(other.path_);
90       header_fd_ = std::move(other.header_fd_);
91       filesystem_ = other.filesystem_;
92       dirty_ = other.dirty_;
93       return *this;
94     }
95 
96     struct SerializedHeader {
SerializedHeaderSerializedHeader97       explicit SerializedHeader()
98           : magic(kMagic),
99             checksum(0),
100             overlay_created(false),
101             min_overlay_version_compatibility(
102                 std::numeric_limits<int32_t>::max()) {
103         memset(overlay_created_padding, 0, kOverlayCreatedPaddingSize);
104         memset(padding, 0, kPaddingSize);
105       }
106       // Holds the magic as a quick sanity check against file corruption.
107       int32_t magic;
108 
109       // Checksum of the SchemaStore's sub-component's checksums.
110       uint32_t checksum;
111 
112       bool overlay_created;
113       // Three bytes of padding due to the fact that
114       // min_overlay_version_compatibility_ has an alignof() == 4 and the offset
115       // of overlay_created_padding_ == 9.
116       static constexpr int kOverlayCreatedPaddingSize = 3;
117       uint8_t overlay_created_padding[kOverlayCreatedPaddingSize];
118 
119       int32_t min_overlay_version_compatibility;
120 
121       static constexpr int kPaddingSize = 1008;
122       // Padding exists just to reserve space for additional values.
123       uint8_t padding[kPaddingSize];
124     };
125     static_assert(sizeof(SerializedHeader) == 1024);
126 
127     // RETURNS:
128     //   - On success, a valid Header instance
129     //   - NOT_FOUND if header file doesn't exist
130     //   - INTERNAL if unable to read header
131     static libtextclassifier3::StatusOr<Header> Read(
132         const Filesystem* filesystem, std::string path);
133 
134     libtextclassifier3::Status Write();
135 
136     libtextclassifier3::Status PersistToDisk();
137 
magic()138     int32_t magic() const { return serialized_header_.magic; }
139 
checksum()140     uint32_t checksum() const { return serialized_header_.checksum; }
set_checksum(uint32_t checksum)141     void set_checksum(uint32_t checksum) {
142       dirty_ = true;
143       serialized_header_.checksum = checksum;
144     }
145 
overlay_created()146     bool overlay_created() const { return serialized_header_.overlay_created; }
147 
min_overlay_version_compatibility()148     int32_t min_overlay_version_compatibility() const {
149       return serialized_header_.min_overlay_version_compatibility;
150     }
151 
SetOverlayInfo(bool overlay_created,int32_t min_overlay_version_compatibility)152     void SetOverlayInfo(bool overlay_created,
153                         int32_t min_overlay_version_compatibility) {
154       dirty_ = true;
155       serialized_header_.overlay_created = overlay_created;
156       serialized_header_.min_overlay_version_compatibility =
157           min_overlay_version_compatibility;
158     }
159 
160    private:
Header(SerializedHeader serialized_header,std::string path,ScopedFd header_fd,const Filesystem * filesystem)161     explicit Header(SerializedHeader serialized_header, std::string path,
162                     ScopedFd header_fd, const Filesystem* filesystem)
163         : serialized_header_(std::move(serialized_header)),
164           path_(std::move(path)),
165           header_fd_(std::move(header_fd)),
166           filesystem_(filesystem),
167           dirty_(false) {}
168 
169     SerializedHeader serialized_header_;
170     std::string path_;
171     ScopedFd header_fd_;
172     const Filesystem* filesystem_;  // Not owned.
173     bool dirty_;
174   };
175 
176   // Holds information on what may have been affected by the new schema. This is
177   // generally data that other classes may depend on from the SchemaStore,
178   // so that we can know if we should go update those classes as well.
  struct SetSchemaResult {
    // Whether we are able to write the schema as determined by SetSchema's
    // arguments. This boolean reflects SetSchema's logic, and does not reflect
    // any system level IO errors that may prevent the schema from being written
    // to file.
    bool success = false;

    // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
    //   1. Schema types are added in the middle of the SchemaProto
    //   2. Schema types are removed from the middle of the SchemaProto
    //   3. Schema types are reordered in the SchemaProto
    //
    // SchemaTypeIds are not changed if schema types are added/removed to the
    // end of the SchemaProto.
    std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;

    // Schema types that have been removed from the new schema. Represented by
    // the `schema_type` field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_deleted_by_name;

    // Schema types that have been removed from the new schema. Represented by
    // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old*
    // schema.
    std::unordered_set<SchemaTypeId> schema_types_deleted_by_id;

    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
    // manner in the new schema. Compatibility determined in
    // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type`
    // field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_incompatible_by_name;

    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
    // manner in the new schema. Compatibility determined in
    // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
    // assigned to this SchemaTypeConfigProto in the *old* schema.
    std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;

    // Schema types that were added in the new schema. Represented by the
    // `schema_type` field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_new_by_name;

    // Schema types that were changed in a way that was backwards compatible and
    // didn't invalidate the index. Represented by the `schema_type` field in
    // the SchemaTypeConfigProto.
    std::unordered_set<std::string>
        schema_types_changed_fully_compatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but invalidated the index. Represented by the `schema_type` field in the
    // SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_index_incompatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but invalidated the joinable cache. Represented by the `schema_type`
    // field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_join_incompatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but inconsistent with the old schema so that the scorable property cache
    // needs to be re-generated. Represented by SchemaTypeId.
    // NOTE(review): presumably the id in the *old* schema, like the other
    // *_by_id sets above - confirm.
    std::unordered_set<SchemaTypeId>
        schema_types_scorable_property_inconsistent_by_id;

    // Schema types that were changed in a way that was backwards compatible,
    // but inconsistent with the old schema so that the scorable property cache
    // needs to be re-generated. This is the by-name counterpart of the set
    // above.
    // NOTE(review): presumably keyed by the `schema_type` field in the
    // SchemaTypeConfigProto, like the other *_by_name sets - confirm.
    std::unordered_set<std::string>
        schema_types_scorable_property_inconsistent_by_name;
  };
248 
  // Element type of the vector returned by ExpandTypePropertyMasks(): one
  // schema type together with its set of property paths.
  struct ExpandedTypePropertyMask {
    // The schema type this entry applies to.
    // NOTE(review): presumably the type's name - confirm.
    std::string schema_type;
    // Property paths for `schema_type`. Per the contract documented on
    // ExpandTypePropertyMasks(), a child type's set also contains the paths
    // merged in from its parent type's mask.
    std::unordered_set<std::string> paths;
  };
253 
254   static constexpr std::string_view kSchemaTypeWildcard = "*";
255 
256   static constexpr std::string_view kDefaultEmptySchemaDatabase = "";
257 
258   // Factory function to create a SchemaStore which does not take ownership
259   // of any input components, and all pointers must refer to valid objects that
260   // outlive the created SchemaStore instance. The base_dir must already exist.
261   // There does not need to be an existing schema already.
262   //
263   // If initialize_stats is present, the fields related to SchemaStore will be
264   // populated.
265   //
266   // Returns:
267   //   A SchemaStore on success
268   //   FAILED_PRECONDITION on any null pointer input
269   //   INTERNAL_ERROR on any IO errors
270   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
271       const Filesystem* filesystem, const std::string& base_dir,
272       const Clock* clock, const FeatureFlags* feature_flags,
273       InitializeStatsProto* initialize_stats = nullptr);
274 
275   // Migrates schema files (backup v.s. new schema) according to version state
276   // change. Also performs schema database migration and populates the database
277   // fields in the persisted schema file if necessary.
278   //
279   // Returns:
280   //   OK on success or nothing to migrate
281   static libtextclassifier3::Status MigrateSchema(
282       const Filesystem* filesystem, const std::string& base_dir,
283       version_util::StateChange version_state_change, int32_t new_version,
284       bool perform_schema_database_migration);
285 
286   // Discards all derived data in the schema store.
287   //
288   // Returns:
289   //   OK on success or nothing to discard
290   //   INTERNAL_ERROR on any I/O errors
291   static libtextclassifier3::Status DiscardDerivedFiles(
292       const Filesystem* filesystem, const std::string& base_dir);
293 
294   SchemaStore(SchemaStore&&) = default;
295   SchemaStore& operator=(SchemaStore&&) = default;
296 
297   SchemaStore(const SchemaStore&) = delete;
298   SchemaStore& operator=(const SchemaStore&) = delete;
299 
300   // Persists and updates checksum of subcomponents.
301   ~SchemaStore();
302 
303   // Retrieve the current schema if it exists.
304   //
305   // Returns:
306   //   - SchemaProto* if exists
307   //   - INTERNAL_ERROR on any IO errors
308   //   - NOT_FOUND_ERROR if a schema hasn't been set before
309   libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const;
310 
311   // Retrieve the current schema for a given database if it exists.
312   //
313   // This is an expensive operation. Use GetSchema() when retrieving the entire
314   // schema, or if there is only a single database in the schema store.
315   //
316   // Returns:
317   //   - SchemaProto* containing only schema types from the database, if exists
318   //   - INTERNAL_ERROR on any IO errors
319   //   - NOT_FOUND_ERROR if the database doesn't exist in the schema, or if a
320   //     schema hasn't been set before
321   libtextclassifier3::StatusOr<SchemaProto> GetSchema(
322       const std::string& database) const;
323 
324   // Update our current schema if it's compatible. Does not accept incompatible
325   // schema or schema with types from multiple databases. Compatibility rules
326   // defined by SchemaUtil::ComputeCompatibilityDelta.
327   //
328   // NOTE: This method is deprecated. Please use
329   // `SetSchema(SetSchemaRequestProto&& set_schema_request)` instead.
330   //
331   // TODO: b/337913932 - Remove this method once all callers (currently only
332   // used in tests) are migrated to the new SetSchema method.
333   libtextclassifier3::StatusOr<SetSchemaResult> SetSchema(
334       SchemaProto new_schema, bool ignore_errors_and_delete_documents);
335 
336   // Update our current schema if it's compatible. Does not accept incompatible
337   // schema or schema with types from multiple databases. Compatibility rules
338   // defined by SchemaUtil::ComputeCompatibilityDelta.
339   //
340   // Does not support setting the schema across multiple databases if
341   // `feature_flags_->enable_schema_database()` is true. This means that:
342   // - All types within the new schema must have their `database` field matching
343   //  `set_schema_request.database()`.
344   //
345   // If ignore_errors_and_delete_documents is set to true, then incompatible
346   // schema are allowed and we'll force set the schema, meaning
347   // SetSchemaResult.success will always be true.
348   //
349   // Returns:
350   //   - SetSchemaResult that encapsulates the differences between the old and
351   //     new schema, as well as if the new schema can be set.
352   //   - INTERNAL_ERROR on any IO errors
353   //   - ALREADY_EXISTS_ERROR if type names in the new schema are already in use
354   //     by a different database.
355   //   - INVALID_ARGUMENT_ERROR if the schema is invalid. This can happen if
356   //     the schema is malformed, if the new schema contains types where the
357   //     database field does not match the database field in the
358   //     set_schema_request.
359   libtextclassifier3::StatusOr<SetSchemaResult> SetSchema(
360       SetSchemaRequestProto&& set_schema_request);
361 
362   // Get the SchemaTypeConfigProto of schema_type name.
363   //
364   // Returns:
365   //   SchemaTypeConfigProto on success
366   //   FAILED_PRECONDITION if schema hasn't been set yet
367   //   NOT_FOUND if schema type name doesn't exist
368   //   INTERNAL on any I/O errors
369   libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
370   GetSchemaTypeConfig(std::string_view schema_type) const;
371 
  // Gets a map from every schema type name to its blob property paths.
373   //
374   // Returns:
  //   A map from every schema type name to its blob property paths on success
376   //   FAILED_PRECONDITION if schema hasn't been set yet
377   //   INTERNAL on any I/O errors
378   libtextclassifier3::StatusOr<
379       std::unordered_map<std::string, std::vector<std::string>>>
380   ConstructBlobPropertyMap() const;
381 
382   // Returns the schema type of the passed in SchemaTypeId
383   //
384   // Returns:
385   //   schema type on success
386   //   FAILED_PRECONDITION if schema hasn't been set yet
387   //   INVALID_ARGUMENT if schema type id is invalid
388   libtextclassifier3::StatusOr<const std::string*> GetSchemaType(
389       SchemaTypeId schema_type_id) const;
390 
391   // Returns the SchemaTypeId of the passed in schema type
392   //
393   // Returns:
394   //   SchemaTypeId on success
395   //   FAILED_PRECONDITION if schema hasn't been set yet
396   //   NOT_FOUND_ERROR if we don't know about the schema type
397   //   INTERNAL_ERROR on IO error
398   libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId(
399       std::string_view schema_type) const;
400 
401   // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also
402   // include child types.
403   //
404   // Returns:
405   //   A set of SchemaTypeId on success
406   //   FAILED_PRECONDITION if schema hasn't been set yet
407   //   NOT_FOUND_ERROR if we don't know about the schema type
408   //   INTERNAL_ERROR on IO error
409   libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
410   GetSchemaTypeIdsWithChildren(std::string_view schema_type) const;
411 
412   // Returns the SectionMetadata associated with the SectionId that's in the
413   // SchemaTypeId.
414   //
415   // Returns:
416   //   Valid pointer to SectionMetadata on success
417   //   FAILED_PRECONDITION if schema hasn't been set yet
418   //   INVALID_ARGUMENT if schema type id or section id is invalid
419   libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
420       SchemaTypeId schema_type_id, SectionId section_id) const;
421 
422   // Returns true if a property is defined in the said schema, regardless of
423   // whether it is indexed or not.
424   bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,
425                                  const std::string& property) const;
426 
427   // Extracts all sections of different types from the given document and group
428   // them by type.
429   // - Each Section vector is sorted by section Id in ascending order. The
430   //   sorted section ids may not be continuous, since not all sections are
431   //   present in the document.
432   // - Sections with empty content won't be returned.
433   // - For example, we may extract:
434   //   string_sections: [2, 7, 10]
435   //   integer_sections: [3, 5, 8]
436   //
437   // Returns:
438   //   A SectionGroup instance on success
439   //   FAILED_PRECONDITION if schema hasn't been set yet
440   //   NOT_FOUND if type config name of document not found
441   libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
442       const DocumentProto& document) const;
443 
444   // Returns the JoinablePropertyMetadata associated with property_path that's
445   // in the SchemaTypeId.
446   //
447   // Returns:
448   //   Valid pointer to JoinablePropertyMetadata on success
449   //   nullptr if property_path doesn't exist (or is not joinable) in the
450   //     joinable metadata list of the schema
451   //   FAILED_PRECONDITION if schema hasn't been set yet
452   //   INVALID_ARGUMENT if schema type id is invalid
453   libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
454   GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
455                               const std::string& property_path) const;
456 
457   // Returns the JoinablePropertyMetadata associated with joinable_property_id
458   // that's in the SchemaTypeId.
459   //
460   // Returns:
461   //   Valid pointer to JoinablePropertyMetadata on success
462   //   FAILED_PRECONDITION if schema hasn't been set yet
463   //   INVALID_ARGUMENT if schema type id or joinable property id is invalid
464   libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
465   GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
466                               JoinablePropertyId joinable_property_id) const;
467 
468   // Extracts all joinable property contents of different types from the given
469   // document and group them by joinable value type.
470   // - Joinable properties are sorted by joinable property id in ascending
471   //   order. The sorted joinable property ids may not be continuous, since not
472   //   all joinable properties are present in the document.
473   // - Joinable property ids start from 0.
474   // - Joinable properties with empty content won't be returned.
475   //
476   // Returns:
477   //   A JoinablePropertyGroup instance on success
478   //   FAILED_PRECONDITION if schema hasn't been set yet
479   //   NOT_FOUND if the type config name of document not found
480   libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
481       const DocumentProto& document) const;
482 
483   // Returns the quantization type for the given schema_type_id and section_id.
484   //
485   // Returns:
486   //   - The quantization type on success.
487   //   - INVALID_ARGUMENT_ERROR if schema_type_id or section_id is invalid.
488   //   - Any error from schema store.
489   libtextclassifier3::StatusOr<EmbeddingIndexingConfig::QuantizationType::Code>
GetQuantizationType(SchemaTypeId schema_type_id,SectionId section_id)490   GetQuantizationType(SchemaTypeId schema_type_id, SectionId section_id) const {
491     ICING_ASSIGN_OR_RETURN(const SectionMetadata* section_metadata,
492                            GetSectionMetadata(schema_type_id, section_id));
493     return section_metadata->quantization_type;
494   }
495 
496   // Syncs all the data changes to disk.
497   //
498   // Returns:
499   //   OK on success
500   //   INTERNAL on I/O errors.
501   libtextclassifier3::Status PersistToDisk();
502 
503   // Recomputes the combined checksum of components of the schema store and
504   // updates the header.
505   //
506   // Returns:
507   //   - the checksum on success
508   //   - INTERNAL on I/O errors.
509   libtextclassifier3::StatusOr<Crc32> UpdateChecksum();
510 
511   // Recomputes the combined checksum of components of the schema store. Does
512   // NOT update the header.
513   //
514   // Returns:
515   //   - the checksum on success
516   //   - INTERNAL on I/O errors.
517   libtextclassifier3::StatusOr<Crc32> GetChecksum() const;
518 
519   // Returns:
520   //   - On success, the section metadata list for the specified schema type
521   //   - NOT_FOUND if the schema type is not present in the schema
522   libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
523   GetSectionMetadata(const std::string& schema_type) const;
524 
525   // Gets the index of the given |property_path|, where the index N means that
526   // it is the Nth scorable property path in the schema config of the given
527   // |schema_type_id|, in lexicographical order.
528   //
529   // Returns:
530   //   - Index on success
531   //   - std::nullopt if the |property_path| doesn't point to a scorable
532   //     property under the |schema_type_id|
533   //   - FAILED_PRECONDITION if the schema hasn't been set yet
534   //   - INVALID_ARGUMENT if |schema_type_id| is invalid
535   libtextclassifier3::StatusOr<std::optional<int>> GetScorablePropertyIndex(
536       SchemaTypeId schema_type_id, std::string_view property_path) const;
537 
538   // Returns the list of ScorablePropertyInfo for the given |schema_type_id|,
539   // in lexicographical order of its property path.
540   //
541   // Returns:
542   //   - Vector of scorable property info on success. The vector can be empty
543   //     if no scorable property is found under the schema config of
544   //     |schema_type_id|.
545   //   - FAILED_PRECONDITION if the schema hasn't been set yet
546   //   - INVALID_ARGUMENT if |schema_type_id| is invalid
547   libtextclassifier3::StatusOr<
548       const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*>
549   GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const;
550 
551   // Calculates the StorageInfo for the Schema Store.
552   //
553   // If an IO error occurs while trying to calculate the value for a field, then
554   // that field will be set to -1.
555   SchemaStoreStorageInfoProto GetStorageInfo() const;
556 
557   // Get debug information for the schema store.
558   //
559   // Returns:
560   //   SchemaDebugInfoProto on success
561   //   INTERNAL_ERROR on IO errors, crc compute error
562   libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const;
563 
564   // Expands the provided type_property_masks into a vector of
565   // ExpandedTypePropertyMasks to account for polymorphism. If both a parent
566   // type and one of its child type appears in the masks, the parent type's
567   // paths will be merged into the child's.
568   //
569   // For example, assume that we have two schema types A and B, and we have
570   // - A is the parent type of B
571   // - Paths of A: {P1, P2}
572   // - Paths of B: {P3}
573   //
574   // Then, we will have the following in the result.
575   // - Expanded paths of A: {P1, P2}
576   // - Expanded paths of B: {P1, P2, P3}
577   std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks(
578       const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
579       const;
580 
581  private:
582   // Factory function to create a SchemaStore and set its schema. The created
583   // instance does not take ownership of any input components and all pointers
584   // must refer to valid objects that outlive the created SchemaStore instance.
  // The base_dir must already exist. No schema may have been set in base_dir
  // prior to this.
587   //
588   // Returns:
589   //   A SchemaStore on success
590   //   FAILED_PRECONDITION on any null pointer input or if there has already
591   //       been a schema set for this path.
592   //   INTERNAL_ERROR on any IO errors
593   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
594       const Filesystem* filesystem, const std::string& base_dir,
595       const Clock* clock, const FeatureFlags* feature_flags,
596       SchemaProto schema);
597 
598   // Use SchemaStore::Create instead.
599   explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
600                        const Clock* clock, const FeatureFlags* feature_flags);
601 
602   // Deletes the overlay schema and ensures that the Header is correctly set.
603   //
604   // RETURNS:
605   //   OK on success
606   //   INTERNAL_ERROR on any IO errors
607   static libtextclassifier3::Status DiscardOverlaySchema(
608       const Filesystem* filesystem, const std::string& base_dir,
609       Header& header);
610 
611   // Handles the overlay schema after a version change by deleting it if it is
612   // no longer compatible with the new version.
613   //
614   // Requires: base_dir exists.
615   //
616   // Returns:
617   //   OK on success
618   //   INTERNAL_ERROR on any IO errors
619   static libtextclassifier3::Status HandleOverlaySchemaForVersionChange(
620       const Filesystem* filesystem, const std::string& base_dir,
621       version_util::StateChange version_state_change, int32_t new_version);
622 
623   // Populates the schema database field in the schema proto that is stored in
624   // the input schema file.
625   //
626   // Returns:
627   //   OK on success or nothing to migrate
628   //   INTERNAL_ERROR on IO error
629   static libtextclassifier3::Status PopulateSchemaDatabaseFieldForSchemaFile(
630       const Filesystem* filesystem, const std::string& schema_filename);
631 
632   // Verifies that there is no error retrieving a previously set schema. Then
633   // initializes like normal.
634   //
635   // Returns:
636   //   OK on success
637   //   INTERNAL_ERROR on IO error
638   libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
639 
640   // First, blindly writes new_schema to the schema_file. Then initializes like
641   // normal.
642   //
643   // Returns:
644   //   OK on success
645   //   INTERNAL_ERROR on IO error
646   //   FAILED_PRECONDITION if there is already a schema set for the schema_file.
647   libtextclassifier3::Status Initialize(SchemaProto new_schema);
648 
649   // Handles initializing the SchemaStore and regenerating any data if needed.
650   //
651   // Returns:
652   //   OK on success
653   //   INTERNAL_ERROR on IO error
654   libtextclassifier3::Status InitializeInternal(
655       bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats);
656 
657   // Creates sub-components and verifies the integrity of each sub-component.
658   //
659   // Returns:
660   //   OK on success
661   //   INTERNAL_ERROR on IO error
662   libtextclassifier3::Status InitializeDerivedFiles();
663 
664   // Populates any derived data structures off of the schema.
665   //
666   // Returns:
667   //   OK on success
668   //   NOT_FOUND_ERROR if a schema proto has not been set
669   //   INTERNAL_ERROR on any IO errors
670   libtextclassifier3::Status RegenerateDerivedFiles(
671       bool create_overlay_if_necessary);
672 
673   // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_.
674   //
675   // Returns:
676   //   OK on success
677   //   NOT_FOUND_ERROR if a schema proto has not been set
678   //   INTERNAL_ERROR on any IO errors
679   libtextclassifier3::Status BuildInMemoryCache();
680 
681   // Update and replace the header file. Creates the header file if it doesn't
682   // exist.
683   //
684   // Returns:
685   //   OK on success
686   //   INTERNAL on I/O error
687   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
688 
689   // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying
690   // file, and re-creates a new instance of the schema_type_mapper_. Does not
691   // populate the schema_type_mapper_.
692   //
693   // Returns any IO errors.
694   libtextclassifier3::Status ResetSchemaTypeMapper();
695 
696   // Creates a new schema store with new_schema and then swaps that new schema
697   // store with the existing one. This function guarantees that either: this
698   // instance will be fully updated to the new schema or no changes will take
699   // effect.
700   //
701   // Returns:
702   //   OK on success
703   //   INTERNAL on I/O error.
704   libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema);
705 
CheckSchemaSet()706   libtextclassifier3::Status CheckSchemaSet() const {
707     return has_schema_successfully_set_
708                ? libtextclassifier3::Status::OK
709                : absl_ports::FailedPreconditionError("Schema not set yet.");
710   }
711 
712   // Correctly loads the Header, schema_file_ and (if present) the
713   // overlay_schema_file_.
714   //
715   // If feature_flags_->release_backup_schema_file_after_initialization() is
716   // true, then schema_file_ will be released if the overlay_schema_file_ is
717   // present.
      //
      // NOTE(review): ResetSchemaFileIfNeeded() and the comment on schema_file_
      // use the flag name release_backup_schema_file_if_overlay_present; confirm
      // which flag name is correct here.
718   //
719   // Returns:
720   //   - OK on success
721   //   - INTERNAL_ERROR if an IO error is encountered when reading the Header
722   //     or schemas,
723   //     or if an invalid schema configuration is present.
724   libtextclassifier3::Status LoadSchema();
725 
726   // Resets the schema_file_'s cached FileBackedProto instance if needed.
727   //
728   // This is the case if the overlay_schema_file_ is present and
729   // feature_flags_->release_backup_schema_file_if_overlay_present is true.
ResetSchemaFileIfNeeded()730   void ResetSchemaFileIfNeeded() {
731     if (feature_flags_->release_backup_schema_file_if_overlay_present() &&
732         overlay_schema_file_ != nullptr) {
733       ICING_VLOG(2)
734           << "Freeing schema store's base schema file's "
735              "FileBackedProto instance since overlay_schema_file_ is present.";
736       schema_file_.ReleaseCachedSchemaFile();
737     }
738   }
739 
740   // Sets the schema for a database for the first time.
741   //
742   // Note that when schema database is disabled, this function sets the entire
743   // schema, with all types under the default empty database.
744   //
      // NOTE(review): `ignore_errors_and_delete_documents` is not described here
      // - confirm its semantics against the definition.
      //
745   // Requires:
746   //   - `new_schema` is valid according to `ValidateSchemaDatabase`
747   //
748   // Returns:
749   //   - SetSchemaResult that indicates if the new schema can be set.
750   //   - INTERNAL_ERROR on any IO errors.
751   //   - INVALID_ARGUMENT_ERROR if the new schema is invalid.
752   libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
753   SetInitialSchemaForDatabase(SchemaProto new_schema,
754                               const std::string& database,
755                               bool ignore_errors_and_delete_documents);
756 
757   // Sets the schema for a database, overriding any existing schema for that
758   // database.
759   //
760   // Note that when schema database is disabled, this function sets and
761   // overrides the entire schema.
762   //
      // NOTE(review): `ignore_errors_and_delete_documents` is not described here
      // - confirm its semantics against the definition.
      //
763   // Requires:
764   //   - `new_schema` and `database` are valid according to
765   //     `ValidateSchemaDatabase(new_schema, database)`
766   //   - Types in `new_schema` and `old_schema` all belong to the provided
767   //     database.
768   //     - The old schema is guaranteed to contain types from exactly one
769   //       database when schema database is enabled, because it was obtained
770   //       using `GetSchema(database)`.
771   //
772   // Returns:
773   //   - SetSchemaResult that encapsulates the differences between the old and
774   //     new schema, as well as if the new schema can be set.
775   //   - INTERNAL_ERROR on any IO errors.
776   //   - INVALID_ARGUMENT_ERROR if the schema is invalid, or if there are
777   //     mismatches between the schema databases.
778   libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
779   SetSchemaWithDatabaseOverride(SchemaProto new_schema,
780                                 const SchemaProto& old_schema,
781                                 const std::string& database,
782                                 bool ignore_errors_and_delete_documents);
783 
784   // Initial validation on the SchemaProto for SetSchema. This is intended as a
785   // preliminary check before any expensive operations are performed during
786   // `SetSchema::Validate`.
787   //
788   // Note that when schema database is disabled, any schema input is valid.
789   //
      // NOTE(review): earlier wording said the schema's database (or an empty
      // string when schema database is disabled) is returned, but the declared
      // return type is a plain Status; only success or failure is reported.
790   //
791   // Checks that:
792   // - The new schema only contains types from a single database, which matches
793   //   the provided database.
794   // - The schema's type names are not already in use in other databases. This
795   //   is done outside of `SchemaUtil::Validate` because we need to know all
796   //   existing type names, which are stored in the SchemaStore and not known
797   //   to SchemaUtil.
798   //
799   // Returns:
800   //   - OK on success
801   //   - INVALID_ARGUMENT_ERROR if new_schema.types's databases do not match the
802   //     provided database.
803   //   - ALREADY_EXISTS_ERROR if new_schema's types names are not unique
804   libtextclassifier3::Status ValidateSchemaDatabase(
805       const SchemaProto& new_schema, const std::string& database) const;
806 
807   // Returns a SchemaProto representing the full schema, which is a combination
808   // of the existing schema and the input database schema. If
809   // input_database_schema is an empty proto, all types belonging to
810   // database_to_update are left out of the returned proto.
      //
      // NOTE(review): this method is declared const, so presumably the combined
      // proto is only returned to the caller and nothing is persisted here -
      // confirm against the definition.
811   //
812   // For the database being updated by the input database schema:
813   // - If the existing schema does not contain the database, the input types
814   //   are appended to the end of the SchemaProto, without changing the order
815   //   of the existing schema types.
816   // - Otherwise, the existing schema types are replaced with types from the
817   //   input database schema in their original position in the existing
818   //   SchemaProto.
819   //   - Types from input_database_schema are added in the order in which they
820   //     appear.
821   //   - If more types are added to the database, the additional types are
822   //     appended at the end of the SchemaProto, without changing the order of
823   //     existing types from unaffected databases.
824   //
825   // Requires:
826   //   - input_database_schema is valid according to `ValidateSchemaDatabase`
827   //     and `SchemaUtil::Validate`.
828   //
829   // Returns:
830   //   - SchemaProto on success
831   //   - INTERNAL_ERROR on any IO errors, or if the schema store was not
832   //     previously initialized properly.
833   //   - INVALID_ARGUMENT_ERROR if the input schema contains types from multiple
834   //     databases.
835   libtextclassifier3::StatusOr<SchemaProto> GetFullSchemaProtoWithUpdatedDb(
836       SchemaProto input_database_schema,
837       const std::string& database_to_update) const;
838 
      // NOTE(review): filesystem_ and clock_ are raw pointers and are presumably
      // not owned either (as stated for feature_flags_) - confirm.
839   const Filesystem* filesystem_;
840   std::string base_dir_;
841   const Clock* clock_;
842   const FeatureFlags* feature_flags_;  // Does not own.
843 
844   // Used internally to indicate whether the class has been successfully
845   // initialized with a valid schema. Will be false if Initialize failed or no
846   // schema has ever been set. Read by CheckSchemaSet().
847   bool has_schema_successfully_set_ = false;
848 
849   // Wrapper class to store a cached schema file FileBackedProto instance and
850   // its checksum.
851   class SchemaFileCache {
852    public:
SchemaFileCache(const Filesystem * filesystem,const std::string & schema_file_path)853     explicit SchemaFileCache(const Filesystem* filesystem,
854                              const std::string& schema_file_path)
855         : filesystem_(filesystem), schema_file_path_(schema_file_path) {}
856     // Returns a reference to the proto read from the schema FileBackedProto.
857     //
858     // NOTE: The caller does NOT get ownership of the object returned and
859     // the returned object is only valid till a new version of the proto is
860     // written to the file.
861     //
862     // Returns NOT_FOUND if the file was empty or never written to.
863     // Returns INTERNAL_ERROR if an IO error or a corruption was encountered.
Read()864     libtextclassifier3::StatusOr<const SchemaProto*> Read() {
865       return GetCachedSchemaFile().Read();
866     }
867 
868     // Writes the new schema_proto to schema_file_ and updates the cached
869     // checksum.
870     //
871     // Returns: INTERNAL_ERROR if any IO error is encountered.
Write(std::unique_ptr<SchemaProto> schema_proto)872     libtextclassifier3::Status Write(
873         std::unique_ptr<SchemaProto> schema_proto) {
874       ICING_RETURN_IF_ERROR(
875           GetCachedSchemaFile().Write(std::move(schema_proto)));
876       ICING_ASSIGN_OR_RETURN(Crc32 checksum,
877                              GetCachedSchemaFile().GetChecksum());
878       checksum_ = std::make_unique<Crc32>(checksum);
879       return libtextclassifier3::Status::OK;
880     }
881 
882     // Sets the swapped_to_file_path for the cached schema_file_ instance and
883     // the schema_file_path_.
SetSwappedFilepath(std::string new_schema_file_path)884     void SetSwappedFilepath(std::string new_schema_file_path) {
885       if (schema_file_ != nullptr) {
886         schema_file_->SetSwappedFilepath(new_schema_file_path);
887       }
888       schema_file_path_ = std::move(new_schema_file_path);
889     }
890 
891     // Releases the cached schema_file_ FileBackedProto instance.
ReleaseCachedSchemaFile()892     void ReleaseCachedSchemaFile() { schema_file_.reset(); }
893 
GetChecksum()894     libtextclassifier3::StatusOr<Crc32> GetChecksum() {
895       if (checksum_ == nullptr) {
896         ICING_ASSIGN_OR_RETURN(Crc32 checksum,
897                                GetCachedSchemaFile().GetChecksum());
898         checksum_ = std::make_unique<Crc32>(std::move(checksum));
899       }
900       return *checksum_;
901     }
902 
903    private:
GetCachedSchemaFile()904     FileBackedProto<SchemaProto>& GetCachedSchemaFile() {
905       if (schema_file_ == nullptr) {
906         schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
907             *filesystem_, schema_file_path_);
908       }
909       return *schema_file_;
910     }
911 
912     const Filesystem* filesystem_;
913     std::string schema_file_path_;
914     std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_;
915     std::unique_ptr<Crc32> checksum_;
916   };
917 
918   // Caches a FileBackedProto instance and the checksum for the schema file.
919   //
920   // If the overlay_schema_file_ is present and
921   // feature_flags_->release_backup_schema_file_if_overlay_present is true, then
922   // the cached schema FileBackedProto instance should be released and reloaded
923   // only during mutating SetSchema operations.
      //
      // SchemaFileCache's Read() and GetChecksum() are non-const because they
      // may create the instance or fill the cache; this member is presumably
      // mutable so that const methods of this class can still call them.
924   mutable SchemaFileCache schema_file_;
925 
926   // This schema holds the definition of any schema types that are not
927   // compatible with older versions of Icing code. A null pointer means that no
      // overlay is present (see ResetSchemaFileIfNeeded()).
928   std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_;
929 
930   // Maps schema types to a densely-assigned unique id.
931   std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
932 
933   // Maps schema type ids to the corresponding schema type. This is an inverse
934   // map of schema_type_mapper_.
935   std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_;
936 
937   // A hash map of (database -> vector of type config names in the database).
938   //
939   // We use a vector instead of a set because we need to preserve the order of
940   // the types (i.e. the order in which they appear in the input SchemaProto
941   // during SetSchema), so that we can return the correct SchemaProto for
942   // GetSchema.
943   //
944   // This keeps track of the type configs defined in each database, which allows
945   // schema operations to be performed on a per-database basis.
946   std::unordered_map<std::string, std::vector<std::string>> database_type_map_;
947 
948   // A hash map of (type config name -> type config), allows faster lookup of
949   // type config in schema. The O(1) type config access makes schema-related and
950   // section-related operations faster. Built by BuildInMemoryCache().
951   SchemaUtil::TypeConfigMap type_config_map_;
952 
953   // Maps from each type id to all of its subtype ids. Built by
      // BuildInMemoryCache().
      //
954   // T2 is a subtype of T1, if and only if one of the following conditions is
955   // met:
956   // - T2 is T1
957   // - T2 extends T1
958   // - There exists a type U, such that T2 is a subtype of U, and U is a subtype
959   //   of T1
      //
      // By the first condition, every type counts as a subtype of itself.
960   std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>>
961       schema_subtype_id_map_;
962 
963   // Manager of section (indexable property) and joinable property related
964   // metadata for all Schemas. Built by BuildInMemoryCache().
965   std::unique_ptr<const SchemaTypeManager> schema_type_manager_;
966 
967   // Used to cache and manage the schema's scorable properties.
968   std::unique_ptr<ScorablePropertyManager> scorable_property_manager_;
969 
      // NOTE(review): presumably the in-memory copy of the Header that
      // LoadSchema() loads and UpdateHeader() writes out - confirm.
970   std::unique_ptr<Header> header_;
971 };
972 
973 }  // namespace lib
974 }  // namespace icing
975 
976 #endif  // ICING_SCHEMA_SCHEMA_STORE_H_
977