• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_
16 #define ICING_SCHEMA_SCHEMA_STORE_H_
17 
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <vector>
27 
28 #include "icing/text_classifier/lib3/utils/base/status.h"
29 #include "icing/text_classifier/lib3/utils/base/statusor.h"
30 #include "icing/absl_ports/canonical_errors.h"
31 #include "icing/file/file-backed-proto.h"
32 #include "icing/file/filesystem.h"
33 #include "icing/file/version-util.h"
34 #include "icing/proto/debug.pb.h"
35 #include "icing/proto/document.pb.h"
36 #include "icing/proto/logging.pb.h"
37 #include "icing/proto/schema.pb.h"
38 #include "icing/proto/search.pb.h"
39 #include "icing/proto/storage.pb.h"
40 #include "icing/schema/joinable-property.h"
41 #include "icing/schema/schema-type-manager.h"
42 #include "icing/schema/schema-util.h"
43 #include "icing/schema/section.h"
44 #include "icing/store/document-filter-data.h"
45 #include "icing/store/key-mapper.h"
46 #include "icing/util/clock.h"
47 #include "icing/util/crc32.h"
48 
49 namespace icing {
50 namespace lib {
51 
52 // Holds the ground truth schema proto. Tracks compatible changes to the schema
53 // and will update any derived data based on the schema proto, such as Sections,
54 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
55 // the most up-to-date data, callers should not save instances themselves and
56 // should always call Get* from the SchemaStore.
57 class SchemaStore {
58  public:
59   struct LegacyHeader {
60     // Holds the magic as a quick sanity check against file corruption.
61     int32_t magic;
62 
63     // Checksum of the SchemaStore's sub-component's checksums.
64     uint32_t checksum;
65   };
66 
67   class Header {
68    public:
69     static constexpr int32_t kMagic = 0x72650d0a;
70 
Header()71     explicit Header()
72         : magic_(kMagic),
73           checksum_(0),
74           overlay_created_(false),
75           min_overlay_version_compatibility_(
76               std::numeric_limits<int32_t>::max()) {
77       memset(overlay_created_padding_, 0, kOverlayCreatedPaddingSize);
78       memset(padding_, 0, kPaddingSize);
79     }
80 
81     // RETURNS:
82     //   - On success, a valid Header instance
83     //   - NOT_FOUND if header file doesn't exist
84     //   - INTERNAL if unable to read header
85     static libtextclassifier3::StatusOr<Header> Read(
86         const Filesystem* filesystem, const std::string& path);
87 
88     libtextclassifier3::Status Write(const Filesystem* filesystem,
89                                      const std::string& path);
90 
magic()91     int32_t magic() const { return magic_; }
92 
checksum()93     uint32_t checksum() const { return checksum_; }
set_checksum(uint32_t checksum)94     void set_checksum(uint32_t checksum) { checksum_ = checksum; }
95 
overlay_created()96     bool overlay_created() const { return overlay_created_; }
97 
min_overlay_version_compatibility()98     int32_t min_overlay_version_compatibility() const {
99       return min_overlay_version_compatibility_;
100     }
101 
SetOverlayInfo(bool overlay_created,int32_t min_overlay_version_compatibility)102     void SetOverlayInfo(bool overlay_created,
103                         int32_t min_overlay_version_compatibility) {
104       overlay_created_ = overlay_created;
105       min_overlay_version_compatibility_ = min_overlay_version_compatibility;
106     }
107 
108    private:
109     // Holds the magic as a quick sanity check against file corruption.
110     int32_t magic_;
111 
112     // Checksum of the SchemaStore's sub-component's checksums.
113     uint32_t checksum_;
114 
115     bool overlay_created_;
116     // Three bytes of padding due to the fact that
117     // min_overlay_version_compatibility_ has an alignof() == 4 and the offset
118     // of overlay_created_padding_ == 9.
119     static constexpr int kOverlayCreatedPaddingSize = 3;
120     uint8_t overlay_created_padding_[kOverlayCreatedPaddingSize];
121 
122     int32_t min_overlay_version_compatibility_;
123 
124     static constexpr int kPaddingSize = 1008;
125     // Padding exists just to reserve space for additional values.
126     uint8_t padding_[kPaddingSize];
127   };
128   static_assert(sizeof(Header) == 1024);
129 
130   // Holds information on what may have been affected by the new schema. This is
131   // generally data that other classes may depend on from the SchemaStore,
132   // so that we can know if we should go update those classes as well.
133   struct SetSchemaResult {
134     // Whether we are able to write the schema as determined by SetSchema's
135     // arguments. This boolean reflects SetSchema's logic, and does not reflect
136     // any system level IO errors that may prevent the schema from being written
137     // to file.
138     bool success = false;
139 
140     // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
141     //   1. Schema types are added in the middle of the SchemaProto
142     //   2. Schema types are removed from the middle of the SchemaProto
143     //   3. Schema types are reordered in the SchemaProto
144     //
145     // SchemaTypeIds are not changed if schema types are added/removed to the
146     // end of the SchemaProto.
147     std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
148 
149     // Schema types that have been removed from the new schema. Represented by
150     // the `schema_type` field in the SchemaTypeConfigProto.
151     std::unordered_set<std::string> schema_types_deleted_by_name;
152 
153     // Schema types that have been removed from the new schema. Represented by
154     // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old*
155     // schema.
156     std::unordered_set<SchemaTypeId> schema_types_deleted_by_id;
157 
158     // Schema types whose SchemaTypeConfigProto has changed in an incompatible
159     // manner in the new schema. Compatibility determined in
160     // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type`
161     // field in the SchemaTypeConfigProto.
162     std::unordered_set<std::string> schema_types_incompatible_by_name;
163 
164     // Schema types whose SchemaTypeConfigProto has changed in an incompatible
165     // manner in the new schema. Compatibility determined in
166     // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
167     // assigned to this SchemaTypeConfigProto in the *old* schema.
168     std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;
169 
170     // Schema types that were added in the new schema. Represented by the
171     // `schema_type` field in the SchemaTypeConfigProto.
172     std::unordered_set<std::string> schema_types_new_by_name;
173 
174     // Schema types that were changed in a way that was backwards compatible and
175     // didn't invalidate the index. Represented by the `schema_type` field in
176     // the SchemaTypeConfigProto.
177     std::unordered_set<std::string>
178         schema_types_changed_fully_compatible_by_name;
179 
180     // Schema types that were changed in a way that was backwards compatible,
181     // but invalidated the index. Represented by the `schema_type` field in the
182     // SchemaTypeConfigProto.
183     std::unordered_set<std::string> schema_types_index_incompatible_by_name;
184 
185     // Schema types that were changed in a way that was backwards compatible,
186     // but invalidated the joinable cache. Represented by the `schema_type`
187     // field in the SchemaTypeConfigProto.
188     std::unordered_set<std::string> schema_types_join_incompatible_by_name;
189   };
190 
191   struct ExpandedTypePropertyMask {
192     std::string schema_type;
193     std::unordered_set<std::string> paths;
194   };
195 
196   static constexpr std::string_view kSchemaTypeWildcard = "*";
197 
198   // Factory function to create a SchemaStore which does not take ownership
199   // of any input components, and all pointers must refer to valid objects that
200   // outlive the created SchemaStore instance. The base_dir must already exist.
201   // There does not need to be an existing schema already.
202   //
203   // If initialize_stats is present, the fields related to SchemaStore will be
204   // populated.
205   //
206   // Returns:
207   //   A SchemaStore on success
208   //   FAILED_PRECONDITION on any null pointer input
209   //   INTERNAL_ERROR on any IO errors
210   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
211       const Filesystem* filesystem, const std::string& base_dir,
212       const Clock* clock, InitializeStatsProto* initialize_stats = nullptr);
213 
214   // Migrates schema files (backup v.s. new schema) according to version state
215   // change.
216   //
217   // Returns:
218   //   OK on success or nothing to migrate
219   static libtextclassifier3::Status MigrateSchema(
220       const Filesystem* filesystem, const std::string& base_dir,
221       version_util::StateChange version_state_change, int32_t new_version);
222 
223   // Discards all derived data in the schema store.
224   //
225   // Returns:
226   //   OK on success or nothing to discard
227   //   INTERNAL_ERROR on any I/O errors
228   static libtextclassifier3::Status DiscardDerivedFiles(
229       const Filesystem* filesystem, const std::string& base_dir);
230 
231   SchemaStore(SchemaStore&&) = default;
232   SchemaStore& operator=(SchemaStore&&) = default;
233 
234   SchemaStore(const SchemaStore&) = delete;
235   SchemaStore& operator=(const SchemaStore&) = delete;
236 
237   // Persists and updates checksum of subcomponents.
238   ~SchemaStore();
239 
240   // Retrieve the current schema if it exists.
241   //
242   // Returns:
243   //   SchemaProto* if exists
244   //   INTERNAL_ERROR on any IO errors
245   //   NOT_FOUND_ERROR if a schema hasn't been set before
246   libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const;
247 
248   // Update our current schema if it's compatible. Does not accept incompatible
249   // schema. Compatibility rules defined by
250   // SchemaUtil::ComputeCompatibilityDelta.
251   //
252   // If ignore_errors_and_delete_documents is set to true, then incompatible
253   // schema are allowed and we'll force set the schema, meaning
254   // SetSchemaResult.success will always be true.
255   //
256   // Returns:
257   //   SetSchemaResult that encapsulates the differences between the old and new
258   //   schema, as well as if the new schema can be set.
259   //   INTERNAL_ERROR on any IO errors
260   libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
261       const SchemaProto& new_schema,
262       bool ignore_errors_and_delete_documents,
263       bool allow_circular_schema_definitions);
264   libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
265       SchemaProto&& new_schema,
266       bool ignore_errors_and_delete_documents,
267       bool allow_circular_schema_definitions);
268 
269   // Get the SchemaTypeConfigProto of schema_type name.
270   //
271   // Returns:
272   //   SchemaTypeConfigProto on success
273   //   FAILED_PRECONDITION if schema hasn't been set yet
274   //   NOT_FOUND if schema type name doesn't exist
275   //   INTERNAL on any I/O errors
276   libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
277   GetSchemaTypeConfig(std::string_view schema_type) const;
278 
279   // Returns the schema type of the passed in SchemaTypeId
280   //
281   // Returns:
282   //   schema type on success
283   //   FAILED_PRECONDITION if schema hasn't been set yet
284   //   INVALID_ARGUMENT if schema type id is invalid
285   libtextclassifier3::StatusOr<const std::string*> GetSchemaType(
286       SchemaTypeId schema_type_id) const;
287 
288   // Returns the SchemaTypeId of the passed in schema type
289   //
290   // Returns:
291   //   SchemaTypeId on success
292   //   FAILED_PRECONDITION if schema hasn't been set yet
293   //   NOT_FOUND_ERROR if we don't know about the schema type
294   //   INTERNAL_ERROR on IO error
295   libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId(
296       std::string_view schema_type) const;
297 
298   // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also
299   // include child types.
300   //
301   // Returns:
302   //   A set of SchemaTypeId on success
303   //   FAILED_PRECONDITION if schema hasn't been set yet
304   //   NOT_FOUND_ERROR if we don't know about the schema type
305   //   INTERNAL_ERROR on IO error
306   libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
307   GetSchemaTypeIdsWithChildren(std::string_view schema_type) const;
308 
309   // Returns the SectionMetadata associated with the SectionId that's in the
310   // SchemaTypeId.
311   //
312   // Returns:
313   //   Valid pointer to SectionMetadata on success
314   //   FAILED_PRECONDITION if schema hasn't been set yet
315   //   INVALID_ARGUMENT if schema type id or section id is invalid
316   libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
317       SchemaTypeId schema_type_id, SectionId section_id) const;
318 
319   // Returns true if a property is defined in the said schema, regardless of
320   // whether it is indexed or not.
321   bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,
322                                  const std::string& property) const;
323 
324   // Extracts all sections of different types from the given document and group
325   // them by type.
326   // - Each Section vector is sorted by section Id in ascending order. The
327   //   sorted section ids may not be continuous, since not all sections are
328   //   present in the document.
329   // - Sections with empty content won't be returned.
330   // - For example, we may extract:
331   //   string_sections: [2, 7, 10]
332   //   integer_sections: [3, 5, 8]
333   //
334   // Returns:
335   //   A SectionGroup instance on success
336   //   FAILED_PRECONDITION if schema hasn't been set yet
337   //   NOT_FOUND if type config name of document not found
338   libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
339       const DocumentProto& document) const;
340 
341   // Returns the JoinablePropertyMetadata associated with property_path that's
342   // in the SchemaTypeId.
343   //
344   // Returns:
345   //   Valid pointer to JoinablePropertyMetadata on success
346   //   nullptr if property_path doesn't exist (or is not joinable) in the
347   //     joinable metadata list of the schema
348   //   FAILED_PRECONDITION if schema hasn't been set yet
349   //   INVALID_ARGUMENT if schema type id is invalid
350   libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
351   GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
352                               const std::string& property_path) const;
353 
354   // Extracts all joinable property contents of different types from the given
355   // document and group them by joinable value type.
356   // - Joinable properties are sorted by joinable property id in ascending
357   //   order. The sorted joinable property ids may not be continuous, since not
358   //   all joinable properties are present in the document.
359   // - Joinable property ids start from 0.
360   // - Joinable properties with empty content won't be returned.
361   //
362   // Returns:
363   //   A JoinablePropertyGroup instance on success
364   //   FAILED_PRECONDITION if schema hasn't been set yet
365   //   NOT_FOUND if the type config name of document not found
366   libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
367       const DocumentProto& document) const;
368 
369   // Syncs all the data changes to disk.
370   //
371   // Returns:
372   //   OK on success
373   //   INTERNAL on I/O errors.
374   libtextclassifier3::Status PersistToDisk();
375 
376   // Computes the combined checksum of the schema store - includes the ground
377   // truth and all derived files.
378   //
379   // Returns:
380   //   Combined checksum on success
381   //   INTERNAL_ERROR on compute error
382   libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
383 
384   // Returns:
385   //   - On success, the section metadata list for the specified schema type
386   //   - NOT_FOUND if the schema type is not present in the schema
387   libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
388   GetSectionMetadata(const std::string& schema_type) const;
389 
390   // Calculates the StorageInfo for the Schema Store.
391   //
392   // If an IO error occurs while trying to calculate the value for a field, then
393   // that field will be set to -1.
394   SchemaStoreStorageInfoProto GetStorageInfo() const;
395 
396   // Get debug information for the schema store.
397   //
398   // Returns:
399   //   SchemaDebugInfoProto on success
400   //   INTERNAL_ERROR on IO errors, crc compute error
401   libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const;
402 
403   // Expands the provided type_property_masks into a vector of
404   // ExpandedTypePropertyMasks to account for polymorphism. If both a parent
405   // type and one of its child type appears in the masks, the parent type's
406   // paths will be merged into the child's.
407   //
408   // For example, assume that we have two schema types A and B, and we have
409   // - A is the parent type of B
410   // - Paths of A: {P1, P2}
411   // - Paths of B: {P3}
412   //
413   // Then, we will have the following in the result.
414   // - Expanded paths of A: {P1, P2}
415   // - Expanded paths of B: {P1, P2, P3}
416   std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks(
417       const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
418       const;
419 
420  private:
421   // Factory function to create a SchemaStore and set its schema. The created
422   // instance does not take ownership of any input components and all pointers
423   // must refer to valid objects that outlive the created SchemaStore instance.
424   // The base_dir must already exist. No schema must have set in base_dir prior
425   // to this.
426   //
427   // Returns:
428   //   A SchemaStore on success
429   //   FAILED_PRECONDITION on any null pointer input or if there has already
430   //       been a schema set for this path.
431   //   INTERNAL_ERROR on any IO errors
432   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
433       const Filesystem* filesystem, const std::string& base_dir,
434       const Clock* clock, SchemaProto schema);
435 
436   // Use SchemaStore::Create instead.
437   explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
438                        const Clock* clock);
439 
440   // Deletes the overlay schema and ensures that the Header is correctly set.
441   //
442   // RETURNS:
443   //   OK on success
444   //   INTERNAL_ERROR on any IO errors
445   static libtextclassifier3::Status DiscardOverlaySchema(
446       const Filesystem* filesystem, const std::string& base_dir,
447       Header& header);
448 
449   // Verifies that there is no error retrieving a previously set schema. Then
450   // initializes like normal.
451   //
452   // Returns:
453   //   OK on success
454   //   INTERNAL_ERROR on IO error
455   libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
456 
457   // First, blindly writes new_schema to the schema_file. Then initializes like
458   // normal.
459   //
460   // Returns:
461   //   OK on success
462   //   INTERNAL_ERROR on IO error
463   //   FAILED_PRECONDITION if there is already a schema set for the schema_file.
464   libtextclassifier3::Status Initialize(SchemaProto new_schema);
465 
466   // Handles initializing the SchemaStore and regenerating any data if needed.
467   //
468   // Returns:
469   //   OK on success
470   //   INTERNAL_ERROR on IO error
471   libtextclassifier3::Status InitializeInternal(
472       bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats);
473 
474   // Creates sub-components and verifies the integrity of each sub-component.
475   //
476   // Returns:
477   //   OK on success
478   //   INTERNAL_ERROR on IO error
479   libtextclassifier3::Status InitializeDerivedFiles();
480 
481   // Populates any derived data structures off of the schema.
482   //
483   // Returns:
484   //   OK on success
485   //   NOT_FOUND_ERROR if a schema proto has not been set
486   //   INTERNAL_ERROR on any IO errors
487   libtextclassifier3::Status RegenerateDerivedFiles(
488       bool create_overlay_if_necessary);
489 
490   // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_.
491   //
492   // Returns:
493   //   OK on success
494   //   NOT_FOUND_ERROR if a schema proto has not been set
495   //   INTERNAL_ERROR on any IO errors
496   libtextclassifier3::Status BuildInMemoryCache();
497 
498   // Update and replace the header file. Creates the header file if it doesn't
499   // exist.
500   //
501   // Returns:
502   //   OK on success
503   //   INTERNAL on I/O error
504   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
505 
506   // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying
507   // file, and re-creates a new instance of the schema_type_mapper_. Does not
508   // populate the schema_type_mapper_.
509   //
510   // Returns any IO errors.
511   libtextclassifier3::Status ResetSchemaTypeMapper();
512 
513   // Creates a new schema store with new_schema and then swaps that new schema
514   // store with the existing one. This function guarantees that either: this
515   // instance will be fully updated to the new schema or no changes will take
516   // effect.
517   //
518   // Returns:
519   //   OK on success
520   //   INTERNAL on I/O error.
521   libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema);
522 
CheckSchemaSet()523   libtextclassifier3::Status CheckSchemaSet() const {
524     return has_schema_successfully_set_
525                ? libtextclassifier3::Status::OK
526                : absl_ports::FailedPreconditionError("Schema not set yet.");
527   }
528 
529   // Correctly loads the Header, schema_file_ and (if present) the
530   // overlay_schema_file_.
531   // RETURNS:
532   //   - OK on success
533   //   - INTERNAL if an IO error is encountered when reading the Header or
534   //   schemas.
535   //     Or an invalid schema configuration is present.
536   libtextclassifier3::Status LoadSchema();
537 
538   const Filesystem* filesystem_;
539   std::string base_dir_;
540   const Clock* clock_;
541 
542   // Used internally to indicate whether the class has been successfully
543   // initialized with a valid schema. Will be false if Initialize failed or no
544   // schema has ever been set.
545   bool has_schema_successfully_set_ = false;
546 
547   // Cached schema
548   std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_;
549 
550   // This schema holds the definition of any schema types that are not
551   // compatible with older versions of Icing code.
552   std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_;
553 
554   // Maps schema types to a densely-assigned unique id.
555   std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
556 
557   // Maps schema type ids to the corresponding schema type. This is an inverse
558   // map of schema_type_mapper_.
559   std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_;
560 
561   // A hash map of (type config name -> type config), allows faster lookup of
562   // type config in schema. The O(1) type config access makes schema-related and
563   // section-related operations faster.
564   SchemaUtil::TypeConfigMap type_config_map_;
565 
566   // Maps from each type id to all of its subtype ids.
567   // T2 is a subtype of T1, if and only if one of the following conditions is
568   // met:
569   // - T2 is T1
570   // - T2 extends T1
571   // - There exists a type U, such that T2 is a subtype of U, and U is a subtype
572   //   of T1
573   std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>>
574       schema_subtype_id_map_;
575 
576   // Manager of section (indexable property) and joinable property related
577   // metadata for all Schemas.
578   std::unique_ptr<const SchemaTypeManager> schema_type_manager_;
579 
580   std::unique_ptr<Header> header_;
581 };
582 
583 }  // namespace lib
584 }  // namespace icing
585 
586 #endif  // ICING_SCHEMA_SCHEMA_STORE_H_
587