• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_
16 #define ICING_SCHEMA_SCHEMA_STORE_H_
17 
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <vector>
27 
28 #include "icing/text_classifier/lib3/utils/base/status.h"
29 #include "icing/text_classifier/lib3/utils/base/statusor.h"
30 #include "icing/absl_ports/canonical_errors.h"
31 #include "icing/file/file-backed-proto.h"
32 #include "icing/file/filesystem.h"
33 #include "icing/file/version-util.h"
34 #include "icing/proto/debug.pb.h"
35 #include "icing/proto/document.pb.h"
36 #include "icing/proto/logging.pb.h"
37 #include "icing/proto/schema.pb.h"
38 #include "icing/proto/search.pb.h"
39 #include "icing/proto/storage.pb.h"
40 #include "icing/schema/joinable-property.h"
41 #include "icing/schema/schema-type-manager.h"
42 #include "icing/schema/schema-util.h"
43 #include "icing/schema/section.h"
44 #include "icing/store/document-filter-data.h"
45 #include "icing/store/key-mapper.h"
46 #include "icing/util/clock.h"
47 #include "icing/util/crc32.h"
48 
49 namespace icing {
50 namespace lib {
51 
52 // Holds the ground truth schema proto. Tracks compatible changes to the schema
53 // and will update any derived data based on the schema proto, such as Sections,
54 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
55 // the most up-to-date data, callers should not save instances themselves and
56 // should always call Get* from the SchemaStore.
57 class SchemaStore {
58  public:
59   struct LegacyHeader {
60     // Holds the magic as a quick sanity check against file corruption.
61     int32_t magic;
62 
63     // Checksum of the SchemaStore's sub-component's checksums.
64     uint32_t checksum;
65   };
66 
67   class Header {
68    public:
69     static constexpr int32_t kMagic = 0x72650d0a;
70 
Header()71     explicit Header()
72         : magic_(kMagic),
73           checksum_(0),
74           overlay_created_(false),
75           min_overlay_version_compatibility_(
76               std::numeric_limits<int32_t>::max()) {
77       memset(overlay_created_padding_, 0, kOverlayCreatedPaddingSize);
78       memset(padding_, 0, kPaddingSize);
79     }
80 
81     // RETURNS:
82     //   - On success, a valid Header instance
83     //   - NOT_FOUND if header file doesn't exist
84     //   - INTERNAL if unable to read header
85     static libtextclassifier3::StatusOr<Header> Read(
86         const Filesystem* filesystem, const std::string& path);
87 
88     libtextclassifier3::Status Write(const Filesystem* filesystem,
89                                      const std::string& path);
90 
magic()91     int32_t magic() const { return magic_; }
92 
checksum()93     uint32_t checksum() const { return checksum_; }
set_checksum(uint32_t checksum)94     void set_checksum(uint32_t checksum) { checksum_ = checksum; }
95 
overlay_created()96     bool overlay_created() const { return overlay_created_; }
97 
min_overlay_version_compatibility()98     int32_t min_overlay_version_compatibility() const {
99       return min_overlay_version_compatibility_;
100     }
101 
SetOverlayInfo(bool overlay_created,int32_t min_overlay_version_compatibility)102     void SetOverlayInfo(bool overlay_created,
103                         int32_t min_overlay_version_compatibility) {
104       overlay_created_ = overlay_created;
105       min_overlay_version_compatibility_ = min_overlay_version_compatibility;
106     }
107 
108    private:
109     // Holds the magic as a quick sanity check against file corruption.
110     int32_t magic_;
111 
112     // Checksum of the SchemaStore's sub-component's checksums.
113     uint32_t checksum_;
114 
115     bool overlay_created_;
116     // Three bytes of padding due to the fact that
117     // min_overlay_version_compatibility_ has an alignof() == 4 and the offset
118     // of overlay_created_padding_ == 9.
119     static constexpr int kOverlayCreatedPaddingSize = 3;
120     uint8_t overlay_created_padding_[kOverlayCreatedPaddingSize];
121 
122     int32_t min_overlay_version_compatibility_;
123 
124     static constexpr int kPaddingSize = 1008;
125     // Padding exists just to reserve space for additional values.
126     uint8_t padding_[kPaddingSize];
127   };
128   static_assert(sizeof(Header) == 1024);
129 
130   // Holds information on what may have been affected by the new schema. This is
131   // generally data that other classes may depend on from the SchemaStore,
132   // so that we can know if we should go update those classes as well.
133   struct SetSchemaResult {
134     // Whether we are able to write the schema as determined by SetSchema's
135     // arguments. This boolean reflects SetSchema's logic, and does not reflect
136     // any system level IO errors that may prevent the schema from being written
137     // to file.
138     bool success = false;
139 
140     // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
141     //   1. Schema types are added in the middle of the SchemaProto
142     //   2. Schema types are removed from the middle of the SchemaProto
143     //   3. Schema types are reordered in the SchemaProto
144     //
145     // SchemaTypeIds are not changed if schema types are added/removed to the
146     // end of the SchemaProto.
147     std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
148 
149     // Schema types that have been removed from the new schema. Represented by
150     // the `schema_type` field in the SchemaTypeConfigProto.
151     std::unordered_set<std::string> schema_types_deleted_by_name;
152 
153     // Schema types that have been removed from the new schema. Represented by
154     // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old*
155     // schema.
156     std::unordered_set<SchemaTypeId> schema_types_deleted_by_id;
157 
158     // Schema types whose SchemaTypeConfigProto has changed in an incompatible
159     // manner in the new schema. Compatibility determined in
160     // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type`
161     // field in the SchemaTypeConfigProto.
162     std::unordered_set<std::string> schema_types_incompatible_by_name;
163 
164     // Schema types whose SchemaTypeConfigProto has changed in an incompatible
165     // manner in the new schema. Compatibility determined in
166     // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
167     // assigned to this SchemaTypeConfigProto in the *old* schema.
168     std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;
169 
170     // Schema types that were added in the new schema. Represented by the
171     // `schema_type` field in the SchemaTypeConfigProto.
172     std::unordered_set<std::string> schema_types_new_by_name;
173 
174     // Schema types that were changed in a way that was backwards compatible and
175     // didn't invalidate the index. Represented by the `schema_type` field in
176     // the SchemaTypeConfigProto.
177     std::unordered_set<std::string>
178         schema_types_changed_fully_compatible_by_name;
179 
180     // Schema types that were changed in a way that was backwards compatible,
181     // but invalidated the index. Represented by the `schema_type` field in the
182     // SchemaTypeConfigProto.
183     std::unordered_set<std::string> schema_types_index_incompatible_by_name;
184 
185     // Schema types that were changed in a way that was backwards compatible,
186     // but invalidated the joinable cache. Represented by the `schema_type`
187     // field in the SchemaTypeConfigProto.
188     std::unordered_set<std::string> schema_types_join_incompatible_by_name;
189   };
190 
191   struct ExpandedTypePropertyMask {
192     std::string schema_type;
193     std::unordered_set<std::string> paths;
194   };
195 
196   static constexpr std::string_view kSchemaTypeWildcard = "*";
197 
198   // Factory function to create a SchemaStore which does not take ownership
199   // of any input components, and all pointers must refer to valid objects that
200   // outlive the created SchemaStore instance. The base_dir must already exist.
201   // There does not need to be an existing schema already.
202   //
203   // If initialize_stats is present, the fields related to SchemaStore will be
204   // populated.
205   //
206   // Returns:
207   //   A SchemaStore on success
208   //   FAILED_PRECONDITION on any null pointer input
209   //   INTERNAL_ERROR on any IO errors
210   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
211       const Filesystem* filesystem, const std::string& base_dir,
212       const Clock* clock, InitializeStatsProto* initialize_stats = nullptr);
213 
  // Migrates schema files (backup vs. new schema) according to the version
  // state change.
216   //
217   // Returns:
218   //   OK on success or nothing to migrate
219   static libtextclassifier3::Status MigrateSchema(
220       const Filesystem* filesystem, const std::string& base_dir,
221       version_util::StateChange version_state_change, int32_t new_version);
222 
223   // Discards all derived data in the schema store.
224   //
225   // Returns:
226   //   OK on success or nothing to discard
227   //   INTERNAL_ERROR on any I/O errors
228   static libtextclassifier3::Status DiscardDerivedFiles(
229       const Filesystem* filesystem, const std::string& base_dir);
230 
231   SchemaStore(SchemaStore&&) = default;
232   SchemaStore& operator=(SchemaStore&&) = default;
233 
234   SchemaStore(const SchemaStore&) = delete;
235   SchemaStore& operator=(const SchemaStore&) = delete;
236 
237   // Persists and updates checksum of subcomponents.
238   ~SchemaStore();
239 
240   // Retrieve the current schema if it exists.
241   //
242   // Returns:
243   //   SchemaProto* if exists
244   //   INTERNAL_ERROR on any IO errors
245   //   NOT_FOUND_ERROR if a schema hasn't been set before
246   libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const;
247 
248   // Update our current schema if it's compatible. Does not accept incompatible
249   // schema. Compatibility rules defined by
250   // SchemaUtil::ComputeCompatibilityDelta.
251   //
252   // If ignore_errors_and_delete_documents is set to true, then incompatible
253   // schema are allowed and we'll force set the schema, meaning
254   // SetSchemaResult.success will always be true.
255   //
256   // Returns:
257   //   SetSchemaResult that encapsulates the differences between the old and new
258   //   schema, as well as if the new schema can be set.
259   //   INTERNAL_ERROR on any IO errors
260   libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
261       const SchemaProto& new_schema,
262       bool ignore_errors_and_delete_documents,
263       bool allow_circular_schema_definitions);
264   libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
265       SchemaProto&& new_schema,
266       bool ignore_errors_and_delete_documents,
267       bool allow_circular_schema_definitions);
268 
269   // Get the SchemaTypeConfigProto of schema_type name.
270   //
271   // Returns:
272   //   SchemaTypeConfigProto on success
273   //   FAILED_PRECONDITION if schema hasn't been set yet
274   //   NOT_FOUND if schema type name doesn't exist
275   //   INTERNAL on any I/O errors
276   libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
277   GetSchemaTypeConfig(std::string_view schema_type) const;
278 
279   // Returns the SchemaTypeId of the passed in schema type
280   //
281   // Returns:
282   //   SchemaTypeId on success
283   //   FAILED_PRECONDITION if schema hasn't been set yet
284   //   NOT_FOUND_ERROR if we don't know about the schema type
285   //   INTERNAL_ERROR on IO error
286   libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId(
287       std::string_view schema_type) const;
288 
289   // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also
290   // include child types.
291   //
292   // Returns:
293   //   A set of SchemaTypeId on success
294   //   FAILED_PRECONDITION if schema hasn't been set yet
295   //   NOT_FOUND_ERROR if we don't know about the schema type
296   //   INTERNAL_ERROR on IO error
297   libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
298   GetSchemaTypeIdsWithChildren(std::string_view schema_type) const;
299 
300   // Returns the SectionMetadata associated with the SectionId that's in the
301   // SchemaTypeId.
302   //
303   // Returns:
304   //   Valid pointer to SectionMetadata on success
305   //   FAILED_PRECONDITION if schema hasn't been set yet
306   //   INVALID_ARGUMENT if schema type id or section id is invalid
307   libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
308       SchemaTypeId schema_type_id, SectionId section_id) const;
309 
310   // Returns true if a property is defined in the said schema, regardless of
311   // whether it is indexed or not.
312   bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,
313                                  const std::string& property) const;
314 
  // Extracts all sections of different types from the given document and
  // groups them by type.
317   // - Each Section vector is sorted by section Id in ascending order. The
318   //   sorted section ids may not be continuous, since not all sections are
319   //   present in the document.
320   // - Sections with empty content won't be returned.
321   // - For example, we may extract:
322   //   string_sections: [2, 7, 10]
323   //   integer_sections: [3, 5, 8]
324   //
325   // Returns:
326   //   A SectionGroup instance on success
327   //   FAILED_PRECONDITION if schema hasn't been set yet
328   //   NOT_FOUND if type config name of document not found
329   libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
330       const DocumentProto& document) const;
331 
332   // Returns the JoinablePropertyMetadata associated with property_path that's
333   // in the SchemaTypeId.
334   //
335   // Returns:
336   //   Valid pointer to JoinablePropertyMetadata on success
337   //   nullptr if property_path doesn't exist (or is not joinable) in the
338   //     joinable metadata list of the schema
339   //   FAILED_PRECONDITION if schema hasn't been set yet
340   //   INVALID_ARGUMENT if schema type id is invalid
341   libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
342   GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
343                               const std::string& property_path) const;
344 
  // Extracts all joinable property contents of different types from the given
  // document and groups them by joinable value type.
347   // - Joinable properties are sorted by joinable property id in ascending
348   //   order. The sorted joinable property ids may not be continuous, since not
349   //   all joinable properties are present in the document.
350   // - Joinable property ids start from 0.
351   // - Joinable properties with empty content won't be returned.
352   //
353   // Returns:
354   //   A JoinablePropertyGroup instance on success
355   //   FAILED_PRECONDITION if schema hasn't been set yet
356   //   NOT_FOUND if the type config name of document not found
357   libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
358       const DocumentProto& document) const;
359 
360   // Syncs all the data changes to disk.
361   //
362   // Returns:
363   //   OK on success
364   //   INTERNAL on I/O errors.
365   libtextclassifier3::Status PersistToDisk();
366 
367   // Computes the combined checksum of the schema store - includes the ground
368   // truth and all derived files.
369   //
370   // Returns:
371   //   Combined checksum on success
372   //   INTERNAL_ERROR on compute error
373   libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
374 
375   // Returns:
376   //   - On success, the section metadata list for the specified schema type
377   //   - NOT_FOUND if the schema type is not present in the schema
378   libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
379   GetSectionMetadata(const std::string& schema_type) const;
380 
381   // Calculates the StorageInfo for the Schema Store.
382   //
383   // If an IO error occurs while trying to calculate the value for a field, then
384   // that field will be set to -1.
385   SchemaStoreStorageInfoProto GetStorageInfo() const;
386 
387   // Get debug information for the schema store.
388   //
389   // Returns:
390   //   SchemaDebugInfoProto on success
391   //   INTERNAL_ERROR on IO errors, crc compute error
392   libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const;
393 
  // Expands the provided type_property_masks into a vector of
  // ExpandedTypePropertyMasks to account for polymorphism. If both a parent
  // type and one of its child types appear in the masks, the parent type's
  // paths will be merged into the child's.
398   //
399   // For example, assume that we have two schema types A and B, and we have
400   // - A is the parent type of B
401   // - Paths of A: {P1, P2}
402   // - Paths of B: {P3}
403   //
404   // Then, we will have the following in the result.
405   // - Expanded paths of A: {P1, P2}
406   // - Expanded paths of B: {P1, P2, P3}
407   std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks(
408       const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
409       const;
410 
411  private:
412   // Factory function to create a SchemaStore and set its schema. The created
413   // instance does not take ownership of any input components and all pointers
414   // must refer to valid objects that outlive the created SchemaStore instance.
  // The base_dir must already exist. No schema may have been set in base_dir
  // prior to this.
417   //
418   // Returns:
419   //   A SchemaStore on success
420   //   FAILED_PRECONDITION on any null pointer input or if there has already
421   //       been a schema set for this path.
422   //   INTERNAL_ERROR on any IO errors
423   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
424       const Filesystem* filesystem, const std::string& base_dir,
425       const Clock* clock, SchemaProto schema);
426 
427   // Use SchemaStore::Create instead.
428   explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
429                        const Clock* clock);
430 
431   // Deletes the overlay schema and ensures that the Header is correctly set.
432   //
433   // RETURNS:
434   //   OK on success
435   //   INTERNAL_ERROR on any IO errors
436   static libtextclassifier3::Status DiscardOverlaySchema(
437       const Filesystem* filesystem, const std::string& base_dir,
438       Header& header);
439 
440   // Verifies that there is no error retrieving a previously set schema. Then
441   // initializes like normal.
442   //
443   // Returns:
444   //   OK on success
445   //   INTERNAL_ERROR on IO error
446   libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
447 
448   // First, blindly writes new_schema to the schema_file. Then initializes like
449   // normal.
450   //
451   // Returns:
452   //   OK on success
453   //   INTERNAL_ERROR on IO error
454   //   FAILED_PRECONDITION if there is already a schema set for the schema_file.
455   libtextclassifier3::Status Initialize(SchemaProto new_schema);
456 
457   // Handles initializing the SchemaStore and regenerating any data if needed.
458   //
459   // Returns:
460   //   OK on success
461   //   INTERNAL_ERROR on IO error
462   libtextclassifier3::Status InitializeInternal(
463       bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats);
464 
465   // Creates sub-components and verifies the integrity of each sub-component.
466   //
467   // Returns:
468   //   OK on success
469   //   INTERNAL_ERROR on IO error
470   libtextclassifier3::Status InitializeDerivedFiles();
471 
472   // Populates any derived data structures off of the schema.
473   //
474   // Returns:
475   //   OK on success
476   //   NOT_FOUND_ERROR if a schema proto has not been set
477   //   INTERNAL_ERROR on any IO errors
478   libtextclassifier3::Status RegenerateDerivedFiles(
479       bool create_overlay_if_necessary);
480 
  // Builds type_config_map_, schema_subtype_id_map_, and schema_type_manager_.
482   //
483   // Returns:
484   //   OK on success
485   //   NOT_FOUND_ERROR if a schema proto has not been set
486   //   INTERNAL_ERROR on any IO errors
487   libtextclassifier3::Status BuildInMemoryCache();
488 
489   // Update and replace the header file. Creates the header file if it doesn't
490   // exist.
491   //
492   // Returns:
493   //   OK on success
494   //   INTERNAL on I/O error
495   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
496 
497   // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying
498   // file, and re-creates a new instance of the schema_type_mapper_. Does not
499   // populate the schema_type_mapper_.
500   //
501   // Returns any IO errors.
502   libtextclassifier3::Status ResetSchemaTypeMapper();
503 
504   // Creates a new schema store with new_schema and then swaps that new schema
505   // store with the existing one. This function guarantees that either: this
506   // instance will be fully updated to the new schema or no changes will take
507   // effect.
508   //
509   // Returns:
510   //   OK on success
511   //   INTERNAL on I/O error.
512   libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema);
513 
CheckSchemaSet()514   libtextclassifier3::Status CheckSchemaSet() const {
515     return has_schema_successfully_set_
516                ? libtextclassifier3::Status::OK
517                : absl_ports::FailedPreconditionError("Schema not set yet.");
518   }
519 
520   // Correctly loads the Header, schema_file_ and (if present) the
521   // overlay_schema_file_.
522   // RETURNS:
523   //   - OK on success
524   //   - INTERNAL if an IO error is encountered when reading the Header or
525   //   schemas.
526   //     Or an invalid schema configuration is present.
527   libtextclassifier3::Status LoadSchema();
528 
529   const Filesystem* filesystem_;
530   std::string base_dir_;
531   const Clock* clock_;
532 
533   // Used internally to indicate whether the class has been successfully
534   // initialized with a valid schema. Will be false if Initialize failed or no
535   // schema has ever been set.
536   bool has_schema_successfully_set_ = false;
537 
538   // Cached schema
539   std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_;
540 
541   // This schema holds the definition of any schema types that are not
542   // compatible with older versions of Icing code.
543   std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_;
544 
545   // Maps schema types to a densely-assigned unique id.
546   std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
547 
548   // Maps schema type ids to the corresponding schema type. This is an inverse
549   // map of schema_type_mapper_.
550   std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_;
551 
552   // A hash map of (type config name -> type config), allows faster lookup of
553   // type config in schema. The O(1) type config access makes schema-related and
554   // section-related operations faster.
555   SchemaUtil::TypeConfigMap type_config_map_;
556 
557   // Maps from each type id to all of its subtype ids.
558   // T2 is a subtype of T1, if and only if one of the following conditions is
559   // met:
560   // - T2 is T1
561   // - T2 extends T1
562   // - There exists a type U, such that T2 is a subtype of U, and U is a subtype
563   //   of T1
564   std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>>
565       schema_subtype_id_map_;
566 
567   // Manager of section (indexable property) and joinable property related
568   // metadata for all Schemas.
569   std::unique_ptr<const SchemaTypeManager> schema_type_manager_;
570 
571   std::unique_ptr<Header> header_;
572 };
573 
574 }  // namespace lib
575 }  // namespace icing
576 
577 #endif  // ICING_SCHEMA_SCHEMA_STORE_H_
578