// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_
16 #define ICING_SCHEMA_SCHEMA_STORE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_set>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/file/file-backed-proto.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/proto/document.pb.h"
30 #include "icing/proto/logging.pb.h"
31 #include "icing/proto/schema.pb.h"
32 #include "icing/proto/storage.pb.h"
33 #include "icing/schema/schema-util.h"
34 #include "icing/schema/section-manager.h"
35 #include "icing/schema/section.h"
36 #include "icing/store/document-filter-data.h"
37 #include "icing/store/key-mapper.h"
38 #include "icing/util/clock.h"
39 #include "icing/util/crc32.h"
40 
41 namespace icing {
42 namespace lib {
43 
44 // Holds the ground truth schema proto. Tracks compatible changes to the schema
45 // and will update any derived data based on the schema proto, such as Sections,
46 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
47 // the most up-to-date data, callers should not save instances themselves and
48 // should always call Get* from the SchemaStore.
49 class SchemaStore {
50  public:
51   struct Header {
52     static constexpr int32_t kMagic = 0x72650d0a;
53 
54     // Holds the magic as a quick sanity check against file corruption.
55     int32_t magic;
56 
57     // Checksum of the SchemaStore's sub-component's checksums.
58     uint32_t checksum;
59   };
60 
61   // Holds information on what may have been affected by the new schema. This is
62   // generally data that other classes may depend on from the SchemaStore,
63   // so that we can know if we should go update those classes as well.
64   struct SetSchemaResult {
65     // Whether we are able to write the schema as determined by SetSchema's
66     // arguments. This boolean reflects SetSchema's logic, and does not reflect
67     // any system level IO errors that may prevent the schema from being written
68     // to file.
69     bool success = false;
70 
71     // Whether the new schema changes invalidate the index.
72     bool index_incompatible = false;
73 
74     // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
75     //   1. Schema types are added in the middle of the SchemaProto
76     //   2. Schema types are removed from the middle of the SchemaProto
77     //   3. Schema types are reordered in the SchemaProto
78     //
79     // SchemaTypeIds are not changed if schema types are added/removed to the
80     // end of the SchemaProto.
81     std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
82 
83     // Schema types that have been removed from the new schema. Represented by
84     // the `schema_type` field in the SchemaTypeConfigProto.
85     std::unordered_set<std::string> schema_types_deleted_by_name;
86 
87     // Schema types that have been removed from the new schema. Represented by
88     // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old*
89     // schema.
90     std::unordered_set<SchemaTypeId> schema_types_deleted_by_id;
91 
92     // Schema types whose SchemaTypeConfigProto has changed in an incompatible
93     // manner in the new schema. Compatibility determined in
94     // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type`
95     // field in the SchemaTypeConfigProto.
96     std::unordered_set<std::string> schema_types_incompatible_by_name;
97 
98     // Schema types whose SchemaTypeConfigProto has changed in an incompatible
99     // manner in the new schema. Compatibility determined in
100     // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
101     // assigned to this SchemaTypeConfigProto in the *old* schema.
102     std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;
103   };
104 
105   // Factory function to create a SchemaStore which does not take ownership
106   // of any input components, and all pointers must refer to valid objects that
107   // outlive the created SchemaStore instance. The base_dir must already exist.
108   // There does not need to be an existing schema already.
109   //
110   // If initialize_stats is present, the fields related to SchemaStore will be
111   // populated.
112   //
113   // Returns:
114   //   A SchemaStore on success
115   //   FAILED_PRECONDITION on any null pointer input
116   //   INTERNAL_ERROR on any IO errors
117   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
118       const Filesystem* filesystem, const std::string& base_dir,
119       const Clock* clock, InitializeStatsProto* initialize_stats = nullptr);
120 
121   // Not copyable
122   SchemaStore(const SchemaStore&) = delete;
123   SchemaStore& operator=(const SchemaStore&) = delete;
124 
125   // Persists and updates checksum of subcomponents.
126   ~SchemaStore();
127 
128   // Retrieve the current schema if it exists. Caller does not get ownership of
129   // the schema proto and modifying the returned pointer does not affect the
130   // underlying schema proto.
131   //
132   // Returns:
133   //   SchemaProto* if exists
134   //   INTERNAL_ERROR on any IO errors
135   //   NOT_FOUND_ERROR if a schema hasn't been set before
136   libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const;
137 
138   // Update our current schema if it's compatible. Does not accept incompatible
139   // schema. Compatibility rules defined by
140   // SchemaUtil::ComputeCompatibilityDelta.
141   //
142   // If ignore_errors_and_delete_documents is set to true, then incompatible
143   // schema are allowed and we'll force set the schema, meaning
144   // SetSchemaResult.success will always be true.
145   //
146   // Returns:
147   //   SetSchemaResult that encapsulates the differences between the old and new
148   //   schema, as well as if the new schema can be set.
149   //   INTERNAL_ERROR on any IO errors
150   libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
151       const SchemaProto& new_schema,
152       bool ignore_errors_and_delete_documents = false);
153   libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
154       SchemaProto&& new_schema,
155       bool ignore_errors_and_delete_documents = false);
156 
157   // Get the SchemaTypeConfigProto of schema_type name.
158   //
159   // Returns:
160   //   SchemaTypeConfigProto on success
161   //   FAILED_PRECONDITION if schema hasn't been set yet
162   //   NOT_FOUND if schema type name doesn't exist
163   //   INTERNAL on any I/O errors
164   libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
165   GetSchemaTypeConfig(std::string_view schema_type) const;
166 
167   // Returns the SchemaTypeId of the passed in schema type
168   //
169   // Returns:
170   //   SchemaTypeId on success
171   //   FAILED_PRECONDITION if schema hasn't been set yet
172   //   NOT_FOUND_ERROR if we don't know about the schema type
173   //   INTERNAL_ERROR on IO error
174   libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId(
175       std::string_view schema_type) const;
176 
177   // Finds content of a section by section path (e.g. property1.property2)
178   //
179   // Returns:
180   //   A string of content on success
181   //   FAILED_PRECONDITION if schema hasn't been set yet
182   //   NOT_FOUND if:
183   //     1. Property is optional and not found in the document
184   //     2. section_path is invalid
185   //     3. Content is empty
186   libtextclassifier3::StatusOr<std::vector<std::string_view>>
187   GetStringSectionContent(const DocumentProto& document,
188                           std::string_view section_path) const;
189 
190   // Finds content of a section by id
191   //
192   // Returns:
193   //   A string of content on success
194   //   FAILED_PRECONDITION if schema hasn't been set yet
195   //   INVALID_ARGUMENT if section id is invalid
196   //   NOT_FOUND if type config name of document not found
197   libtextclassifier3::StatusOr<std::vector<std::string_view>>
198   GetStringSectionContent(const DocumentProto& document,
199                           SectionId section_id) const;
200 
201   // Returns the SectionMetadata associated with the SectionId that's in the
202   // SchemaTypeId.
203   //
204   // Returns:
205   //   pointer to SectionMetadata on success
206   //   FAILED_PRECONDITION if schema hasn't been set yet
207   //   INVALID_ARGUMENT if schema type id or section is invalid
208   libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
209       SchemaTypeId schema_type_id, SectionId section_id) const;
210 
211   // Extracts all sections from the given document, sections are sorted by
212   // section id in increasing order. Section ids start from 0. Sections with
213   // empty content won't be returned.
214   //
215   // Returns:
216   //   A list of sections on success
217   //   FAILED_PRECONDITION if schema hasn't been set yet
218   //   NOT_FOUND if type config name of document not found
219   libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections(
220       const DocumentProto& document) const;
221 
222   // Syncs all the data changes to disk.
223   //
224   // Returns:
225   //   OK on success
226   //   INTERNAL on I/O errors.
227   libtextclassifier3::Status PersistToDisk();
228 
229   // Computes the combined checksum of the schema store - includes the ground
230   // truth and all derived files.
231   //
232   // Returns:
233   //   Combined checksum on success
234   //   INTERNAL_ERROR on compute error
235   libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
236 
237   // Calculates the StorageInfo for the Schema Store.
238   //
239   // If an IO error occurs while trying to calculate the value for a field, then
240   // that field will be set to -1.
241   SchemaStoreStorageInfoProto GetStorageInfo() const;
242 
243  private:
244   // Use SchemaStore::Create instead.
245   explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
246                        const Clock* clock);
247 
248   // Handles initializing the SchemaStore and regenerating any data if needed.
249   //
250   // Returns:
251   //   OK on success
252   //   INTERNAL_ERROR on IO error
253   libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
254 
255   // Creates sub-components and verifies the integrity of each sub-component.
256   //
257   // Returns:
258   //   OK on success
259   //   INTERNAL_ERROR on IO error
260   libtextclassifier3::Status InitializeDerivedFiles();
261 
262   // Populates any derived data structures off of the schema.
263   //
264   // Returns:
265   //   OK on success
266   //   NOT_FOUND_ERROR if a schema proto has not been set
267   //   INTERNAL_ERROR on any IO errors
268   libtextclassifier3::Status RegenerateDerivedFiles();
269 
270   // Checks if the header exists already. This does not create the header file
271   // if it doesn't exist.
272   bool HeaderExists();
273 
274   // Update and replace the header file. Creates the header file if it doesn't
275   // exist.
276   //
277   // Returns:
278   //   OK on success
279   //   INTERNAL on I/O error
280   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
281 
282   // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying
283   // file, and re-creates a new instance of the schema_type_mapper_. Does not
284   // populate the schema_type_mapper_.
285   //
286   // Returns any IO errors.
287   libtextclassifier3::Status ResetSchemaTypeMapper();
288 
CheckSchemaSet()289   libtextclassifier3::Status CheckSchemaSet() const {
290     return has_schema_successfully_set_
291                ? libtextclassifier3::Status::OK
292                : absl_ports::FailedPreconditionError("Schema not set yet.");
293   }
294 
295   const Filesystem& filesystem_;
296   const std::string base_dir_;
297   const Clock& clock_;
298 
299   // Used internally to indicate whether the class has been successfully
300   // initialized with a valid schema. Will be false if Initialize failed or no
301   // schema has ever been set.
302   bool has_schema_successfully_set_ = false;
303 
304   // Cached schema
305   FileBackedProto<SchemaProto> schema_file_;
306 
307   // A hash map of (type config name -> type config), allows faster lookup of
308   // type config in schema. The O(1) type config access makes schema-related and
309   // section-related operations faster.
310   SchemaUtil::TypeConfigMap type_config_map_;
311 
312   // Maps schema types to a densely-assigned unique id.
313   std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
314 
315   // Manager of indexed section related metadata.
316   std::unique_ptr<const SectionManager> section_manager_;
317 };
318 
319 }  // namespace lib
320 }  // namespace icing
321 
322 #endif  // ICING_SCHEMA_SCHEMA_STORE_H_
323