• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/schema/schema-store.h"
16 
17 #include <cinttypes>
18 #include <cstddef>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <optional>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29 
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/str_cat.h"
34 #include "icing/feature-flags.h"
35 #include "icing/file/destructible-directory.h"
36 #include "icing/file/file-backed-proto.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/file/version-util.h"
39 #include "icing/legacy/core/icing-string-util.h"
40 #include "icing/proto/debug.pb.h"
41 #include "icing/proto/document.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/schema.pb.h"
44 #include "icing/proto/search.pb.h"
45 #include "icing/proto/storage.pb.h"
46 #include "icing/schema/backup-schema-producer.h"
47 #include "icing/schema/joinable-property.h"
48 #include "icing/schema/schema-property-iterator.h"
49 #include "icing/schema/schema-type-manager.h"
50 #include "icing/schema/schema-util.h"
51 #include "icing/schema/scorable_property_manager.h"
52 #include "icing/schema/section.h"
53 #include "icing/store/document-filter-data.h"
54 #include "icing/store/dynamic-trie-key-mapper.h"
55 #include "icing/util/clock.h"
56 #include "icing/util/crc32.h"
57 #include "icing/util/logging.h"
58 #include "icing/util/status-macros.h"
59 
60 namespace icing {
61 namespace lib {
62 
63 namespace {
64 
// Names of the files that SchemaStore keeps inside its base directory. Each
// name is joined onto the base directory by the matching Make*Filename()
// helper below.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// Delimiter that separates the database prefix from the actual type name in a
// schema type name.
// This should be kept consistent with the delimiter used in AppSearch.
// See:
// https://cs.android.com/androidx/platform/frameworks/support/+/androidx-main:appsearch/appsearch-local-storage/src/main/java/androidx/appsearch/localstorage/util/PrefixUtil.java;l=42;drc=ffaf979c6f0cbd26caafd7a9d07a6bad12fe3a2a

constexpr char kAppSearchDatabaseDelimiter = '/';

// Maximum size passed to DynamicTrieKeyMapper::Create() for the schema type
// mapper.
// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Giving
// each array 128KiB for storage means the entire DynamicTrieKeyMapper requires
// 384KiB.
constexpr int32_t kSchemaTypeMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
80 
MakeHeaderFilename(const std::string & base_dir)81 std::string MakeHeaderFilename(const std::string& base_dir) {
82   return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
83 }
84 
MakeSchemaFilename(const std::string & base_dir)85 std::string MakeSchemaFilename(const std::string& base_dir) {
86   return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
87 }
88 
MakeOverlaySchemaFilename(const std::string & base_dir)89 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
90   return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
91 }
92 
MakeSchemaTypeMapperFilename(const std::string & base_dir)93 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
94   return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
95 }
96 
97 // Assuming that SchemaTypeIds are assigned to schema types based on their order
98 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
99 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)100 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
101     const SchemaProto& old_schema, const SchemaProto& new_schema) {
102   std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
103 
104   std::unordered_map<std::string, int> old_types_and_index;
105   for (int i = 0; i < old_schema.types().size(); ++i) {
106     old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
107   }
108 
109   std::unordered_map<std::string, int> new_types_and_index;
110   for (int i = 0; i < new_schema.types().size(); ++i) {
111     new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
112   }
113 
114   for (const auto& old_type_index : old_types_and_index) {
115     const auto& iter = new_types_and_index.find(old_type_index.first);
116     // We only care if the type exists in both the old and new schema. If the
117     // type has been deleted, then it'll be captured in
118     // SetSchemaResult.schema_types_deleted*. If the type has been added in the
119     // new schema then we also don't care because nothing needs to be updated.
120     if (iter != new_types_and_index.end()) {
121       // Since the SchemaTypeId of the schema type is just the index of it in
122       // the SchemaProto, compare the index and save it if it's not the same
123       if (old_type_index.second != iter->second) {
124         old_schema_type_ids_changed.emplace(old_type_index.second);
125       }
126     }
127   }
128 
129   return old_schema_type_ids_changed;
130 }
131 
// Extracts the database prefix from a schema type name.
//
// The schema type is expected to be in the format of
// <database><delimiter><actual_type_name>. Everything before the first
// occurrence of `database_delimeter` is treated as the database.
//
// Returns an empty string if `schema_type` does not contain the delimiter.
std::string GetDatabaseFromSchemaType(const std::string& schema_type,
                                      char database_delimeter) {
  const size_t delimiter_pos = schema_type.find(database_delimeter);
  if (delimiter_pos == std::string::npos) {
    return std::string();
  }
  return schema_type.substr(0, delimiter_pos);
}
148 
149 // For each schema type in the schema proto, parses out the database from the
150 // type name, and sets it as the database field in the input proto in
151 // place. The schema_type name field itself is not modified.
152 //
153 // If the schema type name does not contain an AppSearch database, then
154 // SchemaTypeConfigProto is not modified.
155 //
156 // Returns:
157 //   - True if any SchemaTypeConfigProto in the schema proto is rewritten.
158 //   - False otherwise.
ParseAndPopulateAppSearchDatabaseField(SchemaProto & schema_proto)159 bool ParseAndPopulateAppSearchDatabaseField(SchemaProto& schema_proto) {
160   bool populated_database_field = false;
161   for (auto& type : *schema_proto.mutable_types()) {
162     std::string database = GetDatabaseFromSchemaType(
163         type.schema_type(), kAppSearchDatabaseDelimiter);
164     if (type.database() != database) {
165       type.set_database(std::move(database));
166       populated_database_field = true;
167     }
168   }
169   return populated_database_field;
170 }
171 
172 // Compares the schema types list defined in two schemas, ignoring order.
173 //
174 // Requires: old_schema.schema_database() == new_schema.schema_database()
175 //
176 // Returns: true if the types in `new_schema` are identical to the types
177 // in `old_schema`, otherwise returns false.
AreSchemaTypesEqual(const SchemaProto & old_schema,const SchemaProto & new_schema)178 bool AreSchemaTypesEqual(const SchemaProto& old_schema,
179                          const SchemaProto& new_schema) {
180   if (old_schema.types().size() != new_schema.types().size()) {
181     return false;
182   }
183 
184   // Create a map of old schema types to and check that the new schema's types
185   // are identical.
186   std::unordered_map<std::string_view, const SchemaTypeConfigProto&>
187       old_schema_types;
188   old_schema_types.reserve(old_schema.types().size());
189   for (const SchemaTypeConfigProto& old_type : old_schema.types()) {
190     old_schema_types.emplace(old_type.schema_type(), old_type);
191   }
192   for (const SchemaTypeConfigProto& new_type : new_schema.types()) {
193     auto old_type_itr = old_schema_types.find(new_type.schema_type());
194     if (old_type_itr == old_schema_types.end()) {
195       return false;
196     }
197     if (old_type_itr->second.SerializeAsString() !=
198         new_type.SerializeAsString()) {
199       return false;
200     }
201   }
202 
203   return true;
204 }
205 
206 }  // namespace
207 
208 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,std::string path)209 SchemaStore::Header::Read(const Filesystem* filesystem, std::string path) {
210   if (!filesystem->FileExists(path.c_str())) {
211     return absl_ports::NotFoundError(
212         absl_ports::StrCat("Header file is empty: ", path));
213   }
214 
215   SerializedHeader serialized_header;
216   ScopedFd sfd(filesystem->OpenForWrite(path.c_str()));
217   if (!sfd.is_valid()) {
218     return absl_ports::InternalError("Unable to open or create header file.");
219   }
220 
221   // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
222   int64_t file_size = filesystem->GetFileSize(sfd.get());
223   if (file_size == sizeof(LegacyHeader)) {
224     LegacyHeader legacy_header;
225     if (!filesystem->Read(sfd.get(), &legacy_header, sizeof(legacy_header))) {
226       return absl_ports::InternalError(
227           absl_ports::StrCat("Couldn't read: ", path));
228     }
229     if (legacy_header.magic != Header::kMagic) {
230       return absl_ports::InternalError(
231           absl_ports::StrCat("Invalid header kMagic for file: ", path));
232     }
233     serialized_header.checksum = legacy_header.checksum;
234   } else if (file_size == sizeof(SerializedHeader)) {
235     if (!filesystem->Read(sfd.get(), &serialized_header,
236                           sizeof(serialized_header))) {
237       return absl_ports::InternalError(
238           absl_ports::StrCat("Couldn't read: ", path));
239     }
240     if (serialized_header.magic != Header::kMagic) {
241       return absl_ports::InternalError(
242           absl_ports::StrCat("Invalid header kMagic for file: ", path));
243     }
244   } else if (file_size != 0) {
245     // file is neither the legacy header, the new header nor empty. Something is
246     // wrong here.
247     int legacy_header_size = sizeof(LegacyHeader);
248     int header_size = sizeof(SerializedHeader);
249     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
250         "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
251         legacy_header_size, header_size));
252   }
253   return Header(serialized_header, std::move(path), std::move(sfd), filesystem);
254 }
255 
Write()256 libtextclassifier3::Status SchemaStore::Header::Write() {
257   if (!dirty_) {
258     return libtextclassifier3::Status::OK;
259   }
260   if (!header_fd_.is_valid() && !filesystem_->FileExists(path_.c_str())) {
261     header_fd_.reset(filesystem_->OpenForWrite(path_.c_str()));
262   }
263   // This should overwrite the header.
264   if (!header_fd_.is_valid() ||
265       !filesystem_->PWrite(header_fd_.get(), /*offset=*/0, &serialized_header_,
266                            sizeof(serialized_header_))) {
267     return absl_ports::InternalError(
268         absl_ports::StrCat("Failed to write SchemaStore header"));
269   }
270   dirty_ = false;
271   return libtextclassifier3::Status::OK;
272 }
273 
PersistToDisk()274 libtextclassifier3::Status SchemaStore::Header::PersistToDisk() {
275   if (dirty_) {
276     ICING_RETURN_IF_ERROR(Write());
277   }
278   // This should overwrite the header.
279   if (!header_fd_.is_valid() || !filesystem_->DataSync(header_fd_.get())) {
280     return absl_ports::InternalError(
281         absl_ports::StrCat("Failed to sync SchemaStore header."));
282   }
283   return libtextclassifier3::Status::OK;
284 }
285 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,InitializeStatsProto * initialize_stats)286 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
287     const Filesystem* filesystem, const std::string& base_dir,
288     const Clock* clock, const FeatureFlags* feature_flags,
289     InitializeStatsProto* initialize_stats) {
290   ICING_RETURN_ERROR_IF_NULL(filesystem);
291   ICING_RETURN_ERROR_IF_NULL(clock);
292   ICING_RETURN_ERROR_IF_NULL(feature_flags);
293 
294   if (!filesystem->DirectoryExists(base_dir.c_str())) {
295     return absl_ports::FailedPreconditionError(
296         "Schema store base directory does not exist!");
297   }
298   std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
299       new SchemaStore(filesystem, base_dir, clock, feature_flags));
300   ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
301   return schema_store;
302 }
303 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,SchemaProto schema)304 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
305     const Filesystem* filesystem, const std::string& base_dir,
306     const Clock* clock, const FeatureFlags* feature_flags, SchemaProto schema) {
307   ICING_RETURN_ERROR_IF_NULL(filesystem);
308   ICING_RETURN_ERROR_IF_NULL(clock);
309   ICING_RETURN_ERROR_IF_NULL(feature_flags);
310 
311   if (!filesystem->DirectoryExists(base_dir.c_str())) {
312     return absl_ports::FailedPreconditionError(
313         "Schema store base directory does not exist!");
314   }
315   std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
316       new SchemaStore(filesystem, base_dir, clock, feature_flags));
317   ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
318   return schema_store;
319 }
320 
321 /* static */ libtextclassifier3::Status
PopulateSchemaDatabaseFieldForSchemaFile(const Filesystem * filesystem,const std::string & schema_filename)322 SchemaStore::PopulateSchemaDatabaseFieldForSchemaFile(
323     const Filesystem* filesystem, const std::string& schema_filename) {
324   FileBackedProto<SchemaProto> schema_file(*filesystem, schema_filename);
325   auto schema_proto_or = schema_file.Read();
326   if (absl_ports::IsNotFound(schema_proto_or.status())) {
327     // Don't have an existing schema proto, that's fine
328     return libtextclassifier3::Status::OK;
329   } else if (!schema_proto_or.ok()) {
330     // Real error when trying to read the existing schema
331     return schema_proto_or.status();
332   }
333 
334   SchemaProto schema_proto_copy = *schema_proto_or.ValueOrDie();
335   bool schema_changed =
336       ParseAndPopulateAppSearchDatabaseField(schema_proto_copy);
337   if (!schema_changed) {
338     // Nothing to do if the schema is not changed.
339     return libtextclassifier3::Status::OK;
340   }
341 
342   // Create a temporary schema file and schema proto copy to update the
343   // schema.
344   std::string temp_schema_filename = schema_filename + ".tmp";
345   if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
346     return absl_ports::InternalError(
347         "Unable to delete temp schema file to prepare for schema database "
348         "migration.");
349   }
350 
351   {
352     FileBackedProto<SchemaProto> temp_schema_file(*filesystem,
353                                                   temp_schema_filename);
354     ICING_RETURN_IF_ERROR(temp_schema_file.Write(
355         std::make_unique<SchemaProto>(schema_proto_copy)));
356   }
357 
358   // Swap the temp schema file with the original schema file.
359   if (!filesystem->SwapFiles(temp_schema_filename.c_str(),
360                              schema_filename.c_str())) {
361     return absl_ports::InternalError(
362         "Unable to apply migrated schema with database due to failed swap!");
363   }
364   // Clean up the temp schema file.
365   if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
366     return absl_ports::InternalError(
367         "Unable to delete temp schema file after schema database migration.");
368   }
369 
370   return libtextclassifier3::Status::OK;
371 }
372 
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)373 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
374     const Filesystem* filesystem, const std::string& base_dir, Header& header) {
375   std::string header_filename = MakeHeaderFilename(base_dir);
376   if (header.overlay_created()) {
377     header.SetOverlayInfo(
378         /*overlay_created=*/false,
379         /*min_overlay_version_compatibility=*/std::numeric_limits<
380             int32_t>::max());
381     ICING_RETURN_IF_ERROR(header.Write());
382   }
383   std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
384   if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
385     return absl_ports::InternalError(
386         "Unable to delete stale schema overlay file.");
387   }
388   return libtextclassifier3::Status::OK;
389 }
390 
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version,bool perform_schema_database_migration)391 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
392     const Filesystem* filesystem, const std::string& base_dir,
393     version_util::StateChange version_state_change, int32_t new_version,
394     bool perform_schema_database_migration) {
395   if (!filesystem->DirectoryExists(base_dir.c_str())) {
396     // Situations when schema store directory doesn't exist:
397     // - Initializing new Icing instance: don't have to do anything now. The
398     //   directory will be created later.
399     // - Lose schema store: there is nothing we can do now. The logic will be
400     //   handled later by initializing.
401     //
402     // Therefore, just simply return OK here.
403     return libtextclassifier3::Status::OK;
404   }
405 
406   ICING_RETURN_IF_ERROR(HandleOverlaySchemaForVersionChange(
407       filesystem, base_dir, version_state_change, new_version));
408 
409   // Perform schema database migration if needed.
410   // - This populates the the database field in the schema proto and writes it
411   //   to the schema file.
412   // - If the overlay schema file exists at this point, does the same for the
413   //   overlay schema.
414   if (perform_schema_database_migration) {
415     std::string base_schema_filename = MakeSchemaFilename(base_dir);
416     ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
417         filesystem, base_schema_filename));
418 
419     std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
420     if (filesystem->FileExists(overlay_schema_filename.c_str())) {
421       ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
422           filesystem, overlay_schema_filename));
423     }
424   }
425 
426   return libtextclassifier3::Status::OK;
427 }
428 
429 /* static */ libtextclassifier3::Status
HandleOverlaySchemaForVersionChange(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)430 SchemaStore::HandleOverlaySchemaForVersionChange(
431     const Filesystem* filesystem, const std::string& base_dir,
432     version_util::StateChange version_state_change, int32_t new_version) {
433   std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
434   if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
435     // The overlay doesn't exist. So there should be nothing particularly
436     // interesting to worry about.
437     return libtextclassifier3::Status::OK;
438   }
439 
440   std::string header_filename = MakeHeaderFilename(base_dir);
441   libtextclassifier3::StatusOr<Header> header_or;
442   switch (version_state_change) {
443     // No necessary actions for normal upgrades or no version change. The data
444     // that was produced by the previous version is fully compatible with this
445     // version and there's no stale data for us to clean up.
446     // The same is true for a normal rollforward. A normal rollforward implies
447     // that the previous version was one that understood the concept of the
448     // overlay schema and would have already discarded it if it was unusable.
449     case version_util::StateChange::kVersionZeroUpgrade:
450       // fallthrough
451     case version_util::StateChange::kUpgrade:
452       // fallthrough
453     case version_util::StateChange::kRollForward:
454       // fallthrough
455     case version_util::StateChange::kCompatible:
456       return libtextclassifier3::Status::OK;
457     case version_util::StateChange::kVersionZeroRollForward: {
458       // We've rolled forward. The schema overlay file, if it exists, is
459       // possibly stale. We must throw it out.
460       header_or = Header::Read(filesystem, header_filename);
461       ICING_RETURN_IF_ERROR(header_or.status());
462       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
463                                                header_or.ValueOrDie());
464     }
465     case version_util::StateChange::kRollBack: {
466       header_or = Header::Read(filesystem, header_filename);
467       ICING_RETURN_IF_ERROR(header_or.status());
468       if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
469           new_version) {
470         // We've been rolled back, but the overlay schema claims that it
471         // supports this version. So we can safely return.
472         return libtextclassifier3::Status::OK;
473       }
474       // We've been rolled back to a version that the overlay schema doesn't
475       // support. We must throw it out.
476       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
477                                                header_or.ValueOrDie());
478     }
479     case version_util::StateChange::kUndetermined:
480       // It's not clear what version we're on, but the base schema should always
481       // be safe to use. Throw out the overlay.
482       header_or = Header::Read(filesystem, header_filename);
483       ICING_RETURN_IF_ERROR(header_or.status());
484       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
485                                                header_or.ValueOrDie());
486   }
487   return libtextclassifier3::Status::OK;
488 }
489 
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)490 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
491     const Filesystem* filesystem, const std::string& base_dir) {
492   // Schema type mapper
493   return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
494       *filesystem, MakeSchemaTypeMapperFilename(base_dir));
495 }
496 
// Stores the given dependencies and points schema_file_ at the base schema
// file inside the base directory.
//
// The schema_file_ initializer reads the member base_dir_ rather than the
// `base_dir` parameter, because the parameter is moved into the member.
// NOTE(review): this relies on base_dir_ being declared before schema_file_ in
// the class definition - confirm in schema-store.h.
SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
                         const Clock* clock, const FeatureFlags* feature_flags)
    : filesystem_(filesystem),
      base_dir_(std::move(base_dir)),
      clock_(clock),
      feature_flags_(feature_flags),
      schema_file_(filesystem, MakeSchemaFilename(base_dir_)) {}
504 
~SchemaStore()505 SchemaStore::~SchemaStore() {
506   if (has_schema_successfully_set_ && schema_type_mapper_ != nullptr &&
507       schema_type_manager_ != nullptr) {
508     if (!PersistToDisk().ok()) {
509       ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
510     }
511   }
512 }
513 
Initialize(SchemaProto new_schema)514 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
515   ICING_RETURN_IF_ERROR(LoadSchema());
516   if (!absl_ports::IsNotFound(GetSchema().status())) {
517     return absl_ports::FailedPreconditionError(
518         "Incorrectly tried to initialize schema store with a new schema, when "
519         "one is already set!");
520   }
521   // ResetSchemaFileIfNeeded() will be called in InitializeInternal below.
522   ICING_RETURN_IF_ERROR(
523       schema_file_.Write(std::make_unique<SchemaProto>(std::move(new_schema))));
524   return InitializeInternal(/*create_overlay_if_necessary=*/true,
525                             /*initialize_stats=*/nullptr);
526 }
527 
Initialize(InitializeStatsProto * initialize_stats)528 libtextclassifier3::Status SchemaStore::Initialize(
529     InitializeStatsProto* initialize_stats) {
530   ICING_RETURN_IF_ERROR(LoadSchema());
531   auto schema_proto_or = GetSchema();
532   if (absl_ports::IsNotFound(schema_proto_or.status())) {
533     // Don't have an existing schema proto, that's fine
534     return libtextclassifier3::Status::OK;
535   } else if (!schema_proto_or.ok()) {
536     // Real error when trying to read the existing schema
537     return schema_proto_or.status();
538   }
539   return InitializeInternal(/*create_overlay_if_necessary=*/false,
540                             initialize_stats);
541 }
542 
// Loads the header and checks that the header, the base schema file and the
// overlay schema file are in a consistent state. When an overlay schema is
// expected and present, overlay_schema_file_ is set up to point at it.
//
// ResetSchemaFileIfNeeded() is invoked on every return path that follows the
// read of the base schema file.
//
// Returns:
//   - OK if the on-disk state matches one of the three valid cases below
//   - Any error other than NOT_FOUND from reading the header or the base
//     schema
//   - INTERNAL_ERROR if the header, base schema and overlay schema are in an
//     inconsistent state
libtextclassifier3::Status SchemaStore::LoadSchema() {
  libtextclassifier3::StatusOr<Header> header_or =
      Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
  bool header_exists = false;
  if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
    // A real error while reading the header.
    return header_or.status();
  } else if (!header_or.ok()) {
    // NOT_FOUND: there is no header file yet, so start from a fresh header.
    header_ =
        std::make_unique<Header>(filesystem_, MakeHeaderFilename(base_dir_));
  } else {
    header_exists = true;
    header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
  }

  std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
  bool overlay_schema_file_exists =
      filesystem_->FileExists(overlay_schema_filename.c_str());

  // A NOT_FOUND base schema is tolerated here and classified below; any other
  // read error is returned to the caller.
  libtextclassifier3::Status base_schema_state = schema_file_.Read().status();
  if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
    ResetSchemaFileIfNeeded();
    return base_schema_state;
  }

  // There are three valid cases:
  // 1. Everything is missing. This is an empty schema store.
  if (!base_schema_state.ok() && !overlay_schema_file_exists &&
      !header_exists) {
    ResetSchemaFileIfNeeded();
    return libtextclassifier3::Status::OK;
  }

  // 2. There never was a overlay schema. The header exists, the base schema
  //    exists and the header says the overlay schema shouldn't exist
  if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
      !header_->overlay_created()) {
    // Nothing else to do. Just return safely.
    ResetSchemaFileIfNeeded();
    return libtextclassifier3::Status::OK;
  }

  // 3. There is an overlay schema and a base schema and a header. The header
  // says that the overlay schema should exist.
  if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
      header_->overlay_created()) {
    overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
        *filesystem_, MakeOverlaySchemaFilename(base_dir_));
    ResetSchemaFileIfNeeded();
    return libtextclassifier3::Status::OK;
  }

  // Something has gone wrong. We've lost part of the schema ground truth.
  // Return an error.
  // The values needed for the error message are captured before
  // ResetSchemaFileIfNeeded() is called.
  bool overlay_created = header_->overlay_created();
  bool base_schema_exists = base_schema_state.ok();
  ResetSchemaFileIfNeeded();
  return absl_ports::InternalError(IcingStringUtil::StringPrintf(
      "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
      "base schema exists: %d, overlay_schema_exists: %d",
      header_exists, overlay_created, base_schema_exists,
      overlay_schema_file_exists));
}
605 
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)606 libtextclassifier3::Status SchemaStore::InitializeInternal(
607     bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
608   if (!InitializeDerivedFiles().ok()) {
609     ICING_VLOG(3)
610         << "Couldn't find derived files or failed to initialize them, "
611            "regenerating derived files for SchemaStore.";
612     std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
613     if (initialize_stats != nullptr) {
614       initialize_stats->set_schema_store_recovery_cause(
615           InitializeStatsProto::IO_ERROR);
616     }
617     ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
618     if (initialize_stats != nullptr) {
619       initialize_stats->set_schema_store_recovery_latency_ms(
620           regenerate_timer->GetElapsedMilliseconds());
621     }
622   }
623 
624   if (initialize_stats != nullptr) {
625     initialize_stats->set_num_schema_types(type_config_map_.size());
626   }
627   has_schema_successfully_set_ = true;
628 
629   return libtextclassifier3::Status::OK;
630 }
631 
InitializeDerivedFiles()632 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
633   ICING_ASSIGN_OR_RETURN(
634       schema_type_mapper_,
635       DynamicTrieKeyMapper<SchemaTypeId>::Create(
636           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
637           kSchemaTypeMapperMaxSize));
638 
639   Crc32 expected_checksum(header_->checksum());
640   ICING_ASSIGN_OR_RETURN(Crc32 checksum, GetChecksum());
641   if (checksum != expected_checksum) {
642     return absl_ports::InternalError(
643         "Combined checksum of SchemaStore was inconsistent");
644   }
645 
646   ICING_RETURN_IF_ERROR(BuildInMemoryCache());
647   return libtextclassifier3::Status::OK;
648 }
649 
RegenerateDerivedFiles(bool create_overlay_if_necessary)650 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
651     bool create_overlay_if_necessary) {
652   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
653 
654   ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
655 
656   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
657     // Assign a SchemaTypeId to the type
658     ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
659         type_config.schema_type(), schema_type_mapper_->num_keys()));
660   }
661   ICING_RETURN_IF_ERROR(BuildInMemoryCache());
662 
663   if (create_overlay_if_necessary) {
664     BackupSchemaProducer producer(feature_flags_);
665     ICING_ASSIGN_OR_RETURN(
666         BackupSchemaProducer::BackupSchemaResult backup_result,
667         producer.Produce(*schema_proto,
668                          schema_type_manager_->section_manager()));
669 
670     if (backup_result.backup_schema_produced) {
671       // The overlay schema should be written to the overlay file location.
672       overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
673           *filesystem_, MakeOverlaySchemaFilename(base_dir_));
674       auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
675       ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
676 
677       // The base schema should be written to the original file
678       auto base_schema_ptr =
679           std::make_unique<SchemaProto>(std::move(backup_result.backup_schema));
680       ICING_RETURN_IF_ERROR(schema_file_.Write(std::move(base_schema_ptr)));
681 
682       // LINT.IfChange(min_overlay_version_compatibility)
683       // Although the current version is 5, the schema is compatible with
684       // version 1, so min_overlay_version_compatibility should be 1.
685       int32_t min_overlay_version_compatibility = version_util::kVersionOne;
686       // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion)
687       header_->SetOverlayInfo(
688           /*overlay_created=*/true, min_overlay_version_compatibility);
689       // Rebuild in memory data - references to the old schema will be invalid
690       // now.
691       ICING_RETURN_IF_ERROR(BuildInMemoryCache());
692     }
693   }
694 
695   // Write the header
696   ICING_RETURN_IF_ERROR(UpdateChecksum());
697   ResetSchemaFileIfNeeded();
698   return libtextclassifier3::Status::OK;
699 }
700 
BuildInMemoryCache()701 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
702   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
703   ICING_ASSIGN_OR_RETURN(
704       SchemaUtil::InheritanceMap inheritance_map,
705       SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
706 
707   reverse_schema_type_mapper_.clear();
708   database_type_map_.clear();
709   type_config_map_.clear();
710   schema_subtype_id_map_.clear();
711   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
712     const std::string& database = type_config.database();
713     const std::string& type_name = type_config.schema_type();
714     ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
715                            schema_type_mapper_->Get(type_name));
716 
717     // Build reverse_schema_type_mapper_
718     reverse_schema_type_mapper_.insert({type_id, type_name});
719 
720     // Build database_type_map_
721     database_type_map_[database].push_back(type_name);
722 
723     // Build type_config_map_
724     type_config_map_.insert({type_name, type_config});
725 
726     // Build schema_subtype_id_map_
727     std::unordered_set<SchemaTypeId>& subtype_id_set =
728         schema_subtype_id_map_[type_id];
729     // Find all child types
730     auto child_types_names = inheritance_map.find(type_name);
731     if (child_types_names != inheritance_map.end()) {
732       subtype_id_set.reserve(child_types_names->second.size() + 1);
733       for (const auto& [child_type_name, is_direct_child] :
734            child_types_names->second) {
735         ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
736                                schema_type_mapper_->Get(child_type_name));
737         subtype_id_set.insert(child_type_id);
738       }
739     }
740     // Every type is a subtype of itself.
741     subtype_id_set.insert(type_id);
742   }
743 
744   // Build schema_type_manager_
745   ICING_ASSIGN_OR_RETURN(
746       schema_type_manager_,
747       SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
748 
749   scorable_property_manager_ = std::make_unique<ScorablePropertyManager>();
750 
751   return libtextclassifier3::Status::OK;
752 }
753 
ResetSchemaTypeMapper()754 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
755   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
756   schema_type_mapper_.reset();
757   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
758   // that can support error logging.
759   libtextclassifier3::Status status =
760       DynamicTrieKeyMapper<SchemaTypeId>::Delete(
761           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
762   if (!status.ok()) {
763     ICING_LOG(ERROR) << status.error_message()
764                      << "Failed to delete old schema_type mapper";
765     return status;
766   }
767   ICING_ASSIGN_OR_RETURN(
768       schema_type_mapper_,
769       DynamicTrieKeyMapper<SchemaTypeId>::Create(
770           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
771           kSchemaTypeMapperMaxSize));
772 
773   return libtextclassifier3::Status::OK;
774 }
775 
GetChecksum() const776 libtextclassifier3::StatusOr<Crc32> SchemaStore::GetChecksum() const {
777   ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_.GetChecksum());
778   // We've gotten the schema_checksum successfully. Sadly, we still need to
779   // differentiate between an existing, but empty schema and a non-existent
780   // schema (both of which will have a checksum of 0). For existing, but empty
781   // schemas, we need to continue with the checksum calculation of the other
782   // components.
783   if (schema_checksum == Crc32() && !has_schema_successfully_set_) {
784     return schema_checksum;
785   }
786 
787   Crc32 total_checksum;
788   total_checksum.Append(std::to_string(schema_checksum.Get()));
789   if (overlay_schema_file_ != nullptr) {
790     ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
791                            overlay_schema_file_->GetChecksum());
792     total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
793   }
794 
795   ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
796                          schema_type_mapper_->GetChecksum());
797   total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
798   return total_checksum;
799 }
800 
UpdateChecksum()801 libtextclassifier3::StatusOr<Crc32> SchemaStore::UpdateChecksum() {
802   ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_.GetChecksum());
803   // We've gotten the schema_checksum successfully. Sadly, we still need to
804   // differentiate between an existing, but empty schema and a non-existent
805   // schema (both of which will have a checksum of 0). For existing, but empty
806   // schemas, we need to continue with the checksum calculation of the other
807   // components.
808   if (schema_checksum == Crc32() && !has_schema_successfully_set_) {
809     return schema_checksum;
810   }
811   Crc32 total_checksum;
812   total_checksum.Append(std::to_string(schema_checksum.Get()));
813 
814   if (overlay_schema_file_ != nullptr) {
815     ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
816                            overlay_schema_file_->GetChecksum());
817     total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
818   }
819 
820   ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
821                          schema_type_mapper_->UpdateChecksum());
822   total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
823 
824   header_->set_checksum(total_checksum.Get());
825   ICING_RETURN_IF_ERROR(header_->Write());
826   return total_checksum;
827 }
828 
GetSchema() const829 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
830     const {
831   if (overlay_schema_file_ != nullptr) {
832     return overlay_schema_file_->Read();
833   }
834 
835   return schema_file_.Read();
836 }
837 
GetSchema(const std::string & database) const838 libtextclassifier3::StatusOr<SchemaProto> SchemaStore::GetSchema(
839     const std::string& database) const {
840   if (!has_schema_successfully_set_) {
841     return absl_ports::NotFoundError("No schema found.");
842   }
843 
844   const auto database_type_map_itr_ = database_type_map_.find(database);
845   if (database_type_map_itr_ == database_type_map_.end()) {
846     return absl_ports::NotFoundError(
847         absl_ports::StrCat("No schema found for database '", database, "'."));
848   }
849 
850   SchemaProto schema_proto;
851   for (const std::string& type_name : database_type_map_itr_->second) {
852     ICING_ASSIGN_OR_RETURN(const SchemaTypeConfigProto* type_config,
853                            GetSchemaTypeConfig(type_name));
854     *schema_proto.add_types() = *type_config;
855   }
856   return schema_proto;
857 }
858 
859 // TODO - b/337913932 - Remove this method once all callers are migrated to
860 // SetSchema(SetSchemaRequestProto&& set_schema_request). This should just be
861 // used in our tests.
862 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(SchemaProto new_schema,bool ignore_errors_and_delete_documents)863 SchemaStore::SetSchema(SchemaProto new_schema,
864                        bool ignore_errors_and_delete_documents) {
865   SetSchemaRequestProto set_schema_request;
866   *set_schema_request.mutable_schema() = std::move(new_schema);
867   set_schema_request.set_ignore_errors_and_delete_documents(
868       ignore_errors_and_delete_documents);
869 
870   return SetSchema(std::move(set_schema_request));
871 }
872 
873 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(SetSchemaRequestProto && set_schema_request)874 SchemaStore::SetSchema(SetSchemaRequestProto&& set_schema_request) {
875   bool ignore_errors_and_delete_documents =
876       set_schema_request.ignore_errors_and_delete_documents();
877 
878   if (feature_flags_->enable_schema_database()) {
879     // Step 1: (Only required if schema database is enabled)
880     // Do some preliminary checks on the new schema before formal validation and
881     // delta computation. This checks that:
882     // - The database field in the new schema's types match the provided
883     //   database.
884     // - The new schema's type names are not already in use from other
885     //   databases.
886     ICING_RETURN_IF_ERROR(ValidateSchemaDatabase(
887         set_schema_request.schema(), set_schema_request.database()));
888 
889     // Step 2: Schema validation and delta computation -- try to get the
890     // existing schema for the database to compare to the new schema.
891     libtextclassifier3::StatusOr<SchemaProto> schema_proto =
892         GetSchema(set_schema_request.database());
893     if (absl_ports::IsNotFound(schema_proto.status())) {
894       // Case 1: No preexisting schema for this database.
895       return SetInitialSchemaForDatabase(
896           std::move(*set_schema_request.mutable_schema()),
897           set_schema_request.database(), ignore_errors_and_delete_documents);
898     }
899 
900     if (!schema_proto.ok()) {
901       // Case 2: Real error
902       return schema_proto.status();
903     }
904 
905     // Case 3: At this point, we're guaranteed that we have an existing schema
906     // for this database.
907     const SchemaProto& old_schema = schema_proto.ValueOrDie();
908     return SetSchemaWithDatabaseOverride(
909         std::move(*set_schema_request.mutable_schema()), old_schema,
910         set_schema_request.database(), ignore_errors_and_delete_documents);
911   }
912 
913   // Get the full schema if schema database is disabled.
914   libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
915   if (absl_ports::IsNotFound(schema_proto.status())) {
916     // Case 1: No preexisting schema
917     return SetInitialSchemaForDatabase(
918         std::move(*set_schema_request.mutable_schema()),
919         set_schema_request.database(), ignore_errors_and_delete_documents);
920   }
921 
922   if (!schema_proto.ok()) {
923     // Case 2: Real error
924     return schema_proto.status();
925   }
926 
927   // Case 3: At this point, we're guaranteed that we have an existing schema
928   const SchemaProto& old_schema = *schema_proto.ValueOrDie();
929   return SetSchemaWithDatabaseOverride(
930       std::move(*set_schema_request.mutable_schema()), old_schema,
931       set_schema_request.database(), ignore_errors_and_delete_documents);
932 }
933 
934 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetInitialSchemaForDatabase(SchemaProto new_schema,const std::string & database,bool ignore_errors_and_delete_documents)935 SchemaStore::SetInitialSchemaForDatabase(
936     SchemaProto new_schema, const std::string& database,
937     bool ignore_errors_and_delete_documents) {
938   SetSchemaResult result;
939 
940   ICING_RETURN_IF_ERROR(SchemaUtil::Validate(new_schema, *feature_flags_));
941 
942   result.success = true;
943   for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
944     result.schema_types_new_by_name.insert(type_config.schema_type());
945   }
946   // Get the full new SchemaProto that is a combination of the existing schema
947   // and new_schema. This is needed as we can only write the full proto to the
948   // schema file.
949   ICING_ASSIGN_OR_RETURN(
950       SchemaProto full_new_schema,
951       GetFullSchemaProtoWithUpdatedDb(std::move(new_schema), database));
952   ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
953   has_schema_successfully_set_ = true;
954 
955   return result;
956 }
957 
958 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchemaWithDatabaseOverride(SchemaProto new_schema,const SchemaProto & old_schema,const std::string & database,bool ignore_errors_and_delete_documents)959 SchemaStore::SetSchemaWithDatabaseOverride(
960     SchemaProto new_schema, const SchemaProto& old_schema,
961     const std::string& database, bool ignore_errors_and_delete_documents) {
962   // Assume we can set the schema unless proven otherwise.
963   SetSchemaResult result;
964   result.success = true;
965 
966   if (feature_flags_->enable_schema_database()) {
967     // Sanity check to make sure that we're comparing schemas from the same
968     // database.
969     // The new code path ensures that old_schema contains types from exactly one
970     // database since it's obtained using GetSchema(database), which is
971     // guaranteed to only return types from the single provided database.
972     libtextclassifier3::Status validate_old_schema_database =
973         ValidateSchemaDatabase(old_schema, database);
974     if (!validate_old_schema_database.ok()) {
975       return absl_ports::InvalidArgumentError(
976           "Schema database mismatch between new and old schemas. This should "
977           "never happen");
978     }
979 
980     // Check if the schema types are the same between the new and old schema,
981     // ignoring order.
982     if (AreSchemaTypesEqual(new_schema, old_schema)) {
983       return result;
984     }
985   } else {
986     // Old equality check that is sensitive to type definition order.
987     if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
988       // Same schema as before. No need to update anything
989       return result;
990     }
991   }
992 
993   // Different schema -- we need to validate the schema and track the
994   // differences to see if we can still write it.
995   //
996   // Validate the new schema and compute the delta between the old and new
997   // schema.
998   ICING_ASSIGN_OR_RETURN(SchemaUtil::DependentMap new_dependent_map,
999                          SchemaUtil::Validate(new_schema, *feature_flags_));
1000   SchemaUtil::SchemaDelta schema_delta = SchemaUtil::ComputeCompatibilityDelta(
1001       old_schema, new_schema, new_dependent_map, *feature_flags_);
1002 
1003   result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
1004   result.schema_types_changed_fully_compatible_by_name =
1005       std::move(schema_delta.schema_types_changed_fully_compatible);
1006   result.schema_types_index_incompatible_by_name =
1007       std::move(schema_delta.schema_types_index_incompatible);
1008   result.schema_types_join_incompatible_by_name =
1009       std::move(schema_delta.schema_types_join_incompatible);
1010   result.schema_types_scorable_property_inconsistent_by_name =
1011       std::move(schema_delta.schema_types_scorable_property_inconsistent);
1012 
1013   for (const std::string& schema_type : schema_delta.schema_types_deleted) {
1014     // We currently don't support deletions, so mark this as not possible.
1015     // This will change once we allow force-set schemas.
1016     result.success = false;
1017 
1018     result.schema_types_deleted_by_name.emplace(schema_type);
1019 
1020     ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1021                            GetSchemaTypeId(schema_type));
1022     result.schema_types_deleted_by_id.emplace(schema_type_id);
1023   }
1024 
1025   for (const std::string& schema_type :
1026        schema_delta.schema_types_incompatible) {
1027     // We currently don't support incompatible schemas, so mark this as
1028     // not possible. This will change once we allow force-set schemas.
1029     result.success = false;
1030 
1031     result.schema_types_incompatible_by_name.emplace(schema_type);
1032 
1033     ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1034                            GetSchemaTypeId(schema_type));
1035     result.schema_types_incompatible_by_id.emplace(schema_type_id);
1036   }
1037 
1038   // Get the full new SchemaProto that is a combination of the existing schema
1039   // and new_schema. This is needed to calculate the updated SchemaTypeIds, and
1040   // for writing the full proto to the schema file.
1041   ICING_ASSIGN_OR_RETURN(
1042       SchemaProto full_new_schema,
1043       GetFullSchemaProtoWithUpdatedDb(std::move(new_schema), database));
1044 
1045   // We still need to update old_schema_type_ids_changed. We need to retrieve
1046   // the entire old schema for this, as type ids are assigned for the entire
1047   // schema, and not on a per-database level.
1048   //
1049   // SchemaTypeIds changing is fine, we can update the DocumentStore.
1050   ICING_ASSIGN_OR_RETURN(const SchemaProto* full_old_schema, GetSchema());
1051   result.old_schema_type_ids_changed =
1052       SchemaTypeIdsChanged(*full_old_schema, full_new_schema);
1053 
1054   // We can force set the schema if the caller has told us to ignore any errors
1055   result.success = result.success || ignore_errors_and_delete_documents;
1056 
1057   // Step 3: Apply the schema change if success. This updates persisted files
1058   // and derived data structures.
1059   if (result.success) {
1060     ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
1061     has_schema_successfully_set_ = true;
1062   }
1063 
1064   // Convert schema types to SchemaTypeIds after the new schema is applied.
1065   if (feature_flags_->enable_scorable_properties()) {
1066     for (const std::string& schema_type :
1067          result.schema_types_scorable_property_inconsistent_by_name) {
1068       libtextclassifier3::StatusOr<SchemaTypeId> schema_type_id_or =
1069           GetSchemaTypeId(schema_type);
1070       if (!schema_type_id_or.ok()) {
1071         if (absl_ports::IsNotFound(schema_type_id_or.status())) {
1072           continue;
1073         }
1074         return schema_type_id_or.status();
1075       }
1076       result.schema_types_scorable_property_inconsistent_by_id.insert(
1077           schema_type_id_or.ValueOrDie());
1078     }
1079   }
1080 
1081   return result;
1082 }
1083 
ApplySchemaChange(SchemaProto new_schema)1084 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
1085     SchemaProto new_schema) {
1086   // We need to ensure that we either 1) successfully set the schema and
1087   // update all derived data structures or 2) fail and leave the schema store
1088   // unchanged.
1089   // So, first, we create an empty temporary directory to build a new schema
1090   // store in.
1091   std::string temp_schema_store_dir_path = base_dir_ + "_temp";
1092   if (!filesystem_->DeleteDirectoryRecursively(
1093           temp_schema_store_dir_path.c_str())) {
1094     ICING_LOG(ERROR) << "Recursively deleting "
1095                      << temp_schema_store_dir_path.c_str();
1096     return absl_ports::InternalError(
1097         "Unable to delete temp directory to prepare to build new schema "
1098         "store.");
1099   }
1100 
1101   DestructibleDirectory temp_schema_store_dir(
1102       filesystem_, std::move(temp_schema_store_dir_path));
1103   if (!temp_schema_store_dir.is_valid()) {
1104     return absl_ports::InternalError(
1105         "Unable to create temp directory to build new schema store.");
1106   }
1107 
1108   // Then we create our new schema store with the new schema.
1109   ICING_ASSIGN_OR_RETURN(
1110       std::unique_ptr<SchemaStore> new_schema_store,
1111       SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
1112                           feature_flags_, std::move(new_schema)));
1113 
1114   // Then we swap the new schema file + new derived files with the old files.
1115   if (!filesystem_->SwapFiles(base_dir_.c_str(),
1116                               temp_schema_store_dir.dir().c_str())) {
1117     return absl_ports::InternalError(
1118         "Unable to apply new schema due to failed swap!");
1119   }
1120 
1121   std::string old_base_dir = std::move(base_dir_);
1122   *this = std::move(*new_schema_store);
1123 
1124   // After the std::move, the filepaths saved in this instance and in the
1125   // schema_file_ instance will still be the one from temp_schema_store_dir
1126   // even though they now point to files that are within old_base_dir.
1127   // Manually set them to the correct paths.
1128   base_dir_ = std::move(old_base_dir);
1129   schema_file_.SetSwappedFilepath(MakeSchemaFilename(base_dir_));
1130   if (overlay_schema_file_ != nullptr) {
1131     overlay_schema_file_->SetSwappedFilepath(
1132         MakeOverlaySchemaFilename(base_dir_));
1133   }
1134 
1135   return libtextclassifier3::Status::OK;
1136 }
1137 
1138 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const1139 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
1140   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1141   const auto& type_config_iter =
1142       type_config_map_.find(std::string(schema_type));
1143   if (type_config_iter == type_config_map_.end()) {
1144     return absl_ports::NotFoundError(
1145         absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
1146   }
1147   return &type_config_iter->second;
1148 }
1149 
GetSchemaTypeId(std::string_view schema_type) const1150 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
1151     std::string_view schema_type) const {
1152   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1153   return schema_type_mapper_->Get(schema_type);
1154 }
1155 
GetSchemaType(SchemaTypeId schema_type_id) const1156 libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType(
1157     SchemaTypeId schema_type_id) const {
1158   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1159   if (const auto it = reverse_schema_type_mapper_.find(schema_type_id);
1160       it == reverse_schema_type_mapper_.end()) {
1161     return absl_ports::InvalidArgumentError("Invalid schema type id");
1162   } else {
1163     return &it->second;
1164   }
1165 }
1166 
1167 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const1168 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
1169   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1170                          GetSchemaTypeId(schema_type));
1171   auto iter = schema_subtype_id_map_.find(schema_type_id);
1172   if (iter == schema_subtype_id_map_.end()) {
1173     // This should never happen, unless there is an inconsistency or IO error.
1174     return absl_ports::InternalError(absl_ports::StrCat(
1175         "Schema type '", schema_type, "' is not found in the subtype map."));
1176   }
1177   return &iter->second;
1178 }
1179 
1180 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const1181 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
1182                                 SectionId section_id) const {
1183   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1184   return schema_type_manager_->section_manager().GetSectionMetadata(
1185       schema_type_id, section_id);
1186 }
1187 
ExtractSections(const DocumentProto & document) const1188 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
1189     const DocumentProto& document) const {
1190   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1191   return schema_type_manager_->section_manager().ExtractSections(document);
1192 }
1193 
1194 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const1195 SchemaStore::GetJoinablePropertyMetadata(
1196     SchemaTypeId schema_type_id, const std::string& property_path) const {
1197   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1198   return schema_type_manager_->joinable_property_manager()
1199       .GetJoinablePropertyMetadata(schema_type_id, property_path);
1200 }
1201 
1202 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id) const1203 SchemaStore::GetJoinablePropertyMetadata(
1204     SchemaTypeId schema_type_id,
1205     JoinablePropertyId joinable_property_id) const {
1206   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1207   return schema_type_manager_->joinable_property_manager()
1208       .GetJoinablePropertyMetadata(schema_type_id, joinable_property_id);
1209 }
1210 
1211 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const1212 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
1213   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1214   return schema_type_manager_->joinable_property_manager()
1215       .ExtractJoinableProperties(document);
1216 }
1217 
1218 libtextclassifier3::StatusOr<std::optional<int>>
GetScorablePropertyIndex(SchemaTypeId schema_type_id,std::string_view property_path) const1219 SchemaStore::GetScorablePropertyIndex(SchemaTypeId schema_type_id,
1220                                       std::string_view property_path) const {
1221   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1222   if (!feature_flags_->enable_scorable_properties()) {
1223     return std::nullopt;
1224   }
1225   return scorable_property_manager_->GetScorablePropertyIndex(
1226       schema_type_id, property_path, type_config_map_,
1227       reverse_schema_type_mapper_);
1228 }
1229 
1230 libtextclassifier3::StatusOr<
1231     const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*>
GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const1232 SchemaStore::GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const {
1233   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1234   if (!feature_flags_->enable_scorable_properties()) {
1235     return nullptr;
1236   }
1237   return scorable_property_manager_->GetOrderedScorablePropertyInfo(
1238       schema_type_id, type_config_map_, reverse_schema_type_mapper_);
1239 }
1240 
PersistToDisk()1241 libtextclassifier3::Status SchemaStore::PersistToDisk() {
1242   if (!has_schema_successfully_set_) {
1243     return libtextclassifier3::Status::OK;
1244   }
1245   ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
1246   ICING_RETURN_IF_ERROR(UpdateChecksum());
1247   ICING_RETURN_IF_ERROR(header_->PersistToDisk());
1248   return libtextclassifier3::Status::OK;
1249 }
1250 
GetStorageInfo() const1251 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
1252   SchemaStoreStorageInfoProto storage_info;
1253   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1254   storage_info.set_schema_store_size(
1255       Filesystem::SanitizeFileSize(directory_size));
1256   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
1257   storage_info.set_num_schema_types(schema->types().size());
1258   int total_sections = 0;
1259   int num_types_sections_exhausted = 0;
1260   for (const SchemaTypeConfigProto& type : schema->types()) {
1261     auto sections_list_or =
1262         schema_type_manager_->section_manager().GetMetadataList(
1263             type.schema_type());
1264     if (!sections_list_or.ok()) {
1265       continue;
1266     }
1267     total_sections += sections_list_or.ValueOrDie()->size();
1268     if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
1269       ++num_types_sections_exhausted;
1270     }
1271   }
1272 
1273   storage_info.set_num_total_sections(total_sections);
1274   storage_info.set_num_schema_types_sections_exhausted(
1275       num_types_sections_exhausted);
1276   return storage_info;
1277 }
1278 
1279 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const1280 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
1281   return schema_type_manager_->section_manager().GetMetadataList(schema_type);
1282 }
1283 
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const1284 bool SchemaStore::IsPropertyDefinedInSchema(
1285     SchemaTypeId schema_type_id, const std::string& property_path) const {
1286   auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
1287   if (schema_name_itr == reverse_schema_type_mapper_.end()) {
1288     return false;
1289   }
1290   const std::string* current_type_name = &schema_name_itr->second;
1291 
1292   std::vector<std::string_view> property_path_parts =
1293       property_util::SplitPropertyPathExpr(property_path);
1294   for (int i = 0; i < property_path_parts.size(); ++i) {
1295     auto type_config_itr = type_config_map_.find(*current_type_name);
1296     if (type_config_itr == type_config_map_.end()) {
1297       return false;
1298     }
1299     std::string_view property_name = property_path_parts.at(i);
1300     const PropertyConfigProto* selected_property = nullptr;
1301     for (const PropertyConfigProto& property :
1302          type_config_itr->second.properties()) {
1303       if (property.property_name() == property_name) {
1304         selected_property = &property;
1305         break;
1306       }
1307     }
1308     if (selected_property == nullptr) {
1309       return false;
1310     }
1311     if (i == property_path_parts.size() - 1) {
1312       // We've found a property at the final part of the path.
1313       return true;
1314     }
1315     if (selected_property->data_type() !=
1316         PropertyConfigProto::DataType::DOCUMENT) {
1317       // If this isn't final part of the path, but this property isn't a
1318       // document, so we know that this path doesn't exist.
1319       return false;
1320     }
1321     current_type_name = &selected_property->schema_type();
1322   }
1323 
1324   // We should never reach this point.
1325   return false;
1326 }
1327 
GetDebugInfo() const1328 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
1329     const {
1330   SchemaDebugInfoProto debug_info;
1331   if (has_schema_successfully_set_) {
1332     ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1333     *debug_info.mutable_schema() = *schema;
1334   }
1335   ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
1336   debug_info.set_crc(crc.Get());
1337   return debug_info;
1338 }
1339 
1340 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const1341 SchemaStore::ExpandTypePropertyMasks(
1342     const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
1343     const {
1344   std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
1345   for (const TypePropertyMask& type_field_mask : type_property_masks) {
1346     if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
1347       ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
1348                                      /*paths=*/{}};
1349       entry.paths.insert(type_field_mask.paths().begin(),
1350                          type_field_mask.paths().end());
1351       result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
1352     } else {
1353       auto schema_type_ids_or =
1354           GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
1355       // If we can't find the SchemaTypeIds, just throw it away
1356       if (!schema_type_ids_or.ok()) {
1357         continue;
1358       }
1359       const std::unordered_set<SchemaTypeId>* schema_type_ids =
1360           schema_type_ids_or.ValueOrDie();
1361       for (SchemaTypeId schema_type_id : *schema_type_ids) {
1362         auto schema_type_name_iter =
1363             reverse_schema_type_mapper_.find(schema_type_id);
1364         if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
1365           // This should never happen, unless there is an inconsistency or IO
1366           // error.
1367           ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
1368           continue;
1369         }
1370 
1371         auto iter = result_map.find(schema_type_id);
1372         if (iter == result_map.end()) {
1373           ExpandedTypePropertyMask entry{schema_type_name_iter->second,
1374                                          /*paths=*/{}};
1375           iter = result_map.insert({schema_type_id, std::move(entry)}).first;
1376         }
1377         iter->second.paths.insert(type_field_mask.paths().begin(),
1378                                   type_field_mask.paths().end());
1379       }
1380     }
1381   }
1382   std::vector<ExpandedTypePropertyMask> result;
1383   result.reserve(result_map.size());
1384   for (auto& entry : result_map) {
1385     result.push_back(std::move(entry.second));
1386   }
1387   return result;
1388 }
1389 
1390 libtextclassifier3::StatusOr<
1391     std::unordered_map<std::string, std::vector<std::string>>>
ConstructBlobPropertyMap() const1392 SchemaStore::ConstructBlobPropertyMap() const {
1393   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1394   std::unordered_map<std::string, std::vector<std::string>> blob_property_map;
1395   for (const SchemaTypeConfigProto& type_config : schema->types()) {
1396     SchemaPropertyIterator iterator(type_config, type_config_map_);
1397     std::vector<std::string> blob_properties;
1398 
1399     libtextclassifier3::Status status = iterator.Advance();
1400     while (status.ok()) {
1401       if (iterator.GetCurrentPropertyConfig().data_type() ==
1402           PropertyConfigProto::DataType::BLOB_HANDLE) {
1403         blob_properties.push_back(iterator.GetCurrentPropertyPath());
1404       }
1405       status = iterator.Advance();
1406     }
1407     if (!absl_ports::IsOutOfRange(status)) {
1408       return status;
1409     }
1410     if (!blob_properties.empty()) {
1411       blob_property_map.insert(
1412           {type_config.schema_type(), std::move(blob_properties)});
1413     }
1414   }
1415   return blob_property_map;
1416 }
1417 
ValidateSchemaDatabase(const SchemaProto & new_schema,const std::string & database) const1418 libtextclassifier3::Status SchemaStore::ValidateSchemaDatabase(
1419     const SchemaProto& new_schema, const std::string& database) const {
1420   if (!feature_flags_->enable_schema_database() || new_schema.types().empty()) {
1421     return libtextclassifier3::Status::OK;
1422   }
1423 
1424   // Loop through new_schema's types and validate it. The input SchemaProto
1425   // contains a list of SchemaTypeConfigProtos without deduplication. We need to
1426   // check that:
1427   // 1. All SchemaTypeConfigProtos have the same database value.
1428   // 2. The SchemaTypeConfigProtos's schema_type field is unique within both
1429   //    new_schema, as well as the existing schema (recorded in
1430   //    type_config_map_).
1431   for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
1432     // Check database consistency.
1433     if (database != type_config.database()) {
1434       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
1435           "Mismatch between the set schema request's database and the new "
1436           "schema types' database. Expected '",
1437           database, "' but got '", type_config.database(), "'."));
1438     }
1439 
1440     // Check type name uniqueness. This is only necessary if there is a
1441     // pre-existing schema.
1442     if (has_schema_successfully_set_) {
1443       auto iter = type_config_map_.find(type_config.schema_type());
1444       if (iter != type_config_map_.end() &&
1445           database != iter->second.database()) {
1446         return absl_ports::AlreadyExistsError(
1447             absl_ports::StrCat("schema_type name: '", type_config.schema_type(),
1448                                "' is already in use by a different database."));
1449       }
1450     }
1451   }
1452   return libtextclassifier3::Status::OK;
1453 }
1454 
1455 libtextclassifier3::StatusOr<SchemaProto>
GetFullSchemaProtoWithUpdatedDb(SchemaProto input_database_schema,const std::string & database_to_update) const1456 SchemaStore::GetFullSchemaProtoWithUpdatedDb(
1457     SchemaProto input_database_schema,
1458     const std::string& database_to_update) const {
1459   if (!feature_flags_->enable_schema_database()) {
1460     // If the schema database is not enabled, the input schema is already the
1461     // full schema, so we don't need to do any merges.
1462     return input_database_schema;
1463   }
1464 
1465   libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
1466   if (absl_ports::IsNotFound(schema_proto.status())) {
1467     // We don't have a pre-existing schema -- we can return the input database
1468     // schema as it's already the full schema.
1469     return input_database_schema;
1470   }
1471 
1472   if (!schema_proto.ok()) {
1473     // Real error.
1474     return schema_proto.status();
1475   }
1476 
1477   if (!has_schema_successfully_set_) {
1478     return absl_ports::InternalError(
1479         "Schema store was not initialized properly.");
1480   }
1481 
1482   // At this point, we have a pre-existing schema -- we need to merge the
1483   // updated database with the existing schema.
1484   if (database_type_map_.size() == 1 &&
1485       database_type_map_.find(database_to_update) != database_type_map_.end()) {
1486     // No other databases in the schema -- we can return the input database
1487     // schema.
1488     return input_database_schema;
1489   }
1490 
1491   const SchemaProto* existing_schema = schema_proto.ValueOrDie();
1492   SchemaProto full_schema;
1493 
1494   // 1. Add types from the existing schema, replacing existing types with the
1495   // input types if the database is the one being updated by the input schema.
1496   // - For database_to_update, we replace the existing types with the input
1497   //   types. Any existing type not included in input_database_schema is
1498   //   deleted.
1499   // - If there are more input types than existing types for database_to_update,
1500   //   the rest of the input types are appended to the end of the full_schema.
1501   // - If there are fewer input types than existing types for
1502   //   database_to_update, we shift forward all existing types that appear after
1503   //   the last input type.
1504   // - For existing types from other databases, we preserve the existing order
1505   //   after adding to full_schema. Note that the type-ids of existing types
1506   //   might still change if some types are deleted in the database_to_update as
1507   //   this will cause all subsequent types ids to shift forward.
1508   int input_schema_index = 0, existing_schema_index = 0;
1509   while (input_schema_index < input_database_schema.types().size() &&
1510          existing_schema_index < existing_schema->types().size()) {
1511     const SchemaTypeConfigProto& existing_type_config =
1512         existing_schema->types(existing_schema_index);
1513     SchemaTypeConfigProto& input_type_config =
1514         *input_database_schema.mutable_types(input_schema_index);
1515 
1516     if (existing_type_config.database() == database_to_update) {
1517       // If the database is the one being updated by the input schema, replace
1518       // the existing type with a type from the input schema.
1519       *full_schema.add_types() = std::move(input_type_config);
1520       ++input_schema_index;
1521     } else {
1522       *full_schema.add_types() = existing_type_config;
1523     }
1524     ++existing_schema_index;
1525   }
1526 
1527   // 2. Append remaining types to the end of the SchemaProto.
1528   for (; input_schema_index < input_database_schema.types().size();
1529        ++input_schema_index) {
1530     // Case 1: Append all remaining types from the input schema. This happens
1531     // when more types are added in input_database_schema than what's in the
1532     // existing schema. In this case, we've used up the space for the database
1533     // in the existing schema, so we can just append the rest of the types to
1534     // the end.
1535     SchemaTypeConfigProto& input_type_config =
1536         *input_database_schema.mutable_types(input_schema_index);
1537     *full_schema.add_types() = std::move(input_type_config);
1538   }
1539   for (; existing_schema_index < existing_schema->types().size();
1540        ++existing_schema_index) {
1541     // Case 2: Add remaining types from the existing schema, but skip the ones
1542     // that are from input_database, since existing types from input_database
1543     // are replaced with input_database_schema.
1544     if (existing_schema->types(existing_schema_index).database() !=
1545         database_to_update) {
1546       *full_schema.add_types() = existing_schema->types(existing_schema_index);
1547     }
1548   }
1549 
1550   return full_schema;
1551 }
1552 
1553 }  // namespace lib
1554 }  // namespace icing
1555