1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/schema-store.h"
16
17 #include <cinttypes>
18 #include <cstddef>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <optional>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/str_cat.h"
34 #include "icing/feature-flags.h"
35 #include "icing/file/destructible-directory.h"
36 #include "icing/file/file-backed-proto.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/file/version-util.h"
39 #include "icing/legacy/core/icing-string-util.h"
40 #include "icing/proto/debug.pb.h"
41 #include "icing/proto/document.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/schema.pb.h"
44 #include "icing/proto/search.pb.h"
45 #include "icing/proto/storage.pb.h"
46 #include "icing/schema/backup-schema-producer.h"
47 #include "icing/schema/joinable-property.h"
48 #include "icing/schema/schema-property-iterator.h"
49 #include "icing/schema/schema-type-manager.h"
50 #include "icing/schema/schema-util.h"
51 #include "icing/schema/scorable_property_manager.h"
52 #include "icing/schema/section.h"
53 #include "icing/store/document-filter-data.h"
54 #include "icing/store/dynamic-trie-key-mapper.h"
55 #include "icing/util/clock.h"
56 #include "icing/util/crc32.h"
57 #include "icing/util/logging.h"
58 #include "icing/util/status-macros.h"
59
60 namespace icing {
61 namespace lib {
62
63 namespace {
64
// Names of the files that make up a schema store. Each one is joined onto the
// store's base directory by the corresponding Make*Filename() helper.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// Character that separates the database prefix from the actual type name in a
// schema type name. This should be kept consistent with the delimiter used in
// AppSearch.
// See:
// https://cs.android.com/androidx/platform/frameworks/support/+/androidx-main:appsearch/appsearch-local-storage/src/main/java/androidx/appsearch/localstorage/util/PrefixUtil.java;l=42;drc=ffaf979c6f0cbd26caafd7a9d07a6bad12fe3a2a
constexpr char kAppSearchDatabaseDelimiter = '/';

// Size limit passed when creating the schema type mapper.
// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Giving
// each array 128KiB for storage means the entire DynamicTrieKeyMapper requires
// 384KiB.
constexpr int32_t kSchemaTypeMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
80
MakeHeaderFilename(const std::string & base_dir)81 std::string MakeHeaderFilename(const std::string& base_dir) {
82 return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
83 }
84
MakeSchemaFilename(const std::string & base_dir)85 std::string MakeSchemaFilename(const std::string& base_dir) {
86 return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
87 }
88
MakeOverlaySchemaFilename(const std::string & base_dir)89 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
90 return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
91 }
92
MakeSchemaTypeMapperFilename(const std::string & base_dir)93 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
94 return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
95 }
96
97 // Assuming that SchemaTypeIds are assigned to schema types based on their order
98 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
99 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)100 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
101 const SchemaProto& old_schema, const SchemaProto& new_schema) {
102 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
103
104 std::unordered_map<std::string, int> old_types_and_index;
105 for (int i = 0; i < old_schema.types().size(); ++i) {
106 old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
107 }
108
109 std::unordered_map<std::string, int> new_types_and_index;
110 for (int i = 0; i < new_schema.types().size(); ++i) {
111 new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
112 }
113
114 for (const auto& old_type_index : old_types_and_index) {
115 const auto& iter = new_types_and_index.find(old_type_index.first);
116 // We only care if the type exists in both the old and new schema. If the
117 // type has been deleted, then it'll be captured in
118 // SetSchemaResult.schema_types_deleted*. If the type has been added in the
119 // new schema then we also don't care because nothing needs to be updated.
120 if (iter != new_types_and_index.end()) {
121 // Since the SchemaTypeId of the schema type is just the index of it in
122 // the SchemaProto, compare the index and save it if it's not the same
123 if (old_type_index.second != iter->second) {
124 old_schema_type_ids_changed.emplace(old_type_index.second);
125 }
126 }
127 }
128
129 return old_schema_type_ids_changed;
130 }
131
// Extracts the database prefix from a schema type name.
//
// The schema type is expected to look like
// <database><delimiter><actual_type_name>. Everything before the first
// occurrence of `database_delimeter` is treated as the database.
//
// Returns an empty string when the delimiter does not occur in `schema_type`
// (i.e. the name is not in the database format).
std::string GetDatabaseFromSchemaType(const std::string& schema_type,
                                      char database_delimeter) {
  const std::string::size_type delimiter_pos =
      schema_type.find(database_delimeter);
  if (delimiter_pos == std::string::npos) {
    return std::string();
  }
  return std::string(schema_type, 0, delimiter_pos);
}
148
149 // For each schema type in the schema proto, parses out the database from the
150 // type name, and sets it as the database field in the input proto in
151 // place. The schema_type name field itself is not modified.
152 //
153 // If the schema type name does not contain an AppSearch database, then
154 // SchemaTypeConfigProto is not modified.
155 //
156 // Returns:
157 // - True if any SchemaTypeConfigProto in the schema proto is rewritten.
158 // - False otherwise.
ParseAndPopulateAppSearchDatabaseField(SchemaProto & schema_proto)159 bool ParseAndPopulateAppSearchDatabaseField(SchemaProto& schema_proto) {
160 bool populated_database_field = false;
161 for (auto& type : *schema_proto.mutable_types()) {
162 std::string database = GetDatabaseFromSchemaType(
163 type.schema_type(), kAppSearchDatabaseDelimiter);
164 if (type.database() != database) {
165 type.set_database(std::move(database));
166 populated_database_field = true;
167 }
168 }
169 return populated_database_field;
170 }
171
172 // Compares the schema types list defined in two schemas, ignoring order.
173 //
174 // Requires: old_schema.schema_database() == new_schema.schema_database()
175 //
176 // Returns: true if the types in `new_schema` are identical to the types
177 // in `old_schema`, otherwise returns false.
AreSchemaTypesEqual(const SchemaProto & old_schema,const SchemaProto & new_schema)178 bool AreSchemaTypesEqual(const SchemaProto& old_schema,
179 const SchemaProto& new_schema) {
180 if (old_schema.types().size() != new_schema.types().size()) {
181 return false;
182 }
183
184 // Create a map of old schema types to and check that the new schema's types
185 // are identical.
186 std::unordered_map<std::string_view, const SchemaTypeConfigProto&>
187 old_schema_types;
188 old_schema_types.reserve(old_schema.types().size());
189 for (const SchemaTypeConfigProto& old_type : old_schema.types()) {
190 old_schema_types.emplace(old_type.schema_type(), old_type);
191 }
192 for (const SchemaTypeConfigProto& new_type : new_schema.types()) {
193 auto old_type_itr = old_schema_types.find(new_type.schema_type());
194 if (old_type_itr == old_schema_types.end()) {
195 return false;
196 }
197 if (old_type_itr->second.SerializeAsString() !=
198 new_type.SerializeAsString()) {
199 return false;
200 }
201 }
202
203 return true;
204 }
205
206 } // namespace
207
208 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,std::string path)209 SchemaStore::Header::Read(const Filesystem* filesystem, std::string path) {
210 if (!filesystem->FileExists(path.c_str())) {
211 return absl_ports::NotFoundError(
212 absl_ports::StrCat("Header file is empty: ", path));
213 }
214
215 SerializedHeader serialized_header;
216 ScopedFd sfd(filesystem->OpenForWrite(path.c_str()));
217 if (!sfd.is_valid()) {
218 return absl_ports::InternalError("Unable to open or create header file.");
219 }
220
221 // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
222 int64_t file_size = filesystem->GetFileSize(sfd.get());
223 if (file_size == sizeof(LegacyHeader)) {
224 LegacyHeader legacy_header;
225 if (!filesystem->Read(sfd.get(), &legacy_header, sizeof(legacy_header))) {
226 return absl_ports::InternalError(
227 absl_ports::StrCat("Couldn't read: ", path));
228 }
229 if (legacy_header.magic != Header::kMagic) {
230 return absl_ports::InternalError(
231 absl_ports::StrCat("Invalid header kMagic for file: ", path));
232 }
233 serialized_header.checksum = legacy_header.checksum;
234 } else if (file_size == sizeof(SerializedHeader)) {
235 if (!filesystem->Read(sfd.get(), &serialized_header,
236 sizeof(serialized_header))) {
237 return absl_ports::InternalError(
238 absl_ports::StrCat("Couldn't read: ", path));
239 }
240 if (serialized_header.magic != Header::kMagic) {
241 return absl_ports::InternalError(
242 absl_ports::StrCat("Invalid header kMagic for file: ", path));
243 }
244 } else if (file_size != 0) {
245 // file is neither the legacy header, the new header nor empty. Something is
246 // wrong here.
247 int legacy_header_size = sizeof(LegacyHeader);
248 int header_size = sizeof(SerializedHeader);
249 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
250 "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
251 legacy_header_size, header_size));
252 }
253 return Header(serialized_header, std::move(path), std::move(sfd), filesystem);
254 }
255
Write()256 libtextclassifier3::Status SchemaStore::Header::Write() {
257 if (!dirty_) {
258 return libtextclassifier3::Status::OK;
259 }
260 if (!header_fd_.is_valid() && !filesystem_->FileExists(path_.c_str())) {
261 header_fd_.reset(filesystem_->OpenForWrite(path_.c_str()));
262 }
263 // This should overwrite the header.
264 if (!header_fd_.is_valid() ||
265 !filesystem_->PWrite(header_fd_.get(), /*offset=*/0, &serialized_header_,
266 sizeof(serialized_header_))) {
267 return absl_ports::InternalError(
268 absl_ports::StrCat("Failed to write SchemaStore header"));
269 }
270 dirty_ = false;
271 return libtextclassifier3::Status::OK;
272 }
273
PersistToDisk()274 libtextclassifier3::Status SchemaStore::Header::PersistToDisk() {
275 if (dirty_) {
276 ICING_RETURN_IF_ERROR(Write());
277 }
278 // This should overwrite the header.
279 if (!header_fd_.is_valid() || !filesystem_->DataSync(header_fd_.get())) {
280 return absl_ports::InternalError(
281 absl_ports::StrCat("Failed to sync SchemaStore header."));
282 }
283 return libtextclassifier3::Status::OK;
284 }
285
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,InitializeStatsProto * initialize_stats)286 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
287 const Filesystem* filesystem, const std::string& base_dir,
288 const Clock* clock, const FeatureFlags* feature_flags,
289 InitializeStatsProto* initialize_stats) {
290 ICING_RETURN_ERROR_IF_NULL(filesystem);
291 ICING_RETURN_ERROR_IF_NULL(clock);
292 ICING_RETURN_ERROR_IF_NULL(feature_flags);
293
294 if (!filesystem->DirectoryExists(base_dir.c_str())) {
295 return absl_ports::FailedPreconditionError(
296 "Schema store base directory does not exist!");
297 }
298 std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
299 new SchemaStore(filesystem, base_dir, clock, feature_flags));
300 ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
301 return schema_store;
302 }
303
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,SchemaProto schema)304 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
305 const Filesystem* filesystem, const std::string& base_dir,
306 const Clock* clock, const FeatureFlags* feature_flags, SchemaProto schema) {
307 ICING_RETURN_ERROR_IF_NULL(filesystem);
308 ICING_RETURN_ERROR_IF_NULL(clock);
309 ICING_RETURN_ERROR_IF_NULL(feature_flags);
310
311 if (!filesystem->DirectoryExists(base_dir.c_str())) {
312 return absl_ports::FailedPreconditionError(
313 "Schema store base directory does not exist!");
314 }
315 std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
316 new SchemaStore(filesystem, base_dir, clock, feature_flags));
317 ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
318 return schema_store;
319 }
320
321 /* static */ libtextclassifier3::Status
PopulateSchemaDatabaseFieldForSchemaFile(const Filesystem * filesystem,const std::string & schema_filename)322 SchemaStore::PopulateSchemaDatabaseFieldForSchemaFile(
323 const Filesystem* filesystem, const std::string& schema_filename) {
324 FileBackedProto<SchemaProto> schema_file(*filesystem, schema_filename);
325 auto schema_proto_or = schema_file.Read();
326 if (absl_ports::IsNotFound(schema_proto_or.status())) {
327 // Don't have an existing schema proto, that's fine
328 return libtextclassifier3::Status::OK;
329 } else if (!schema_proto_or.ok()) {
330 // Real error when trying to read the existing schema
331 return schema_proto_or.status();
332 }
333
334 SchemaProto schema_proto_copy = *schema_proto_or.ValueOrDie();
335 bool schema_changed =
336 ParseAndPopulateAppSearchDatabaseField(schema_proto_copy);
337 if (!schema_changed) {
338 // Nothing to do if the schema is not changed.
339 return libtextclassifier3::Status::OK;
340 }
341
342 // Create a temporary schema file and schema proto copy to update the
343 // schema.
344 std::string temp_schema_filename = schema_filename + ".tmp";
345 if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
346 return absl_ports::InternalError(
347 "Unable to delete temp schema file to prepare for schema database "
348 "migration.");
349 }
350
351 {
352 FileBackedProto<SchemaProto> temp_schema_file(*filesystem,
353 temp_schema_filename);
354 ICING_RETURN_IF_ERROR(temp_schema_file.Write(
355 std::make_unique<SchemaProto>(schema_proto_copy)));
356 }
357
358 // Swap the temp schema file with the original schema file.
359 if (!filesystem->SwapFiles(temp_schema_filename.c_str(),
360 schema_filename.c_str())) {
361 return absl_ports::InternalError(
362 "Unable to apply migrated schema with database due to failed swap!");
363 }
364 // Clean up the temp schema file.
365 if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
366 return absl_ports::InternalError(
367 "Unable to delete temp schema file after schema database migration.");
368 }
369
370 return libtextclassifier3::Status::OK;
371 }
372
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)373 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
374 const Filesystem* filesystem, const std::string& base_dir, Header& header) {
375 std::string header_filename = MakeHeaderFilename(base_dir);
376 if (header.overlay_created()) {
377 header.SetOverlayInfo(
378 /*overlay_created=*/false,
379 /*min_overlay_version_compatibility=*/std::numeric_limits<
380 int32_t>::max());
381 ICING_RETURN_IF_ERROR(header.Write());
382 }
383 std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
384 if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
385 return absl_ports::InternalError(
386 "Unable to delete stale schema overlay file.");
387 }
388 return libtextclassifier3::Status::OK;
389 }
390
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version,bool perform_schema_database_migration)391 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
392 const Filesystem* filesystem, const std::string& base_dir,
393 version_util::StateChange version_state_change, int32_t new_version,
394 bool perform_schema_database_migration) {
395 if (!filesystem->DirectoryExists(base_dir.c_str())) {
396 // Situations when schema store directory doesn't exist:
397 // - Initializing new Icing instance: don't have to do anything now. The
398 // directory will be created later.
399 // - Lose schema store: there is nothing we can do now. The logic will be
400 // handled later by initializing.
401 //
402 // Therefore, just simply return OK here.
403 return libtextclassifier3::Status::OK;
404 }
405
406 ICING_RETURN_IF_ERROR(HandleOverlaySchemaForVersionChange(
407 filesystem, base_dir, version_state_change, new_version));
408
409 // Perform schema database migration if needed.
410 // - This populates the the database field in the schema proto and writes it
411 // to the schema file.
412 // - If the overlay schema file exists at this point, does the same for the
413 // overlay schema.
414 if (perform_schema_database_migration) {
415 std::string base_schema_filename = MakeSchemaFilename(base_dir);
416 ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
417 filesystem, base_schema_filename));
418
419 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
420 if (filesystem->FileExists(overlay_schema_filename.c_str())) {
421 ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
422 filesystem, overlay_schema_filename));
423 }
424 }
425
426 return libtextclassifier3::Status::OK;
427 }
428
429 /* static */ libtextclassifier3::Status
HandleOverlaySchemaForVersionChange(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)430 SchemaStore::HandleOverlaySchemaForVersionChange(
431 const Filesystem* filesystem, const std::string& base_dir,
432 version_util::StateChange version_state_change, int32_t new_version) {
433 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
434 if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
435 // The overlay doesn't exist. So there should be nothing particularly
436 // interesting to worry about.
437 return libtextclassifier3::Status::OK;
438 }
439
440 std::string header_filename = MakeHeaderFilename(base_dir);
441 libtextclassifier3::StatusOr<Header> header_or;
442 switch (version_state_change) {
443 // No necessary actions for normal upgrades or no version change. The data
444 // that was produced by the previous version is fully compatible with this
445 // version and there's no stale data for us to clean up.
446 // The same is true for a normal rollforward. A normal rollforward implies
447 // that the previous version was one that understood the concept of the
448 // overlay schema and would have already discarded it if it was unusable.
449 case version_util::StateChange::kVersionZeroUpgrade:
450 // fallthrough
451 case version_util::StateChange::kUpgrade:
452 // fallthrough
453 case version_util::StateChange::kRollForward:
454 // fallthrough
455 case version_util::StateChange::kCompatible:
456 return libtextclassifier3::Status::OK;
457 case version_util::StateChange::kVersionZeroRollForward: {
458 // We've rolled forward. The schema overlay file, if it exists, is
459 // possibly stale. We must throw it out.
460 header_or = Header::Read(filesystem, header_filename);
461 ICING_RETURN_IF_ERROR(header_or.status());
462 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
463 header_or.ValueOrDie());
464 }
465 case version_util::StateChange::kRollBack: {
466 header_or = Header::Read(filesystem, header_filename);
467 ICING_RETURN_IF_ERROR(header_or.status());
468 if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
469 new_version) {
470 // We've been rolled back, but the overlay schema claims that it
471 // supports this version. So we can safely return.
472 return libtextclassifier3::Status::OK;
473 }
474 // We've been rolled back to a version that the overlay schema doesn't
475 // support. We must throw it out.
476 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
477 header_or.ValueOrDie());
478 }
479 case version_util::StateChange::kUndetermined:
480 // It's not clear what version we're on, but the base schema should always
481 // be safe to use. Throw out the overlay.
482 header_or = Header::Read(filesystem, header_filename);
483 ICING_RETURN_IF_ERROR(header_or.status());
484 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
485 header_or.ValueOrDie());
486 }
487 return libtextclassifier3::Status::OK;
488 }
489
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)490 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
491 const Filesystem* filesystem, const std::string& base_dir) {
492 // Schema type mapper
493 return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
494 *filesystem, MakeSchemaTypeMapperFilename(base_dir));
495 }
496
// Stores the given dependencies and sets up the schema file for `base_dir`.
// Nothing is loaded here; the schema is loaded by Initialize(), which Create()
// calls right after construction.
SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
                         const Clock* clock, const FeatureFlags* feature_flags)
    : filesystem_(filesystem),
      base_dir_(std::move(base_dir)),
      clock_(clock),
      feature_flags_(feature_flags),
      // Must use the member base_dir_ here: the `base_dir` parameter has
      // already been moved from above.
      // NOTE(review): this relies on base_dir_ being declared before
      // schema_file_ in the class - confirm in the header.
      schema_file_(filesystem, MakeSchemaFilename(base_dir_)) {}
504
~SchemaStore()505 SchemaStore::~SchemaStore() {
506 if (has_schema_successfully_set_ && schema_type_mapper_ != nullptr &&
507 schema_type_manager_ != nullptr) {
508 if (!PersistToDisk().ok()) {
509 ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
510 }
511 }
512 }
513
Initialize(SchemaProto new_schema)514 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
515 ICING_RETURN_IF_ERROR(LoadSchema());
516 if (!absl_ports::IsNotFound(GetSchema().status())) {
517 return absl_ports::FailedPreconditionError(
518 "Incorrectly tried to initialize schema store with a new schema, when "
519 "one is already set!");
520 }
521 // ResetSchemaFileIfNeeded() will be called in InitializeInternal below.
522 ICING_RETURN_IF_ERROR(
523 schema_file_.Write(std::make_unique<SchemaProto>(std::move(new_schema))));
524 return InitializeInternal(/*create_overlay_if_necessary=*/true,
525 /*initialize_stats=*/nullptr);
526 }
527
Initialize(InitializeStatsProto * initialize_stats)528 libtextclassifier3::Status SchemaStore::Initialize(
529 InitializeStatsProto* initialize_stats) {
530 ICING_RETURN_IF_ERROR(LoadSchema());
531 auto schema_proto_or = GetSchema();
532 if (absl_ports::IsNotFound(schema_proto_or.status())) {
533 // Don't have an existing schema proto, that's fine
534 return libtextclassifier3::Status::OK;
535 } else if (!schema_proto_or.ok()) {
536 // Real error when trying to read the existing schema
537 return schema_proto_or.status();
538 }
539 return InitializeInternal(/*create_overlay_if_necessary=*/false,
540 initialize_stats);
541 }
542
LoadSchema()543 libtextclassifier3::Status SchemaStore::LoadSchema() {
544 libtextclassifier3::StatusOr<Header> header_or =
545 Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
546 bool header_exists = false;
547 if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
548 return header_or.status();
549 } else if (!header_or.ok()) {
550 header_ =
551 std::make_unique<Header>(filesystem_, MakeHeaderFilename(base_dir_));
552 } else {
553 header_exists = true;
554 header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
555 }
556
557 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
558 bool overlay_schema_file_exists =
559 filesystem_->FileExists(overlay_schema_filename.c_str());
560
561 libtextclassifier3::Status base_schema_state = schema_file_.Read().status();
562 if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
563 ResetSchemaFileIfNeeded();
564 return base_schema_state;
565 }
566
567 // There are three valid cases:
568 // 1. Everything is missing. This is an empty schema store.
569 if (!base_schema_state.ok() && !overlay_schema_file_exists &&
570 !header_exists) {
571 ResetSchemaFileIfNeeded();
572 return libtextclassifier3::Status::OK;
573 }
574
575 // 2. There never was a overlay schema. The header exists, the base schema
576 // exists and the header says the overlay schema shouldn't exist
577 if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
578 !header_->overlay_created()) {
579 // Nothing else to do. Just return safely.
580 ResetSchemaFileIfNeeded();
581 return libtextclassifier3::Status::OK;
582 }
583
584 // 3. There is an overlay schema and a base schema and a header. The header
585 // says that the overlay schema should exist.
586 if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
587 header_->overlay_created()) {
588 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
589 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
590 ResetSchemaFileIfNeeded();
591 return libtextclassifier3::Status::OK;
592 }
593
594 // Something has gone wrong. We've lost part of the schema ground truth.
595 // Return an error.
596 bool overlay_created = header_->overlay_created();
597 bool base_schema_exists = base_schema_state.ok();
598 ResetSchemaFileIfNeeded();
599 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
600 "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
601 "base schema exists: %d, overlay_schema_exists: %d",
602 header_exists, overlay_created, base_schema_exists,
603 overlay_schema_file_exists));
604 }
605
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)606 libtextclassifier3::Status SchemaStore::InitializeInternal(
607 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
608 if (!InitializeDerivedFiles().ok()) {
609 ICING_VLOG(3)
610 << "Couldn't find derived files or failed to initialize them, "
611 "regenerating derived files for SchemaStore.";
612 std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
613 if (initialize_stats != nullptr) {
614 initialize_stats->set_schema_store_recovery_cause(
615 InitializeStatsProto::IO_ERROR);
616 }
617 ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
618 if (initialize_stats != nullptr) {
619 initialize_stats->set_schema_store_recovery_latency_ms(
620 regenerate_timer->GetElapsedMilliseconds());
621 }
622 }
623
624 if (initialize_stats != nullptr) {
625 initialize_stats->set_num_schema_types(type_config_map_.size());
626 }
627 has_schema_successfully_set_ = true;
628
629 return libtextclassifier3::Status::OK;
630 }
631
InitializeDerivedFiles()632 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
633 ICING_ASSIGN_OR_RETURN(
634 schema_type_mapper_,
635 DynamicTrieKeyMapper<SchemaTypeId>::Create(
636 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
637 kSchemaTypeMapperMaxSize));
638
639 Crc32 expected_checksum(header_->checksum());
640 ICING_ASSIGN_OR_RETURN(Crc32 checksum, GetChecksum());
641 if (checksum != expected_checksum) {
642 return absl_ports::InternalError(
643 "Combined checksum of SchemaStore was inconsistent");
644 }
645
646 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
647 return libtextclassifier3::Status::OK;
648 }
649
RegenerateDerivedFiles(bool create_overlay_if_necessary)650 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
651 bool create_overlay_if_necessary) {
652 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
653
654 ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
655
656 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
657 // Assign a SchemaTypeId to the type
658 ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
659 type_config.schema_type(), schema_type_mapper_->num_keys()));
660 }
661 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
662
663 if (create_overlay_if_necessary) {
664 BackupSchemaProducer producer(feature_flags_);
665 ICING_ASSIGN_OR_RETURN(
666 BackupSchemaProducer::BackupSchemaResult backup_result,
667 producer.Produce(*schema_proto,
668 schema_type_manager_->section_manager()));
669
670 if (backup_result.backup_schema_produced) {
671 // The overlay schema should be written to the overlay file location.
672 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
673 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
674 auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
675 ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
676
677 // The base schema should be written to the original file
678 auto base_schema_ptr =
679 std::make_unique<SchemaProto>(std::move(backup_result.backup_schema));
680 ICING_RETURN_IF_ERROR(schema_file_.Write(std::move(base_schema_ptr)));
681
682 // LINT.IfChange(min_overlay_version_compatibility)
683 // Although the current version is 5, the schema is compatible with
684 // version 1, so min_overlay_version_compatibility should be 1.
685 int32_t min_overlay_version_compatibility = version_util::kVersionOne;
686 // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion)
687 header_->SetOverlayInfo(
688 /*overlay_created=*/true, min_overlay_version_compatibility);
689 // Rebuild in memory data - references to the old schema will be invalid
690 // now.
691 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
692 }
693 }
694
695 // Write the header
696 ICING_RETURN_IF_ERROR(UpdateChecksum());
697 ResetSchemaFileIfNeeded();
698 return libtextclassifier3::Status::OK;
699 }
700
BuildInMemoryCache()701 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
702 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
703 ICING_ASSIGN_OR_RETURN(
704 SchemaUtil::InheritanceMap inheritance_map,
705 SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
706
707 reverse_schema_type_mapper_.clear();
708 database_type_map_.clear();
709 type_config_map_.clear();
710 schema_subtype_id_map_.clear();
711 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
712 const std::string& database = type_config.database();
713 const std::string& type_name = type_config.schema_type();
714 ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
715 schema_type_mapper_->Get(type_name));
716
717 // Build reverse_schema_type_mapper_
718 reverse_schema_type_mapper_.insert({type_id, type_name});
719
720 // Build database_type_map_
721 database_type_map_[database].push_back(type_name);
722
723 // Build type_config_map_
724 type_config_map_.insert({type_name, type_config});
725
726 // Build schema_subtype_id_map_
727 std::unordered_set<SchemaTypeId>& subtype_id_set =
728 schema_subtype_id_map_[type_id];
729 // Find all child types
730 auto child_types_names = inheritance_map.find(type_name);
731 if (child_types_names != inheritance_map.end()) {
732 subtype_id_set.reserve(child_types_names->second.size() + 1);
733 for (const auto& [child_type_name, is_direct_child] :
734 child_types_names->second) {
735 ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
736 schema_type_mapper_->Get(child_type_name));
737 subtype_id_set.insert(child_type_id);
738 }
739 }
740 // Every type is a subtype of itself.
741 subtype_id_set.insert(type_id);
742 }
743
744 // Build schema_type_manager_
745 ICING_ASSIGN_OR_RETURN(
746 schema_type_manager_,
747 SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
748
749 scorable_property_manager_ = std::make_unique<ScorablePropertyManager>();
750
751 return libtextclassifier3::Status::OK;
752 }
753
ResetSchemaTypeMapper()754 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
755 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
756 schema_type_mapper_.reset();
757 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
758 // that can support error logging.
759 libtextclassifier3::Status status =
760 DynamicTrieKeyMapper<SchemaTypeId>::Delete(
761 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
762 if (!status.ok()) {
763 ICING_LOG(ERROR) << status.error_message()
764 << "Failed to delete old schema_type mapper";
765 return status;
766 }
767 ICING_ASSIGN_OR_RETURN(
768 schema_type_mapper_,
769 DynamicTrieKeyMapper<SchemaTypeId>::Create(
770 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
771 kSchemaTypeMapperMaxSize));
772
773 return libtextclassifier3::Status::OK;
774 }
775
GetChecksum() const776 libtextclassifier3::StatusOr<Crc32> SchemaStore::GetChecksum() const {
777 ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_.GetChecksum());
778 // We've gotten the schema_checksum successfully. Sadly, we still need to
779 // differentiate between an existing, but empty schema and a non-existent
780 // schema (both of which will have a checksum of 0). For existing, but empty
781 // schemas, we need to continue with the checksum calculation of the other
782 // components.
783 if (schema_checksum == Crc32() && !has_schema_successfully_set_) {
784 return schema_checksum;
785 }
786
787 Crc32 total_checksum;
788 total_checksum.Append(std::to_string(schema_checksum.Get()));
789 if (overlay_schema_file_ != nullptr) {
790 ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
791 overlay_schema_file_->GetChecksum());
792 total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
793 }
794
795 ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
796 schema_type_mapper_->GetChecksum());
797 total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
798 return total_checksum;
799 }
800
UpdateChecksum()801 libtextclassifier3::StatusOr<Crc32> SchemaStore::UpdateChecksum() {
802 ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_.GetChecksum());
803 // We've gotten the schema_checksum successfully. Sadly, we still need to
804 // differentiate between an existing, but empty schema and a non-existent
805 // schema (both of which will have a checksum of 0). For existing, but empty
806 // schemas, we need to continue with the checksum calculation of the other
807 // components.
808 if (schema_checksum == Crc32() && !has_schema_successfully_set_) {
809 return schema_checksum;
810 }
811 Crc32 total_checksum;
812 total_checksum.Append(std::to_string(schema_checksum.Get()));
813
814 if (overlay_schema_file_ != nullptr) {
815 ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
816 overlay_schema_file_->GetChecksum());
817 total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
818 }
819
820 ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
821 schema_type_mapper_->UpdateChecksum());
822 total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
823
824 header_->set_checksum(total_checksum.Get());
825 ICING_RETURN_IF_ERROR(header_->Write());
826 return total_checksum;
827 }
828
GetSchema() const829 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
830 const {
831 if (overlay_schema_file_ != nullptr) {
832 return overlay_schema_file_->Read();
833 }
834
835 return schema_file_.Read();
836 }
837
GetSchema(const std::string & database) const838 libtextclassifier3::StatusOr<SchemaProto> SchemaStore::GetSchema(
839 const std::string& database) const {
840 if (!has_schema_successfully_set_) {
841 return absl_ports::NotFoundError("No schema found.");
842 }
843
844 const auto database_type_map_itr_ = database_type_map_.find(database);
845 if (database_type_map_itr_ == database_type_map_.end()) {
846 return absl_ports::NotFoundError(
847 absl_ports::StrCat("No schema found for database '", database, "'."));
848 }
849
850 SchemaProto schema_proto;
851 for (const std::string& type_name : database_type_map_itr_->second) {
852 ICING_ASSIGN_OR_RETURN(const SchemaTypeConfigProto* type_config,
853 GetSchemaTypeConfig(type_name));
854 *schema_proto.add_types() = *type_config;
855 }
856 return schema_proto;
857 }
858
859 // TODO - b/337913932 - Remove this method once all callers are migrated to
860 // SetSchema(SetSchemaRequestProto&& set_schema_request). This should just be
861 // used in our tests.
862 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(SchemaProto new_schema,bool ignore_errors_and_delete_documents)863 SchemaStore::SetSchema(SchemaProto new_schema,
864 bool ignore_errors_and_delete_documents) {
865 SetSchemaRequestProto set_schema_request;
866 *set_schema_request.mutable_schema() = std::move(new_schema);
867 set_schema_request.set_ignore_errors_and_delete_documents(
868 ignore_errors_and_delete_documents);
869
870 return SetSchema(std::move(set_schema_request));
871 }
872
873 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(SetSchemaRequestProto && set_schema_request)874 SchemaStore::SetSchema(SetSchemaRequestProto&& set_schema_request) {
875 bool ignore_errors_and_delete_documents =
876 set_schema_request.ignore_errors_and_delete_documents();
877
878 if (feature_flags_->enable_schema_database()) {
879 // Step 1: (Only required if schema database is enabled)
880 // Do some preliminary checks on the new schema before formal validation and
881 // delta computation. This checks that:
882 // - The database field in the new schema's types match the provided
883 // database.
884 // - The new schema's type names are not already in use from other
885 // databases.
886 ICING_RETURN_IF_ERROR(ValidateSchemaDatabase(
887 set_schema_request.schema(), set_schema_request.database()));
888
889 // Step 2: Schema validation and delta computation -- try to get the
890 // existing schema for the database to compare to the new schema.
891 libtextclassifier3::StatusOr<SchemaProto> schema_proto =
892 GetSchema(set_schema_request.database());
893 if (absl_ports::IsNotFound(schema_proto.status())) {
894 // Case 1: No preexisting schema for this database.
895 return SetInitialSchemaForDatabase(
896 std::move(*set_schema_request.mutable_schema()),
897 set_schema_request.database(), ignore_errors_and_delete_documents);
898 }
899
900 if (!schema_proto.ok()) {
901 // Case 2: Real error
902 return schema_proto.status();
903 }
904
905 // Case 3: At this point, we're guaranteed that we have an existing schema
906 // for this database.
907 const SchemaProto& old_schema = schema_proto.ValueOrDie();
908 return SetSchemaWithDatabaseOverride(
909 std::move(*set_schema_request.mutable_schema()), old_schema,
910 set_schema_request.database(), ignore_errors_and_delete_documents);
911 }
912
913 // Get the full schema if schema database is disabled.
914 libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
915 if (absl_ports::IsNotFound(schema_proto.status())) {
916 // Case 1: No preexisting schema
917 return SetInitialSchemaForDatabase(
918 std::move(*set_schema_request.mutable_schema()),
919 set_schema_request.database(), ignore_errors_and_delete_documents);
920 }
921
922 if (!schema_proto.ok()) {
923 // Case 2: Real error
924 return schema_proto.status();
925 }
926
927 // Case 3: At this point, we're guaranteed that we have an existing schema
928 const SchemaProto& old_schema = *schema_proto.ValueOrDie();
929 return SetSchemaWithDatabaseOverride(
930 std::move(*set_schema_request.mutable_schema()), old_schema,
931 set_schema_request.database(), ignore_errors_and_delete_documents);
932 }
933
934 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetInitialSchemaForDatabase(SchemaProto new_schema,const std::string & database,bool ignore_errors_and_delete_documents)935 SchemaStore::SetInitialSchemaForDatabase(
936 SchemaProto new_schema, const std::string& database,
937 bool ignore_errors_and_delete_documents) {
938 SetSchemaResult result;
939
940 ICING_RETURN_IF_ERROR(SchemaUtil::Validate(new_schema, *feature_flags_));
941
942 result.success = true;
943 for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
944 result.schema_types_new_by_name.insert(type_config.schema_type());
945 }
946 // Get the full new SchemaProto that is a combination of the existing schema
947 // and new_schema. This is needed as we can only write the full proto to the
948 // schema file.
949 ICING_ASSIGN_OR_RETURN(
950 SchemaProto full_new_schema,
951 GetFullSchemaProtoWithUpdatedDb(std::move(new_schema), database));
952 ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
953 has_schema_successfully_set_ = true;
954
955 return result;
956 }
957
958 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchemaWithDatabaseOverride(SchemaProto new_schema,const SchemaProto & old_schema,const std::string & database,bool ignore_errors_and_delete_documents)959 SchemaStore::SetSchemaWithDatabaseOverride(
960 SchemaProto new_schema, const SchemaProto& old_schema,
961 const std::string& database, bool ignore_errors_and_delete_documents) {
962 // Assume we can set the schema unless proven otherwise.
963 SetSchemaResult result;
964 result.success = true;
965
966 if (feature_flags_->enable_schema_database()) {
967 // Sanity check to make sure that we're comparing schemas from the same
968 // database.
969 // The new code path ensures that old_schema contains types from exactly one
970 // database since it's obtained using GetSchema(database), which is
971 // guaranteed to only return types from the single provided database.
972 libtextclassifier3::Status validate_old_schema_database =
973 ValidateSchemaDatabase(old_schema, database);
974 if (!validate_old_schema_database.ok()) {
975 return absl_ports::InvalidArgumentError(
976 "Schema database mismatch between new and old schemas. This should "
977 "never happen");
978 }
979
980 // Check if the schema types are the same between the new and old schema,
981 // ignoring order.
982 if (AreSchemaTypesEqual(new_schema, old_schema)) {
983 return result;
984 }
985 } else {
986 // Old equality check that is sensitive to type definition order.
987 if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
988 // Same schema as before. No need to update anything
989 return result;
990 }
991 }
992
993 // Different schema -- we need to validate the schema and track the
994 // differences to see if we can still write it.
995 //
996 // Validate the new schema and compute the delta between the old and new
997 // schema.
998 ICING_ASSIGN_OR_RETURN(SchemaUtil::DependentMap new_dependent_map,
999 SchemaUtil::Validate(new_schema, *feature_flags_));
1000 SchemaUtil::SchemaDelta schema_delta = SchemaUtil::ComputeCompatibilityDelta(
1001 old_schema, new_schema, new_dependent_map, *feature_flags_);
1002
1003 result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
1004 result.schema_types_changed_fully_compatible_by_name =
1005 std::move(schema_delta.schema_types_changed_fully_compatible);
1006 result.schema_types_index_incompatible_by_name =
1007 std::move(schema_delta.schema_types_index_incompatible);
1008 result.schema_types_join_incompatible_by_name =
1009 std::move(schema_delta.schema_types_join_incompatible);
1010 result.schema_types_scorable_property_inconsistent_by_name =
1011 std::move(schema_delta.schema_types_scorable_property_inconsistent);
1012
1013 for (const std::string& schema_type : schema_delta.schema_types_deleted) {
1014 // We currently don't support deletions, so mark this as not possible.
1015 // This will change once we allow force-set schemas.
1016 result.success = false;
1017
1018 result.schema_types_deleted_by_name.emplace(schema_type);
1019
1020 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1021 GetSchemaTypeId(schema_type));
1022 result.schema_types_deleted_by_id.emplace(schema_type_id);
1023 }
1024
1025 for (const std::string& schema_type :
1026 schema_delta.schema_types_incompatible) {
1027 // We currently don't support incompatible schemas, so mark this as
1028 // not possible. This will change once we allow force-set schemas.
1029 result.success = false;
1030
1031 result.schema_types_incompatible_by_name.emplace(schema_type);
1032
1033 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1034 GetSchemaTypeId(schema_type));
1035 result.schema_types_incompatible_by_id.emplace(schema_type_id);
1036 }
1037
1038 // Get the full new SchemaProto that is a combination of the existing schema
1039 // and new_schema. This is needed to calculate the updated SchemaTypeIds, and
1040 // for writing the full proto to the schema file.
1041 ICING_ASSIGN_OR_RETURN(
1042 SchemaProto full_new_schema,
1043 GetFullSchemaProtoWithUpdatedDb(std::move(new_schema), database));
1044
1045 // We still need to update old_schema_type_ids_changed. We need to retrieve
1046 // the entire old schema for this, as type ids are assigned for the entire
1047 // schema, and not on a per-database level.
1048 //
1049 // SchemaTypeIds changing is fine, we can update the DocumentStore.
1050 ICING_ASSIGN_OR_RETURN(const SchemaProto* full_old_schema, GetSchema());
1051 result.old_schema_type_ids_changed =
1052 SchemaTypeIdsChanged(*full_old_schema, full_new_schema);
1053
1054 // We can force set the schema if the caller has told us to ignore any errors
1055 result.success = result.success || ignore_errors_and_delete_documents;
1056
1057 // Step 3: Apply the schema change if success. This updates persisted files
1058 // and derived data structures.
1059 if (result.success) {
1060 ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
1061 has_schema_successfully_set_ = true;
1062 }
1063
1064 // Convert schema types to SchemaTypeIds after the new schema is applied.
1065 if (feature_flags_->enable_scorable_properties()) {
1066 for (const std::string& schema_type :
1067 result.schema_types_scorable_property_inconsistent_by_name) {
1068 libtextclassifier3::StatusOr<SchemaTypeId> schema_type_id_or =
1069 GetSchemaTypeId(schema_type);
1070 if (!schema_type_id_or.ok()) {
1071 if (absl_ports::IsNotFound(schema_type_id_or.status())) {
1072 continue;
1073 }
1074 return schema_type_id_or.status();
1075 }
1076 result.schema_types_scorable_property_inconsistent_by_id.insert(
1077 schema_type_id_or.ValueOrDie());
1078 }
1079 }
1080
1081 return result;
1082 }
1083
ApplySchemaChange(SchemaProto new_schema)1084 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
1085 SchemaProto new_schema) {
1086 // We need to ensure that we either 1) successfully set the schema and
1087 // update all derived data structures or 2) fail and leave the schema store
1088 // unchanged.
1089 // So, first, we create an empty temporary directory to build a new schema
1090 // store in.
1091 std::string temp_schema_store_dir_path = base_dir_ + "_temp";
1092 if (!filesystem_->DeleteDirectoryRecursively(
1093 temp_schema_store_dir_path.c_str())) {
1094 ICING_LOG(ERROR) << "Recursively deleting "
1095 << temp_schema_store_dir_path.c_str();
1096 return absl_ports::InternalError(
1097 "Unable to delete temp directory to prepare to build new schema "
1098 "store.");
1099 }
1100
1101 DestructibleDirectory temp_schema_store_dir(
1102 filesystem_, std::move(temp_schema_store_dir_path));
1103 if (!temp_schema_store_dir.is_valid()) {
1104 return absl_ports::InternalError(
1105 "Unable to create temp directory to build new schema store.");
1106 }
1107
1108 // Then we create our new schema store with the new schema.
1109 ICING_ASSIGN_OR_RETURN(
1110 std::unique_ptr<SchemaStore> new_schema_store,
1111 SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
1112 feature_flags_, std::move(new_schema)));
1113
1114 // Then we swap the new schema file + new derived files with the old files.
1115 if (!filesystem_->SwapFiles(base_dir_.c_str(),
1116 temp_schema_store_dir.dir().c_str())) {
1117 return absl_ports::InternalError(
1118 "Unable to apply new schema due to failed swap!");
1119 }
1120
1121 std::string old_base_dir = std::move(base_dir_);
1122 *this = std::move(*new_schema_store);
1123
1124 // After the std::move, the filepaths saved in this instance and in the
1125 // schema_file_ instance will still be the one from temp_schema_store_dir
1126 // even though they now point to files that are within old_base_dir.
1127 // Manually set them to the correct paths.
1128 base_dir_ = std::move(old_base_dir);
1129 schema_file_.SetSwappedFilepath(MakeSchemaFilename(base_dir_));
1130 if (overlay_schema_file_ != nullptr) {
1131 overlay_schema_file_->SetSwappedFilepath(
1132 MakeOverlaySchemaFilename(base_dir_));
1133 }
1134
1135 return libtextclassifier3::Status::OK;
1136 }
1137
1138 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const1139 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
1140 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1141 const auto& type_config_iter =
1142 type_config_map_.find(std::string(schema_type));
1143 if (type_config_iter == type_config_map_.end()) {
1144 return absl_ports::NotFoundError(
1145 absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
1146 }
1147 return &type_config_iter->second;
1148 }
1149
GetSchemaTypeId(std::string_view schema_type) const1150 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
1151 std::string_view schema_type) const {
1152 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1153 return schema_type_mapper_->Get(schema_type);
1154 }
1155
GetSchemaType(SchemaTypeId schema_type_id) const1156 libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType(
1157 SchemaTypeId schema_type_id) const {
1158 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1159 if (const auto it = reverse_schema_type_mapper_.find(schema_type_id);
1160 it == reverse_schema_type_mapper_.end()) {
1161 return absl_ports::InvalidArgumentError("Invalid schema type id");
1162 } else {
1163 return &it->second;
1164 }
1165 }
1166
1167 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const1168 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
1169 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1170 GetSchemaTypeId(schema_type));
1171 auto iter = schema_subtype_id_map_.find(schema_type_id);
1172 if (iter == schema_subtype_id_map_.end()) {
1173 // This should never happen, unless there is an inconsistency or IO error.
1174 return absl_ports::InternalError(absl_ports::StrCat(
1175 "Schema type '", schema_type, "' is not found in the subtype map."));
1176 }
1177 return &iter->second;
1178 }
1179
1180 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const1181 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
1182 SectionId section_id) const {
1183 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1184 return schema_type_manager_->section_manager().GetSectionMetadata(
1185 schema_type_id, section_id);
1186 }
1187
ExtractSections(const DocumentProto & document) const1188 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
1189 const DocumentProto& document) const {
1190 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1191 return schema_type_manager_->section_manager().ExtractSections(document);
1192 }
1193
1194 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const1195 SchemaStore::GetJoinablePropertyMetadata(
1196 SchemaTypeId schema_type_id, const std::string& property_path) const {
1197 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1198 return schema_type_manager_->joinable_property_manager()
1199 .GetJoinablePropertyMetadata(schema_type_id, property_path);
1200 }
1201
1202 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id) const1203 SchemaStore::GetJoinablePropertyMetadata(
1204 SchemaTypeId schema_type_id,
1205 JoinablePropertyId joinable_property_id) const {
1206 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1207 return schema_type_manager_->joinable_property_manager()
1208 .GetJoinablePropertyMetadata(schema_type_id, joinable_property_id);
1209 }
1210
1211 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const1212 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
1213 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1214 return schema_type_manager_->joinable_property_manager()
1215 .ExtractJoinableProperties(document);
1216 }
1217
1218 libtextclassifier3::StatusOr<std::optional<int>>
GetScorablePropertyIndex(SchemaTypeId schema_type_id,std::string_view property_path) const1219 SchemaStore::GetScorablePropertyIndex(SchemaTypeId schema_type_id,
1220 std::string_view property_path) const {
1221 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1222 if (!feature_flags_->enable_scorable_properties()) {
1223 return std::nullopt;
1224 }
1225 return scorable_property_manager_->GetScorablePropertyIndex(
1226 schema_type_id, property_path, type_config_map_,
1227 reverse_schema_type_mapper_);
1228 }
1229
1230 libtextclassifier3::StatusOr<
1231 const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*>
GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const1232 SchemaStore::GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const {
1233 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1234 if (!feature_flags_->enable_scorable_properties()) {
1235 return nullptr;
1236 }
1237 return scorable_property_manager_->GetOrderedScorablePropertyInfo(
1238 schema_type_id, type_config_map_, reverse_schema_type_mapper_);
1239 }
1240
PersistToDisk()1241 libtextclassifier3::Status SchemaStore::PersistToDisk() {
1242 if (!has_schema_successfully_set_) {
1243 return libtextclassifier3::Status::OK;
1244 }
1245 ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
1246 ICING_RETURN_IF_ERROR(UpdateChecksum());
1247 ICING_RETURN_IF_ERROR(header_->PersistToDisk());
1248 return libtextclassifier3::Status::OK;
1249 }
1250
GetStorageInfo() const1251 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
1252 SchemaStoreStorageInfoProto storage_info;
1253 int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1254 storage_info.set_schema_store_size(
1255 Filesystem::SanitizeFileSize(directory_size));
1256 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
1257 storage_info.set_num_schema_types(schema->types().size());
1258 int total_sections = 0;
1259 int num_types_sections_exhausted = 0;
1260 for (const SchemaTypeConfigProto& type : schema->types()) {
1261 auto sections_list_or =
1262 schema_type_manager_->section_manager().GetMetadataList(
1263 type.schema_type());
1264 if (!sections_list_or.ok()) {
1265 continue;
1266 }
1267 total_sections += sections_list_or.ValueOrDie()->size();
1268 if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
1269 ++num_types_sections_exhausted;
1270 }
1271 }
1272
1273 storage_info.set_num_total_sections(total_sections);
1274 storage_info.set_num_schema_types_sections_exhausted(
1275 num_types_sections_exhausted);
1276 return storage_info;
1277 }
1278
1279 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const1280 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
1281 return schema_type_manager_->section_manager().GetMetadataList(schema_type);
1282 }
1283
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const1284 bool SchemaStore::IsPropertyDefinedInSchema(
1285 SchemaTypeId schema_type_id, const std::string& property_path) const {
1286 auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
1287 if (schema_name_itr == reverse_schema_type_mapper_.end()) {
1288 return false;
1289 }
1290 const std::string* current_type_name = &schema_name_itr->second;
1291
1292 std::vector<std::string_view> property_path_parts =
1293 property_util::SplitPropertyPathExpr(property_path);
1294 for (int i = 0; i < property_path_parts.size(); ++i) {
1295 auto type_config_itr = type_config_map_.find(*current_type_name);
1296 if (type_config_itr == type_config_map_.end()) {
1297 return false;
1298 }
1299 std::string_view property_name = property_path_parts.at(i);
1300 const PropertyConfigProto* selected_property = nullptr;
1301 for (const PropertyConfigProto& property :
1302 type_config_itr->second.properties()) {
1303 if (property.property_name() == property_name) {
1304 selected_property = &property;
1305 break;
1306 }
1307 }
1308 if (selected_property == nullptr) {
1309 return false;
1310 }
1311 if (i == property_path_parts.size() - 1) {
1312 // We've found a property at the final part of the path.
1313 return true;
1314 }
1315 if (selected_property->data_type() !=
1316 PropertyConfigProto::DataType::DOCUMENT) {
1317 // If this isn't final part of the path, but this property isn't a
1318 // document, so we know that this path doesn't exist.
1319 return false;
1320 }
1321 current_type_name = &selected_property->schema_type();
1322 }
1323
1324 // We should never reach this point.
1325 return false;
1326 }
1327
GetDebugInfo() const1328 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
1329 const {
1330 SchemaDebugInfoProto debug_info;
1331 if (has_schema_successfully_set_) {
1332 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1333 *debug_info.mutable_schema() = *schema;
1334 }
1335 ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
1336 debug_info.set_crc(crc.Get());
1337 return debug_info;
1338 }
1339
1340 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const1341 SchemaStore::ExpandTypePropertyMasks(
1342 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
1343 const {
1344 std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
1345 for (const TypePropertyMask& type_field_mask : type_property_masks) {
1346 if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
1347 ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
1348 /*paths=*/{}};
1349 entry.paths.insert(type_field_mask.paths().begin(),
1350 type_field_mask.paths().end());
1351 result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
1352 } else {
1353 auto schema_type_ids_or =
1354 GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
1355 // If we can't find the SchemaTypeIds, just throw it away
1356 if (!schema_type_ids_or.ok()) {
1357 continue;
1358 }
1359 const std::unordered_set<SchemaTypeId>* schema_type_ids =
1360 schema_type_ids_or.ValueOrDie();
1361 for (SchemaTypeId schema_type_id : *schema_type_ids) {
1362 auto schema_type_name_iter =
1363 reverse_schema_type_mapper_.find(schema_type_id);
1364 if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
1365 // This should never happen, unless there is an inconsistency or IO
1366 // error.
1367 ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
1368 continue;
1369 }
1370
1371 auto iter = result_map.find(schema_type_id);
1372 if (iter == result_map.end()) {
1373 ExpandedTypePropertyMask entry{schema_type_name_iter->second,
1374 /*paths=*/{}};
1375 iter = result_map.insert({schema_type_id, std::move(entry)}).first;
1376 }
1377 iter->second.paths.insert(type_field_mask.paths().begin(),
1378 type_field_mask.paths().end());
1379 }
1380 }
1381 }
1382 std::vector<ExpandedTypePropertyMask> result;
1383 result.reserve(result_map.size());
1384 for (auto& entry : result_map) {
1385 result.push_back(std::move(entry.second));
1386 }
1387 return result;
1388 }
1389
1390 libtextclassifier3::StatusOr<
1391 std::unordered_map<std::string, std::vector<std::string>>>
ConstructBlobPropertyMap() const1392 SchemaStore::ConstructBlobPropertyMap() const {
1393 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1394 std::unordered_map<std::string, std::vector<std::string>> blob_property_map;
1395 for (const SchemaTypeConfigProto& type_config : schema->types()) {
1396 SchemaPropertyIterator iterator(type_config, type_config_map_);
1397 std::vector<std::string> blob_properties;
1398
1399 libtextclassifier3::Status status = iterator.Advance();
1400 while (status.ok()) {
1401 if (iterator.GetCurrentPropertyConfig().data_type() ==
1402 PropertyConfigProto::DataType::BLOB_HANDLE) {
1403 blob_properties.push_back(iterator.GetCurrentPropertyPath());
1404 }
1405 status = iterator.Advance();
1406 }
1407 if (!absl_ports::IsOutOfRange(status)) {
1408 return status;
1409 }
1410 if (!blob_properties.empty()) {
1411 blob_property_map.insert(
1412 {type_config.schema_type(), std::move(blob_properties)});
1413 }
1414 }
1415 return blob_property_map;
1416 }
1417
ValidateSchemaDatabase(const SchemaProto & new_schema,const std::string & database) const1418 libtextclassifier3::Status SchemaStore::ValidateSchemaDatabase(
1419 const SchemaProto& new_schema, const std::string& database) const {
1420 if (!feature_flags_->enable_schema_database() || new_schema.types().empty()) {
1421 return libtextclassifier3::Status::OK;
1422 }
1423
1424 // Loop through new_schema's types and validate it. The input SchemaProto
1425 // contains a list of SchemaTypeConfigProtos without deduplication. We need to
1426 // check that:
1427 // 1. All SchemaTypeConfigProtos have the same database value.
1428 // 2. The SchemaTypeConfigProtos's schema_type field is unique within both
1429 // new_schema, as well as the existing schema (recorded in
1430 // type_config_map_).
1431 for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
1432 // Check database consistency.
1433 if (database != type_config.database()) {
1434 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
1435 "Mismatch between the set schema request's database and the new "
1436 "schema types' database. Expected '",
1437 database, "' but got '", type_config.database(), "'."));
1438 }
1439
1440 // Check type name uniqueness. This is only necessary if there is a
1441 // pre-existing schema.
1442 if (has_schema_successfully_set_) {
1443 auto iter = type_config_map_.find(type_config.schema_type());
1444 if (iter != type_config_map_.end() &&
1445 database != iter->second.database()) {
1446 return absl_ports::AlreadyExistsError(
1447 absl_ports::StrCat("schema_type name: '", type_config.schema_type(),
1448 "' is already in use by a different database."));
1449 }
1450 }
1451 }
1452 return libtextclassifier3::Status::OK;
1453 }
1454
1455 libtextclassifier3::StatusOr<SchemaProto>
GetFullSchemaProtoWithUpdatedDb(SchemaProto input_database_schema,const std::string & database_to_update) const1456 SchemaStore::GetFullSchemaProtoWithUpdatedDb(
1457 SchemaProto input_database_schema,
1458 const std::string& database_to_update) const {
1459 if (!feature_flags_->enable_schema_database()) {
1460 // If the schema database is not enabled, the input schema is already the
1461 // full schema, so we don't need to do any merges.
1462 return input_database_schema;
1463 }
1464
1465 libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
1466 if (absl_ports::IsNotFound(schema_proto.status())) {
1467 // We don't have a pre-existing schema -- we can return the input database
1468 // schema as it's already the full schema.
1469 return input_database_schema;
1470 }
1471
1472 if (!schema_proto.ok()) {
1473 // Real error.
1474 return schema_proto.status();
1475 }
1476
1477 if (!has_schema_successfully_set_) {
1478 return absl_ports::InternalError(
1479 "Schema store was not initialized properly.");
1480 }
1481
1482 // At this point, we have a pre-existing schema -- we need to merge the
1483 // updated database with the existing schema.
1484 if (database_type_map_.size() == 1 &&
1485 database_type_map_.find(database_to_update) != database_type_map_.end()) {
1486 // No other databases in the schema -- we can return the input database
1487 // schema.
1488 return input_database_schema;
1489 }
1490
1491 const SchemaProto* existing_schema = schema_proto.ValueOrDie();
1492 SchemaProto full_schema;
1493
1494 // 1. Add types from the existing schema, replacing existing types with the
1495 // input types if the database is the one being updated by the input schema.
1496 // - For database_to_update, we replace the existing types with the input
1497 // types. Any existing type not included in input_database_schema is
1498 // deleted.
1499 // - If there are more input types than existing types for database_to_update,
1500 // the rest of the input types are appended to the end of the full_schema.
1501 // - If there are fewer input types than existing types for
1502 // database_to_update, we shift forward all existing types that appear after
1503 // the last input type.
1504 // - For existing types from other databases, we preserve the existing order
1505 // after adding to full_schema. Note that the type-ids of existing types
1506 // might still change if some types are deleted in the database_to_update as
1507 // this will cause all subsequent types ids to shift forward.
1508 int input_schema_index = 0, existing_schema_index = 0;
1509 while (input_schema_index < input_database_schema.types().size() &&
1510 existing_schema_index < existing_schema->types().size()) {
1511 const SchemaTypeConfigProto& existing_type_config =
1512 existing_schema->types(existing_schema_index);
1513 SchemaTypeConfigProto& input_type_config =
1514 *input_database_schema.mutable_types(input_schema_index);
1515
1516 if (existing_type_config.database() == database_to_update) {
1517 // If the database is the one being updated by the input schema, replace
1518 // the existing type with a type from the input schema.
1519 *full_schema.add_types() = std::move(input_type_config);
1520 ++input_schema_index;
1521 } else {
1522 *full_schema.add_types() = existing_type_config;
1523 }
1524 ++existing_schema_index;
1525 }
1526
1527 // 2. Append remaining types to the end of the SchemaProto.
1528 for (; input_schema_index < input_database_schema.types().size();
1529 ++input_schema_index) {
1530 // Case 1: Append all remaining types from the input schema. This happens
1531 // when more types are added in input_database_schema than what's in the
1532 // existing schema. In this case, we've used up the space for the database
1533 // in the existing schema, so we can just append the rest of the types to
1534 // the end.
1535 SchemaTypeConfigProto& input_type_config =
1536 *input_database_schema.mutable_types(input_schema_index);
1537 *full_schema.add_types() = std::move(input_type_config);
1538 }
1539 for (; existing_schema_index < existing_schema->types().size();
1540 ++existing_schema_index) {
1541 // Case 2: Add remaining types from the existing schema, but skip the ones
1542 // that are from input_database, since existing types from input_database
1543 // are replaced with input_database_schema.
1544 if (existing_schema->types(existing_schema_index).database() !=
1545 database_to_update) {
1546 *full_schema.add_types() = existing_schema->types(existing_schema_index);
1547 }
1548 }
1549
1550 return full_schema;
1551 }
1552
1553 } // namespace lib
1554 } // namespace icing
1555