1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/schema-store.h"
16
17 #include <algorithm>
18 #include <cinttypes>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/destructible-directory.h"
34 #include "icing/file/file-backed-proto.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/version-util.h"
37 #include "icing/proto/debug.pb.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/logging.pb.h"
40 #include "icing/proto/schema.pb.h"
41 #include "icing/proto/search.pb.h"
42 #include "icing/proto/storage.pb.h"
43 #include "icing/schema/backup-schema-producer.h"
44 #include "icing/schema/joinable-property.h"
45 #include "icing/schema/property-util.h"
46 #include "icing/schema/schema-type-manager.h"
47 #include "icing/schema/schema-util.h"
48 #include "icing/schema/section.h"
49 #include "icing/store/document-filter-data.h"
50 #include "icing/store/dynamic-trie-key-mapper.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/logging.h"
53 #include "icing/util/status-macros.h"
54
55 namespace icing {
56 namespace lib {
57
58 namespace {
59
// Names of the files that SchemaStore keeps directly under its base directory.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// Storage budget for the schema type mapper. A DynamicTrieKeyMapper stores its
// data across 3 arrays internally; giving each array 128KiB means the whole
// mapper requires 384KiB.
constexpr int32_t kSchemaTypeMapperArrayCount = 3;
constexpr int32_t kSchemaTypeMapperArraySize = 128 * 1024;  // 128 KiB
constexpr int32_t kSchemaTypeMapperMaxSize =
    kSchemaTypeMapperArrayCount * kSchemaTypeMapperArraySize;  // 384 KiB
69
MakeHeaderFilename(const std::string & base_dir)70 std::string MakeHeaderFilename(const std::string& base_dir) {
71 return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
72 }
73
MakeSchemaFilename(const std::string & base_dir)74 std::string MakeSchemaFilename(const std::string& base_dir) {
75 return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
76 }
77
MakeOverlaySchemaFilename(const std::string & base_dir)78 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
79 return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
80 }
81
MakeSchemaTypeMapperFilename(const std::string & base_dir)82 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
83 return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
84 }
85
86 // Assuming that SchemaTypeIds are assigned to schema types based on their order
87 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
88 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)89 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
90 const SchemaProto& old_schema, const SchemaProto& new_schema) {
91 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
92
93 std::unordered_map<std::string, int> old_types_and_index;
94 for (int i = 0; i < old_schema.types_size(); ++i) {
95 old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
96 }
97
98 std::unordered_map<std::string, int> new_types_and_index;
99 for (int i = 0; i < new_schema.types_size(); ++i) {
100 new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
101 }
102
103 for (const auto& old_type_index : old_types_and_index) {
104 const auto& iter = new_types_and_index.find(old_type_index.first);
105 // We only care if the type exists in both the old and new schema. If the
106 // type has been deleted, then it'll be captured in
107 // SetSchemaResult.schema_types_deleted*. If the type has been added in the
108 // new schema then we also don't care because nothing needs to be updated.
109 if (iter != new_types_and_index.end()) {
110 // Since the SchemaTypeId of the schema type is just the index of it in
111 // the SchemaProto, compare the index and save it if it's not the same
112 if (old_type_index.second != iter->second) {
113 old_schema_type_ids_changed.emplace(old_type_index.second);
114 }
115 }
116 }
117
118 return old_schema_type_ids_changed;
119 }
120
121 } // namespace
122
123 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,const std::string & path)124 SchemaStore::Header::Read(const Filesystem* filesystem,
125 const std::string& path) {
126 Header header;
127 ScopedFd sfd(filesystem->OpenForRead(path.c_str()));
128 if (!sfd.is_valid()) {
129 return absl_ports::NotFoundError("SchemaStore header doesn't exist");
130 }
131
132 // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
133 int64_t file_size = filesystem->GetFileSize(sfd.get());
134 if (file_size == sizeof(LegacyHeader)) {
135 LegacyHeader legacy_header;
136 if (!filesystem->Read(path.c_str(), &legacy_header,
137 sizeof(legacy_header))) {
138 return absl_ports::InternalError(
139 absl_ports::StrCat("Couldn't read: ", path));
140 }
141 if (legacy_header.magic != Header::kMagic) {
142 return absl_ports::InternalError(
143 absl_ports::StrCat("Invalid header kMagic for file: ", path));
144 }
145 header.set_checksum(legacy_header.checksum);
146 } else if (file_size == sizeof(Header)) {
147 if (!filesystem->Read(path.c_str(), &header, sizeof(header))) {
148 return absl_ports::InternalError(
149 absl_ports::StrCat("Couldn't read: ", path));
150 }
151 if (header.magic() != Header::kMagic) {
152 return absl_ports::InternalError(
153 absl_ports::StrCat("Invalid header kMagic for file: ", path));
154 }
155 } else {
156 int legacy_header_size = sizeof(LegacyHeader);
157 int header_size = sizeof(Header);
158 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
159 "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
160 legacy_header_size, header_size));
161 }
162 return header;
163 }
164
Write(const Filesystem * filesystem,const std::string & path)165 libtextclassifier3::Status SchemaStore::Header::Write(
166 const Filesystem* filesystem, const std::string& path) {
167 ScopedFd scoped_fd(filesystem->OpenForWrite(path.c_str()));
168 // This should overwrite the header.
169 if (!scoped_fd.is_valid() ||
170 !filesystem->Write(scoped_fd.get(), this, sizeof(*this)) ||
171 !filesystem->DataSync(scoped_fd.get())) {
172 return absl_ports::InternalError(
173 absl_ports::StrCat("Failed to write SchemaStore header: ", path));
174 }
175 return libtextclassifier3::Status::OK;
176 }
177
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,InitializeStatsProto * initialize_stats)178 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
179 const Filesystem* filesystem, const std::string& base_dir,
180 const Clock* clock, InitializeStatsProto* initialize_stats) {
181 ICING_RETURN_ERROR_IF_NULL(filesystem);
182 ICING_RETURN_ERROR_IF_NULL(clock);
183
184 if (!filesystem->DirectoryExists(base_dir.c_str())) {
185 return absl_ports::FailedPreconditionError(
186 "Schema store base directory does not exist!");
187 }
188 std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
189 new SchemaStore(filesystem, base_dir, clock));
190 ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
191 return schema_store;
192 }
193
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,SchemaProto schema)194 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
195 const Filesystem* filesystem, const std::string& base_dir,
196 const Clock* clock, SchemaProto schema) {
197 ICING_RETURN_ERROR_IF_NULL(filesystem);
198 ICING_RETURN_ERROR_IF_NULL(clock);
199
200 if (!filesystem->DirectoryExists(base_dir.c_str())) {
201 return absl_ports::FailedPreconditionError(
202 "Schema store base directory does not exist!");
203 }
204 std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
205 new SchemaStore(filesystem, base_dir, clock));
206 ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
207 return schema_store;
208 }
209
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)210 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
211 const Filesystem* filesystem, const std::string& base_dir, Header& header) {
212 std::string header_filename = MakeHeaderFilename(base_dir);
213 if (header.overlay_created()) {
214 header.SetOverlayInfo(
215 /*overlay_created=*/false,
216 /*min_overlay_version_compatibility=*/ std::numeric_limits<
217 int32_t>::max());
218 ICING_RETURN_IF_ERROR(header.Write(filesystem, header_filename));
219 }
220 std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
221 if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
222 return absl_ports::InternalError(
223 "Unable to delete stale schema overlay file.");
224 }
225 return libtextclassifier3::Status::OK;
226 }
227
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)228 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
229 const Filesystem* filesystem, const std::string& base_dir,
230 version_util::StateChange version_state_change, int32_t new_version) {
231 if (!filesystem->DirectoryExists(base_dir.c_str())) {
232 // Situations when schema store directory doesn't exist:
233 // - Initializing new Icing instance: don't have to do anything now. The
234 // directory will be created later.
235 // - Lose schema store: there is nothing we can do now. The logic will be
236 // handled later by initializing.
237 //
238 // Therefore, just simply return OK here.
239 return libtextclassifier3::Status::OK;
240 }
241
242 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
243 if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
244 // The overlay doesn't exist. So there should be nothing particularly
245 // interesting to worry about.
246 return libtextclassifier3::Status::OK;
247 }
248
249 std::string header_filename = MakeHeaderFilename(base_dir);
250 libtextclassifier3::StatusOr<Header> header_or;
251 switch (version_state_change) {
252 // No necessary actions for normal upgrades or no version change. The data
253 // that was produced by the previous version is fully compatible with this
254 // version and there's no stale data for us to clean up.
255 // The same is true for a normal rollforward. A normal rollforward implies
256 // that the previous version was one that understood the concept of the
257 // overlay schema and would have already discarded it if it was unusable.
258 case version_util::StateChange::kVersionZeroUpgrade:
259 // fallthrough
260 case version_util::StateChange::kUpgrade:
261 // fallthrough
262 case version_util::StateChange::kRollForward:
263 // fallthrough
264 case version_util::StateChange::kCompatible:
265 return libtextclassifier3::Status::OK;
266 case version_util::StateChange::kVersionZeroRollForward:
267 // We've rolled forward. The schema overlay file, if it exists, is
268 // possibly stale. We must throw it out.
269 header_or = Header::Read(filesystem, header_filename);
270 if (!header_or.ok()) {
271 return header_or.status();
272 }
273 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
274 header_or.ValueOrDie());
275 case version_util::StateChange::kRollBack:
276 header_or = Header::Read(filesystem, header_filename);
277 if (!header_or.ok()) {
278 return header_or.status();
279 }
280 if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
281 new_version) {
282 // We've been rolled back, but the overlay schema claims that it
283 // supports this version. So we can safely return.
284 return libtextclassifier3::Status::OK;
285 }
286 // We've been rolled back to a version that the overlay schema doesn't
287 // support. We must throw it out.
288 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
289 header_or.ValueOrDie());
290 case version_util::StateChange::kUndetermined:
291 // It's not clear what version we're on, but the base schema should always
292 // be safe to use. Throw out the overlay.
293 header_or = Header::Read(filesystem, header_filename);
294 if (!header_or.ok()) {
295 return header_or.status();
296 }
297 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
298 header_or.ValueOrDie());
299 }
300 return libtextclassifier3::Status::OK;
301 }
302
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)303 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
304 const Filesystem* filesystem, const std::string& base_dir) {
305 // Schema type mapper
306 return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
307 *filesystem, MakeSchemaTypeMapperFilename(base_dir));
308 }
309
SchemaStore(const Filesystem * filesystem,std::string base_dir,const Clock * clock)310 SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
311 const Clock* clock)
312 : filesystem_(filesystem),
313 base_dir_(std::move(base_dir)),
314 clock_(clock),
315 schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
316 *filesystem, MakeSchemaFilename(base_dir_))) {}
317
~SchemaStore()318 SchemaStore::~SchemaStore() {
319 if (has_schema_successfully_set_ && schema_file_ != nullptr &&
320 schema_type_mapper_ != nullptr && schema_type_manager_ != nullptr) {
321 if (!PersistToDisk().ok()) {
322 ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
323 }
324 }
325 }
326
Initialize(SchemaProto new_schema)327 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
328 ICING_RETURN_IF_ERROR(LoadSchema());
329 if (!absl_ports::IsNotFound(GetSchema().status())) {
330 return absl_ports::FailedPreconditionError(
331 "Incorrectly tried to initialize schema store with a new schema, when "
332 "one is already set!");
333 }
334 ICING_RETURN_IF_ERROR(schema_file_->Write(
335 std::make_unique<SchemaProto>(std::move(new_schema))));
336 return InitializeInternal(/*create_overlay_if_necessary=*/true,
337 /*initialize_stats=*/nullptr);
338 }
339
Initialize(InitializeStatsProto * initialize_stats)340 libtextclassifier3::Status SchemaStore::Initialize(
341 InitializeStatsProto* initialize_stats) {
342 ICING_RETURN_IF_ERROR(LoadSchema());
343 auto schema_proto_or = GetSchema();
344 if (absl_ports::IsNotFound(schema_proto_or.status())) {
345 // Don't have an existing schema proto, that's fine
346 return libtextclassifier3::Status::OK;
347 } else if (!schema_proto_or.ok()) {
348 // Real error when trying to read the existing schema
349 return schema_proto_or.status();
350 }
351 return InitializeInternal(/*create_overlay_if_necessary=*/false,
352 initialize_stats);
353 }
354
LoadSchema()355 libtextclassifier3::Status SchemaStore::LoadSchema() {
356 libtextclassifier3::StatusOr<Header> header_or =
357 Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
358 bool header_exists = false;
359 if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
360 return header_or.status();
361 } else if (!header_or.ok()) {
362 header_ = std::make_unique<Header>();
363 } else {
364 header_exists = true;
365 header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
366 }
367
368 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
369 bool overlay_schema_file_exists =
370 filesystem_->FileExists(overlay_schema_filename.c_str());
371
372 libtextclassifier3::Status base_schema_state = schema_file_->Read().status();
373 if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
374 return base_schema_state;
375 }
376
377 // There are three valid cases:
378 // 1. Everything is missing. This is an empty schema store.
379 if (!base_schema_state.ok() && !overlay_schema_file_exists &&
380 !header_exists) {
381 return libtextclassifier3::Status::OK;
382 }
383
384 // 2. There never was a overlay schema. The header exists, the base schema
385 // exists and the header says the overlay schema shouldn't exist
386 if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
387 !header_->overlay_created()) {
388 // Nothing else to do. Just return safely.
389 return libtextclassifier3::Status::OK;
390 }
391
392 // 3. There is an overlay schema and a base schema and a header. The header
393 // says that the overlay schema should exist.
394 if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
395 header_->overlay_created()) {
396 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
397 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
398 return libtextclassifier3::Status::OK;
399 }
400
401 // Something has gone wrong. We've lost part of the schema ground truth.
402 // Return an error.
403 bool overlay_created = header_->overlay_created();
404 bool base_schema_exists = base_schema_state.ok();
405 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
406 "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
407 "base schema exists: %d, overlay_schema_exists: %d",
408 header_exists, overlay_created, base_schema_exists,
409 overlay_schema_file_exists));
410 }
411
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)412 libtextclassifier3::Status SchemaStore::InitializeInternal(
413 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
414 if (!InitializeDerivedFiles().ok()) {
415 ICING_VLOG(3)
416 << "Couldn't find derived files or failed to initialize them, "
417 "regenerating derived files for SchemaStore.";
418 std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
419 if (initialize_stats != nullptr) {
420 initialize_stats->set_schema_store_recovery_cause(
421 InitializeStatsProto::IO_ERROR);
422 }
423 ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
424 if (initialize_stats != nullptr) {
425 initialize_stats->set_schema_store_recovery_latency_ms(
426 regenerate_timer->GetElapsedMilliseconds());
427 }
428 }
429
430 if (initialize_stats != nullptr) {
431 initialize_stats->set_num_schema_types(type_config_map_.size());
432 }
433 has_schema_successfully_set_ = true;
434
435 return libtextclassifier3::Status::OK;
436 }
437
InitializeDerivedFiles()438 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
439 ICING_ASSIGN_OR_RETURN(
440 schema_type_mapper_,
441 DynamicTrieKeyMapper<SchemaTypeId>::Create(
442 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
443 kSchemaTypeMapperMaxSize));
444
445 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
446 if (checksum.Get() != header_->checksum()) {
447 return absl_ports::InternalError(
448 "Combined checksum of SchemaStore was inconsistent");
449 }
450
451 BuildInMemoryCache();
452 return libtextclassifier3::Status::OK;
453 }
454
RegenerateDerivedFiles(bool create_overlay_if_necessary)455 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
456 bool create_overlay_if_necessary) {
457 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
458
459 ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
460
461 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
462 // Assign a SchemaTypeId to the type
463 ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
464 type_config.schema_type(), schema_type_mapper_->num_keys()));
465 }
466 BuildInMemoryCache();
467
468 if (create_overlay_if_necessary) {
469 ICING_ASSIGN_OR_RETURN(
470 BackupSchemaProducer producer,
471 BackupSchemaProducer::Create(*schema_proto,
472 schema_type_manager_->section_manager()));
473
474 if (producer.is_backup_necessary()) {
475 SchemaProto base_schema = std::move(producer).Produce();
476
477 // The overlay schema should be written to the overlay file location.
478 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
479 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
480 auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
481 ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
482
483 // The base schema should be written to the original file
484 auto base_schema_ptr =
485 std::make_unique<SchemaProto>(std::move(base_schema));
486 ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr)));
487
488 header_->SetOverlayInfo(
489 /*overlay_created=*/true,
490 /*min_overlay_version_compatibility=*/version_util::kVersionOne);
491 // Rebuild in memory data - references to the old schema will be invalid
492 // now.
493 BuildInMemoryCache();
494 }
495 }
496
497 // Write the header
498 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
499 header_->set_checksum(checksum.Get());
500 return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
501 }
502
BuildInMemoryCache()503 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
504 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
505 ICING_ASSIGN_OR_RETURN(
506 SchemaUtil::InheritanceMap inheritance_map,
507 SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
508
509 reverse_schema_type_mapper_.clear();
510 type_config_map_.clear();
511 schema_subtype_id_map_.clear();
512 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
513 std::string_view type_name = type_config.schema_type();
514 ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
515 schema_type_mapper_->Get(type_name));
516
517 // Build reverse_schema_type_mapper_
518 reverse_schema_type_mapper_.insert({type_id, std::string(type_name)});
519
520 // Build type_config_map_
521 type_config_map_.insert({std::string(type_name), type_config});
522
523 // Build schema_subtype_id_map_
524 std::unordered_set<SchemaTypeId>& subtype_id_set =
525 schema_subtype_id_map_[type_id];
526 // Find all child types
527 auto child_types_names = inheritance_map.find(type_name);
528 if (child_types_names != inheritance_map.end()) {
529 subtype_id_set.reserve(child_types_names->second.size() + 1);
530 for (const auto& [child_type_name, is_direct_child] :
531 child_types_names->second) {
532 ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
533 schema_type_mapper_->Get(child_type_name));
534 subtype_id_set.insert(child_type_id);
535 }
536 }
537 // Every type is a subtype of itself.
538 subtype_id_set.insert(type_id);
539 }
540
541 // Build schema_type_manager_
542 ICING_ASSIGN_OR_RETURN(
543 schema_type_manager_,
544 SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
545 return libtextclassifier3::Status::OK;
546 }
547
ResetSchemaTypeMapper()548 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
549 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
550 schema_type_mapper_.reset();
551 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
552 // that can support error logging.
553 libtextclassifier3::Status status =
554 DynamicTrieKeyMapper<SchemaTypeId>::Delete(
555 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
556 if (!status.ok()) {
557 ICING_LOG(ERROR) << status.error_message()
558 << "Failed to delete old schema_type mapper";
559 return status;
560 }
561 ICING_ASSIGN_OR_RETURN(
562 schema_type_mapper_,
563 DynamicTrieKeyMapper<SchemaTypeId>::Create(
564 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
565 kSchemaTypeMapperMaxSize));
566
567 return libtextclassifier3::Status::OK;
568 }
569
ComputeChecksum() const570 libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
571 // Base schema checksum
572 auto schema_proto_or = schema_file_->Read();
573 if (absl_ports::IsNotFound(schema_proto_or.status())) {
574 return Crc32();
575 }
576 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, schema_proto_or);
577 Crc32 schema_checksum;
578 schema_checksum.Append(schema_proto->SerializeAsString());
579
580 Crc32 overlay_schema_checksum;
581 if (overlay_schema_file_ != nullptr) {
582 auto schema_proto_or = schema_file_->Read();
583 if (schema_proto_or.ok()) {
584 ICING_ASSIGN_OR_RETURN(schema_proto, schema_proto_or);
585 overlay_schema_checksum.Append(schema_proto->SerializeAsString());
586 }
587 }
588
589 ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
590 schema_type_mapper_->ComputeChecksum());
591
592 Crc32 total_checksum;
593 total_checksum.Append(std::to_string(schema_checksum.Get()));
594 if (overlay_schema_file_ != nullptr) {
595 total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
596 }
597 total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
598
599 return total_checksum;
600 }
601
GetSchema() const602 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
603 const {
604 if (overlay_schema_file_ != nullptr) {
605 return overlay_schema_file_->Read();
606 }
607 return schema_file_->Read();
608 }
609
610 // TODO(cassiewang): Consider removing this definition of SetSchema if it's not
611 // needed by production code. It's currently being used by our tests, but maybe
612 // it's trivial to change our test code to also use the
613 // SetSchema(SchemaProto&& new_schema)
614 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)615 SchemaStore::SetSchema(const SchemaProto& new_schema,
616 bool ignore_errors_and_delete_documents,
617 bool allow_circular_schema_definitions) {
618 return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents,
619 allow_circular_schema_definitions);
620 }
621
622 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)623 SchemaStore::SetSchema(SchemaProto&& new_schema,
624 bool ignore_errors_and_delete_documents,
625 bool allow_circular_schema_definitions) {
626 ICING_ASSIGN_OR_RETURN(
627 SchemaUtil::DependentMap new_dependent_map,
628 SchemaUtil::Validate(new_schema, allow_circular_schema_definitions));
629
630 SetSchemaResult result;
631
632 auto schema_proto_or = GetSchema();
633 if (absl_ports::IsNotFound(schema_proto_or.status())) {
634 // We don't have a pre-existing schema, so anything is valid.
635 result.success = true;
636 for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
637 result.schema_types_new_by_name.insert(type_config.schema_type());
638 }
639 } else if (!schema_proto_or.ok()) {
640 // Real error
641 return schema_proto_or.status();
642 } else {
643 // At this point, we're guaranteed that we have a schema.
644 const SchemaProto old_schema = *schema_proto_or.ValueOrDie();
645
646 // Assume we can set the schema unless proven otherwise.
647 result.success = true;
648
649 if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
650 // Same schema as before. No need to update anything
651 return result;
652 }
653
654 // Different schema, track the differences and see if we can still write it
655 SchemaUtil::SchemaDelta schema_delta =
656 SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
657 new_dependent_map);
658
659 result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
660 result.schema_types_changed_fully_compatible_by_name =
661 std::move(schema_delta.schema_types_changed_fully_compatible);
662 result.schema_types_index_incompatible_by_name =
663 std::move(schema_delta.schema_types_index_incompatible);
664 result.schema_types_join_incompatible_by_name =
665 std::move(schema_delta.schema_types_join_incompatible);
666
667 for (const auto& schema_type : schema_delta.schema_types_deleted) {
668 // We currently don't support deletions, so mark this as not possible.
669 // This will change once we allow force-set schemas.
670 result.success = false;
671
672 result.schema_types_deleted_by_name.emplace(schema_type);
673
674 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
675 GetSchemaTypeId(schema_type));
676 result.schema_types_deleted_by_id.emplace(schema_type_id);
677 }
678
679 for (const auto& schema_type : schema_delta.schema_types_incompatible) {
680 // We currently don't support incompatible schemas, so mark this as
681 // not possible. This will change once we allow force-set schemas.
682 result.success = false;
683
684 result.schema_types_incompatible_by_name.emplace(schema_type);
685
686 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
687 GetSchemaTypeId(schema_type));
688 result.schema_types_incompatible_by_id.emplace(schema_type_id);
689 }
690
691 // SchemaTypeIds changing is fine, we can update the DocumentStore
692 result.old_schema_type_ids_changed =
693 SchemaTypeIdsChanged(old_schema, new_schema);
694 }
695
696 // We can force set the schema if the caller has told us to ignore any errors
697 result.success = result.success || ignore_errors_and_delete_documents;
698
699 if (result.success) {
700 ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(new_schema)));
701 has_schema_successfully_set_ = true;
702 }
703
704 return result;
705 }
706
ApplySchemaChange(SchemaProto new_schema)707 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
708 SchemaProto new_schema) {
709 // We need to ensure that we either 1) successfully set the schema and
710 // update all derived data structures or 2) fail and leave the schema store
711 // unchanged.
712 // So, first, we create an empty temporary directory to build a new schema
713 // store in.
714 std::string temp_schema_store_dir_path = base_dir_ + "_temp";
715 if (!filesystem_->DeleteDirectoryRecursively(
716 temp_schema_store_dir_path.c_str())) {
717 ICING_LOG(ERROR) << "Recursively deleting "
718 << temp_schema_store_dir_path.c_str();
719 return absl_ports::InternalError(
720 "Unable to delete temp directory to prepare to build new schema "
721 "store.");
722 }
723
724 DestructibleDirectory temp_schema_store_dir(
725 filesystem_, std::move(temp_schema_store_dir_path));
726 if (!temp_schema_store_dir.is_valid()) {
727 return absl_ports::InternalError(
728 "Unable to create temp directory to build new schema store.");
729 }
730
731 // Then we create our new schema store with the new schema.
732 ICING_ASSIGN_OR_RETURN(
733 std::unique_ptr<SchemaStore> new_schema_store,
734 SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
735 std::move(new_schema)));
736
737 // Then we swap the new schema file + new derived files with the old files.
738 if (!filesystem_->SwapFiles(base_dir_.c_str(),
739 temp_schema_store_dir.dir().c_str())) {
740 return absl_ports::InternalError(
741 "Unable to apply new schema due to failed swap!");
742 }
743
744 std::string old_base_dir = std::move(base_dir_);
745 *this = std::move(*new_schema_store);
746
747 // After the std::move, the filepaths saved in this instance and in the
748 // schema_file_ instance will still be the one from temp_schema_store_dir
749 // even though they now point to files that are within old_base_dir.
750 // Manually set them to the correct paths.
751 base_dir_ = std::move(old_base_dir);
752 schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
753 if (overlay_schema_file_ != nullptr) {
754 overlay_schema_file_->SetSwappedFilepath(
755 MakeOverlaySchemaFilename(base_dir_));
756 }
757
758 return libtextclassifier3::Status::OK;
759 }
760
761 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const762 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
763 ICING_RETURN_IF_ERROR(CheckSchemaSet());
764 const auto& type_config_iter =
765 type_config_map_.find(std::string(schema_type));
766 if (type_config_iter == type_config_map_.end()) {
767 return absl_ports::NotFoundError(
768 absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
769 }
770 return &type_config_iter->second;
771 }
772
GetSchemaTypeId(std::string_view schema_type) const773 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
774 std::string_view schema_type) const {
775 ICING_RETURN_IF_ERROR(CheckSchemaSet());
776 return schema_type_mapper_->Get(schema_type);
777 }
778
779 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const780 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
781 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
782 GetSchemaTypeId(schema_type));
783 auto iter = schema_subtype_id_map_.find(schema_type_id);
784 if (iter == schema_subtype_id_map_.end()) {
785 // This should never happen, unless there is an inconsistency or IO error.
786 return absl_ports::InternalError(absl_ports::StrCat(
787 "Schema type '", schema_type, "' is not found in the subtype map."));
788 }
789 return &iter->second;
790 }
791
792 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const793 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
794 SectionId section_id) const {
795 ICING_RETURN_IF_ERROR(CheckSchemaSet());
796 return schema_type_manager_->section_manager().GetSectionMetadata(
797 schema_type_id, section_id);
798 }
799
ExtractSections(const DocumentProto & document) const800 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
801 const DocumentProto& document) const {
802 ICING_RETURN_IF_ERROR(CheckSchemaSet());
803 return schema_type_manager_->section_manager().ExtractSections(document);
804 }
805
806 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const807 SchemaStore::GetJoinablePropertyMetadata(
808 SchemaTypeId schema_type_id, const std::string& property_path) const {
809 ICING_RETURN_IF_ERROR(CheckSchemaSet());
810 return schema_type_manager_->joinable_property_manager()
811 .GetJoinablePropertyMetadata(schema_type_id, property_path);
812 }
813
814 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const815 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
816 ICING_RETURN_IF_ERROR(CheckSchemaSet());
817 return schema_type_manager_->joinable_property_manager()
818 .ExtractJoinableProperties(document);
819 }
820
PersistToDisk()821 libtextclassifier3::Status SchemaStore::PersistToDisk() {
822 if (!has_schema_successfully_set_) {
823 return libtextclassifier3::Status::OK;
824 }
825 ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
826 // Write the header
827 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
828 header_->set_checksum(checksum.Get());
829 return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
830 }
831
GetStorageInfo() const832 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
833 SchemaStoreStorageInfoProto storage_info;
834 int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
835 storage_info.set_schema_store_size(
836 Filesystem::SanitizeFileSize(directory_size));
837 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
838 storage_info.set_num_schema_types(schema->types_size());
839 int total_sections = 0;
840 int num_types_sections_exhausted = 0;
841 for (const SchemaTypeConfigProto& type : schema->types()) {
842 auto sections_list_or =
843 schema_type_manager_->section_manager().GetMetadataList(
844 type.schema_type());
845 if (!sections_list_or.ok()) {
846 continue;
847 }
848 total_sections += sections_list_or.ValueOrDie()->size();
849 if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
850 ++num_types_sections_exhausted;
851 }
852 }
853
854 storage_info.set_num_total_sections(total_sections);
855 storage_info.set_num_schema_types_sections_exhausted(
856 num_types_sections_exhausted);
857 return storage_info;
858 }
859
860 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const861 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
862 return schema_type_manager_->section_manager().GetMetadataList(schema_type);
863 }
864
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const865 bool SchemaStore::IsPropertyDefinedInSchema(
866 SchemaTypeId schema_type_id, const std::string& property_path) const {
867 auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
868 if (schema_name_itr == reverse_schema_type_mapper_.end()) {
869 return false;
870 }
871 const std::string* current_type_name = &schema_name_itr->second;
872
873 std::vector<std::string_view> property_path_parts =
874 property_util::SplitPropertyPathExpr(property_path);
875 for (int i = 0; i < property_path_parts.size(); ++i) {
876 auto type_config_itr = type_config_map_.find(*current_type_name);
877 if (type_config_itr == type_config_map_.end()) {
878 return false;
879 }
880 std::string_view property_name = property_path_parts.at(i);
881 const PropertyConfigProto* selected_property = nullptr;
882 for (const PropertyConfigProto& property :
883 type_config_itr->second.properties()) {
884 if (property.property_name() == property_name) {
885 selected_property = &property;
886 break;
887 }
888 }
889 if (selected_property == nullptr) {
890 return false;
891 }
892 if (i == property_path_parts.size() - 1) {
893 // We've found a property at the final part of the path.
894 return true;
895 }
896 if (selected_property->data_type() !=
897 PropertyConfigProto::DataType::DOCUMENT) {
898 // If this isn't final part of the path, but this property isn't a
899 // document, so we know that this path doesn't exist.
900 return false;
901 }
902 current_type_name = &selected_property->schema_type();
903 }
904
905 // We should never reach this point.
906 return false;
907 }
908
GetDebugInfo() const909 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
910 const {
911 SchemaDebugInfoProto debug_info;
912 if (has_schema_successfully_set_) {
913 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
914 *debug_info.mutable_schema() = *schema;
915 }
916 ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
917 debug_info.set_crc(crc.Get());
918 return debug_info;
919 }
920
921 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const922 SchemaStore::ExpandTypePropertyMasks(
923 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
924 const {
925 std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
926 for (const TypePropertyMask& type_field_mask : type_property_masks) {
927 if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
928 ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
929 /*paths=*/{}};
930 entry.paths.insert(type_field_mask.paths().begin(),
931 type_field_mask.paths().end());
932 result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
933 } else {
934 auto schema_type_ids_or =
935 GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
936 // If we can't find the SchemaTypeIds, just throw it away
937 if (!schema_type_ids_or.ok()) {
938 continue;
939 }
940 const std::unordered_set<SchemaTypeId>* schema_type_ids =
941 schema_type_ids_or.ValueOrDie();
942 for (SchemaTypeId schema_type_id : *schema_type_ids) {
943 auto schema_type_name_iter =
944 reverse_schema_type_mapper_.find(schema_type_id);
945 if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
946 // This should never happen, unless there is an inconsistency or IO
947 // error.
948 ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
949 continue;
950 }
951
952 auto iter = result_map.find(schema_type_id);
953 if (iter == result_map.end()) {
954 ExpandedTypePropertyMask entry{schema_type_name_iter->second,
955 /*paths=*/{}};
956 iter = result_map.insert({schema_type_id, std::move(entry)}).first;
957 }
958 iter->second.paths.insert(type_field_mask.paths().begin(),
959 type_field_mask.paths().end());
960 }
961 }
962 }
963 std::vector<ExpandedTypePropertyMask> result;
964 result.reserve(result_map.size());
965 for (auto& entry : result_map) {
966 result.push_back(std::move(entry.second));
967 }
968 return result;
969 }
970
971 } // namespace lib
972 } // namespace icing
973