1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/schema-store.h"
16
17 #include <algorithm>
18 #include <cinttypes>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/destructible-directory.h"
34 #include "icing/file/file-backed-proto.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/version-util.h"
37 #include "icing/proto/debug.pb.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/logging.pb.h"
40 #include "icing/proto/schema.pb.h"
41 #include "icing/proto/search.pb.h"
42 #include "icing/proto/storage.pb.h"
43 #include "icing/schema/backup-schema-producer.h"
44 #include "icing/schema/joinable-property.h"
45 #include "icing/schema/property-util.h"
46 #include "icing/schema/schema-type-manager.h"
47 #include "icing/schema/schema-util.h"
48 #include "icing/schema/section.h"
49 #include "icing/store/document-filter-data.h"
50 #include "icing/store/dynamic-trie-key-mapper.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/logging.h"
53 #include "icing/util/status-macros.h"
54
55 namespace icing {
56 namespace lib {
57
58 namespace {
59
// Names of the files that the schema store keeps inside its base directory.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Each
// array is given 128KiB of storage, so the whole DynamicTrieKeyMapper needs
// 3 * 128KiB = 384KiB.
constexpr int32_t kSchemaTypeMapperNumArrays = 3;
constexpr int32_t kSchemaTypeMapperArraySize = 128 * 1024;  // 128 KiB
constexpr int32_t kSchemaTypeMapperMaxSize =
    kSchemaTypeMapperNumArrays * kSchemaTypeMapperArraySize;  // 384 KiB
69
MakeHeaderFilename(const std::string & base_dir)70 std::string MakeHeaderFilename(const std::string& base_dir) {
71 return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
72 }
73
MakeSchemaFilename(const std::string & base_dir)74 std::string MakeSchemaFilename(const std::string& base_dir) {
75 return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
76 }
77
MakeOverlaySchemaFilename(const std::string & base_dir)78 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
79 return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
80 }
81
MakeSchemaTypeMapperFilename(const std::string & base_dir)82 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
83 return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
84 }
85
86 // Assuming that SchemaTypeIds are assigned to schema types based on their order
87 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
88 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)89 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
90 const SchemaProto& old_schema, const SchemaProto& new_schema) {
91 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
92
93 std::unordered_map<std::string, int> old_types_and_index;
94 for (int i = 0; i < old_schema.types_size(); ++i) {
95 old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
96 }
97
98 std::unordered_map<std::string, int> new_types_and_index;
99 for (int i = 0; i < new_schema.types_size(); ++i) {
100 new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
101 }
102
103 for (const auto& old_type_index : old_types_and_index) {
104 const auto& iter = new_types_and_index.find(old_type_index.first);
105 // We only care if the type exists in both the old and new schema. If the
106 // type has been deleted, then it'll be captured in
107 // SetSchemaResult.schema_types_deleted*. If the type has been added in the
108 // new schema then we also don't care because nothing needs to be updated.
109 if (iter != new_types_and_index.end()) {
110 // Since the SchemaTypeId of the schema type is just the index of it in
111 // the SchemaProto, compare the index and save it if it's not the same
112 if (old_type_index.second != iter->second) {
113 old_schema_type_ids_changed.emplace(old_type_index.second);
114 }
115 }
116 }
117
118 return old_schema_type_ids_changed;
119 }
120
121 } // namespace
122
123 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,const std::string & path)124 SchemaStore::Header::Read(const Filesystem* filesystem,
125 const std::string& path) {
126 Header header;
127 ScopedFd sfd(filesystem->OpenForRead(path.c_str()));
128 if (!sfd.is_valid()) {
129 return absl_ports::NotFoundError("SchemaStore header doesn't exist");
130 }
131
132 // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
133 int64_t file_size = filesystem->GetFileSize(sfd.get());
134 if (file_size == sizeof(LegacyHeader)) {
135 LegacyHeader legacy_header;
136 if (!filesystem->Read(path.c_str(), &legacy_header,
137 sizeof(legacy_header))) {
138 return absl_ports::InternalError(
139 absl_ports::StrCat("Couldn't read: ", path));
140 }
141 if (legacy_header.magic != Header::kMagic) {
142 return absl_ports::InternalError(
143 absl_ports::StrCat("Invalid header kMagic for file: ", path));
144 }
145 header.set_checksum(legacy_header.checksum);
146 } else if (file_size == sizeof(Header)) {
147 if (!filesystem->Read(path.c_str(), &header, sizeof(header))) {
148 return absl_ports::InternalError(
149 absl_ports::StrCat("Couldn't read: ", path));
150 }
151 if (header.magic() != Header::kMagic) {
152 return absl_ports::InternalError(
153 absl_ports::StrCat("Invalid header kMagic for file: ", path));
154 }
155 } else {
156 int legacy_header_size = sizeof(LegacyHeader);
157 int header_size = sizeof(Header);
158 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
159 "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
160 legacy_header_size, header_size));
161 }
162 return header;
163 }
164
Write(const Filesystem * filesystem,const std::string & path)165 libtextclassifier3::Status SchemaStore::Header::Write(
166 const Filesystem* filesystem, const std::string& path) {
167 ScopedFd scoped_fd(filesystem->OpenForWrite(path.c_str()));
168 // This should overwrite the header.
169 if (!scoped_fd.is_valid() ||
170 !filesystem->Write(scoped_fd.get(), this, sizeof(*this)) ||
171 !filesystem->DataSync(scoped_fd.get())) {
172 return absl_ports::InternalError(
173 absl_ports::StrCat("Failed to write SchemaStore header: ", path));
174 }
175 return libtextclassifier3::Status::OK;
176 }
177
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,InitializeStatsProto * initialize_stats)178 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
179 const Filesystem* filesystem, const std::string& base_dir,
180 const Clock* clock, InitializeStatsProto* initialize_stats) {
181 ICING_RETURN_ERROR_IF_NULL(filesystem);
182 ICING_RETURN_ERROR_IF_NULL(clock);
183
184 if (!filesystem->DirectoryExists(base_dir.c_str())) {
185 return absl_ports::FailedPreconditionError(
186 "Schema store base directory does not exist!");
187 }
188 std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
189 new SchemaStore(filesystem, base_dir, clock));
190 ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
191 return schema_store;
192 }
193
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,SchemaProto schema)194 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
195 const Filesystem* filesystem, const std::string& base_dir,
196 const Clock* clock, SchemaProto schema) {
197 ICING_RETURN_ERROR_IF_NULL(filesystem);
198 ICING_RETURN_ERROR_IF_NULL(clock);
199
200 if (!filesystem->DirectoryExists(base_dir.c_str())) {
201 return absl_ports::FailedPreconditionError(
202 "Schema store base directory does not exist!");
203 }
204 std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
205 new SchemaStore(filesystem, base_dir, clock));
206 ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
207 return schema_store;
208 }
209
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)210 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
211 const Filesystem* filesystem, const std::string& base_dir, Header& header) {
212 std::string header_filename = MakeHeaderFilename(base_dir);
213 if (header.overlay_created()) {
214 header.SetOverlayInfo(
215 /*overlay_created=*/false,
216 /*min_overlay_version_compatibility=*/ std::numeric_limits<
217 int32_t>::max());
218 ICING_RETURN_IF_ERROR(header.Write(filesystem, header_filename));
219 }
220 std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
221 if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
222 return absl_ports::InternalError(
223 "Unable to delete stale schema overlay file.");
224 }
225 return libtextclassifier3::Status::OK;
226 }
227
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)228 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
229 const Filesystem* filesystem, const std::string& base_dir,
230 version_util::StateChange version_state_change, int32_t new_version) {
231 if (!filesystem->DirectoryExists(base_dir.c_str())) {
232 // Situations when schema store directory doesn't exist:
233 // - Initializing new Icing instance: don't have to do anything now. The
234 // directory will be created later.
235 // - Lose schema store: there is nothing we can do now. The logic will be
236 // handled later by initializing.
237 //
238 // Therefore, just simply return OK here.
239 return libtextclassifier3::Status::OK;
240 }
241
242 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
243 if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
244 // The overlay doesn't exist. So there should be nothing particularly
245 // interesting to worry about.
246 return libtextclassifier3::Status::OK;
247 }
248
249 std::string header_filename = MakeHeaderFilename(base_dir);
250 libtextclassifier3::StatusOr<Header> header_or;
251 switch (version_state_change) {
252 // No necessary actions for normal upgrades or no version change. The data
253 // that was produced by the previous version is fully compatible with this
254 // version and there's no stale data for us to clean up.
255 // The same is true for a normal rollforward. A normal rollforward implies
256 // that the previous version was one that understood the concept of the
257 // overlay schema and would have already discarded it if it was unusable.
258 case version_util::StateChange::kVersionZeroUpgrade:
259 // fallthrough
260 case version_util::StateChange::kUpgrade:
261 // fallthrough
262 case version_util::StateChange::kRollForward:
263 // fallthrough
264 case version_util::StateChange::kCompatible:
265 return libtextclassifier3::Status::OK;
266 case version_util::StateChange::kVersionZeroRollForward:
267 // We've rolled forward. The schema overlay file, if it exists, is
268 // possibly stale. We must throw it out.
269 header_or = Header::Read(filesystem, header_filename);
270 if (!header_or.ok()) {
271 return header_or.status();
272 }
273 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
274 header_or.ValueOrDie());
275 case version_util::StateChange::kRollBack:
276 header_or = Header::Read(filesystem, header_filename);
277 if (!header_or.ok()) {
278 return header_or.status();
279 }
280 if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
281 new_version) {
282 // We've been rolled back, but the overlay schema claims that it
283 // supports this version. So we can safely return.
284 return libtextclassifier3::Status::OK;
285 }
286 // We've been rolled back to a version that the overlay schema doesn't
287 // support. We must throw it out.
288 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
289 header_or.ValueOrDie());
290 case version_util::StateChange::kUndetermined:
291 // It's not clear what version we're on, but the base schema should always
292 // be safe to use. Throw out the overlay.
293 header_or = Header::Read(filesystem, header_filename);
294 if (!header_or.ok()) {
295 return header_or.status();
296 }
297 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
298 header_or.ValueOrDie());
299 }
300 return libtextclassifier3::Status::OK;
301 }
302
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)303 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
304 const Filesystem* filesystem, const std::string& base_dir) {
305 // Schema type mapper
306 return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
307 *filesystem, MakeSchemaTypeMapperFilename(base_dir));
308 }
309
SchemaStore(const Filesystem * filesystem,std::string base_dir,const Clock * clock)310 SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
311 const Clock* clock)
312 : filesystem_(filesystem),
313 base_dir_(std::move(base_dir)),
314 clock_(clock),
315 schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
316 *filesystem, MakeSchemaFilename(base_dir_))) {}
317
~SchemaStore()318 SchemaStore::~SchemaStore() {
319 if (has_schema_successfully_set_ && schema_file_ != nullptr &&
320 schema_type_mapper_ != nullptr && schema_type_manager_ != nullptr) {
321 if (!PersistToDisk().ok()) {
322 ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
323 }
324 }
325 }
326
Initialize(SchemaProto new_schema)327 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
328 ICING_RETURN_IF_ERROR(LoadSchema());
329 if (!absl_ports::IsNotFound(GetSchema().status())) {
330 return absl_ports::FailedPreconditionError(
331 "Incorrectly tried to initialize schema store with a new schema, when "
332 "one is already set!");
333 }
334 ICING_RETURN_IF_ERROR(schema_file_->Write(
335 std::make_unique<SchemaProto>(std::move(new_schema))));
336 return InitializeInternal(/*create_overlay_if_necessary=*/true,
337 /*initialize_stats=*/nullptr);
338 }
339
Initialize(InitializeStatsProto * initialize_stats)340 libtextclassifier3::Status SchemaStore::Initialize(
341 InitializeStatsProto* initialize_stats) {
342 ICING_RETURN_IF_ERROR(LoadSchema());
343 auto schema_proto_or = GetSchema();
344 if (absl_ports::IsNotFound(schema_proto_or.status())) {
345 // Don't have an existing schema proto, that's fine
346 return libtextclassifier3::Status::OK;
347 } else if (!schema_proto_or.ok()) {
348 // Real error when trying to read the existing schema
349 return schema_proto_or.status();
350 }
351 return InitializeInternal(/*create_overlay_if_necessary=*/false,
352 initialize_stats);
353 }
354
LoadSchema()355 libtextclassifier3::Status SchemaStore::LoadSchema() {
356 libtextclassifier3::StatusOr<Header> header_or =
357 Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
358 bool header_exists = false;
359 if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
360 return header_or.status();
361 } else if (!header_or.ok()) {
362 header_ = std::make_unique<Header>();
363 } else {
364 header_exists = true;
365 header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
366 }
367
368 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
369 bool overlay_schema_file_exists =
370 filesystem_->FileExists(overlay_schema_filename.c_str());
371
372 libtextclassifier3::Status base_schema_state = schema_file_->Read().status();
373 if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
374 return base_schema_state;
375 }
376
377 // There are three valid cases:
378 // 1. Everything is missing. This is an empty schema store.
379 if (!base_schema_state.ok() && !overlay_schema_file_exists &&
380 !header_exists) {
381 return libtextclassifier3::Status::OK;
382 }
383
384 // 2. There never was a overlay schema. The header exists, the base schema
385 // exists and the header says the overlay schema shouldn't exist
386 if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
387 !header_->overlay_created()) {
388 // Nothing else to do. Just return safely.
389 return libtextclassifier3::Status::OK;
390 }
391
392 // 3. There is an overlay schema and a base schema and a header. The header
393 // says that the overlay schema should exist.
394 if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
395 header_->overlay_created()) {
396 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
397 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
398 return libtextclassifier3::Status::OK;
399 }
400
401 // Something has gone wrong. We've lost part of the schema ground truth.
402 // Return an error.
403 bool overlay_created = header_->overlay_created();
404 bool base_schema_exists = base_schema_state.ok();
405 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
406 "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
407 "base schema exists: %d, overlay_schema_exists: %d",
408 header_exists, overlay_created, base_schema_exists,
409 overlay_schema_file_exists));
410 }
411
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)412 libtextclassifier3::Status SchemaStore::InitializeInternal(
413 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
414 if (!InitializeDerivedFiles().ok()) {
415 ICING_VLOG(3)
416 << "Couldn't find derived files or failed to initialize them, "
417 "regenerating derived files for SchemaStore.";
418 std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
419 if (initialize_stats != nullptr) {
420 initialize_stats->set_schema_store_recovery_cause(
421 InitializeStatsProto::IO_ERROR);
422 }
423 ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
424 if (initialize_stats != nullptr) {
425 initialize_stats->set_schema_store_recovery_latency_ms(
426 regenerate_timer->GetElapsedMilliseconds());
427 }
428 }
429
430 if (initialize_stats != nullptr) {
431 initialize_stats->set_num_schema_types(type_config_map_.size());
432 }
433 has_schema_successfully_set_ = true;
434
435 return libtextclassifier3::Status::OK;
436 }
437
InitializeDerivedFiles()438 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
439 ICING_ASSIGN_OR_RETURN(
440 schema_type_mapper_,
441 DynamicTrieKeyMapper<SchemaTypeId>::Create(
442 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
443 kSchemaTypeMapperMaxSize));
444
445 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
446 if (checksum.Get() != header_->checksum()) {
447 return absl_ports::InternalError(
448 "Combined checksum of SchemaStore was inconsistent");
449 }
450
451 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
452 return libtextclassifier3::Status::OK;
453 }
454
RegenerateDerivedFiles(bool create_overlay_if_necessary)455 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
456 bool create_overlay_if_necessary) {
457 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
458
459 ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
460
461 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
462 // Assign a SchemaTypeId to the type
463 ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
464 type_config.schema_type(), schema_type_mapper_->num_keys()));
465 }
466 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
467
468 if (create_overlay_if_necessary) {
469 ICING_ASSIGN_OR_RETURN(
470 BackupSchemaProducer producer,
471 BackupSchemaProducer::Create(*schema_proto,
472 schema_type_manager_->section_manager()));
473
474 if (producer.is_backup_necessary()) {
475 SchemaProto base_schema = std::move(producer).Produce();
476
477 // The overlay schema should be written to the overlay file location.
478 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
479 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
480 auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
481 ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
482
483 // The base schema should be written to the original file
484 auto base_schema_ptr =
485 std::make_unique<SchemaProto>(std::move(base_schema));
486 ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr)));
487
488 // LINT.IfChange(min_overlay_version_compatibility)
489 // Although the current version is 4, the schema is compatible with
490 // version 1, so min_overlay_version_compatibility should be 1.
491 int32_t min_overlay_version_compatibility = version_util::kVersionOne;
492 // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion)
493 header_->SetOverlayInfo(
494 /*overlay_created=*/true, min_overlay_version_compatibility);
495 // Rebuild in memory data - references to the old schema will be invalid
496 // now.
497 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
498 }
499 }
500
501 // Write the header
502 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
503 header_->set_checksum(checksum.Get());
504 return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
505 }
506
BuildInMemoryCache()507 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
508 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
509 ICING_ASSIGN_OR_RETURN(
510 SchemaUtil::InheritanceMap inheritance_map,
511 SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
512
513 reverse_schema_type_mapper_.clear();
514 type_config_map_.clear();
515 schema_subtype_id_map_.clear();
516 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
517 std::string_view type_name = type_config.schema_type();
518 ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
519 schema_type_mapper_->Get(type_name));
520
521 // Build reverse_schema_type_mapper_
522 reverse_schema_type_mapper_.insert({type_id, std::string(type_name)});
523
524 // Build type_config_map_
525 type_config_map_.insert({std::string(type_name), type_config});
526
527 // Build schema_subtype_id_map_
528 std::unordered_set<SchemaTypeId>& subtype_id_set =
529 schema_subtype_id_map_[type_id];
530 // Find all child types
531 auto child_types_names = inheritance_map.find(type_name);
532 if (child_types_names != inheritance_map.end()) {
533 subtype_id_set.reserve(child_types_names->second.size() + 1);
534 for (const auto& [child_type_name, is_direct_child] :
535 child_types_names->second) {
536 ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
537 schema_type_mapper_->Get(child_type_name));
538 subtype_id_set.insert(child_type_id);
539 }
540 }
541 // Every type is a subtype of itself.
542 subtype_id_set.insert(type_id);
543 }
544
545 // Build schema_type_manager_
546 ICING_ASSIGN_OR_RETURN(
547 schema_type_manager_,
548 SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
549 return libtextclassifier3::Status::OK;
550 }
551
ResetSchemaTypeMapper()552 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
553 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
554 schema_type_mapper_.reset();
555 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
556 // that can support error logging.
557 libtextclassifier3::Status status =
558 DynamicTrieKeyMapper<SchemaTypeId>::Delete(
559 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
560 if (!status.ok()) {
561 ICING_LOG(ERROR) << status.error_message()
562 << "Failed to delete old schema_type mapper";
563 return status;
564 }
565 ICING_ASSIGN_OR_RETURN(
566 schema_type_mapper_,
567 DynamicTrieKeyMapper<SchemaTypeId>::Create(
568 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
569 kSchemaTypeMapperMaxSize));
570
571 return libtextclassifier3::Status::OK;
572 }
573
ComputeChecksum() const574 libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
575 // Base schema checksum
576 auto schema_proto_or = schema_file_->Read();
577 if (absl_ports::IsNotFound(schema_proto_or.status())) {
578 return Crc32();
579 }
580 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, schema_proto_or);
581 Crc32 schema_checksum;
582 schema_checksum.Append(schema_proto->SerializeAsString());
583
584 Crc32 overlay_schema_checksum;
585 if (overlay_schema_file_ != nullptr) {
586 auto schema_proto_or = schema_file_->Read();
587 if (schema_proto_or.ok()) {
588 ICING_ASSIGN_OR_RETURN(schema_proto, schema_proto_or);
589 overlay_schema_checksum.Append(schema_proto->SerializeAsString());
590 }
591 }
592
593 ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
594 schema_type_mapper_->ComputeChecksum());
595
596 Crc32 total_checksum;
597 total_checksum.Append(std::to_string(schema_checksum.Get()));
598 if (overlay_schema_file_ != nullptr) {
599 total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
600 }
601 total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
602
603 return total_checksum;
604 }
605
GetSchema() const606 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
607 const {
608 if (overlay_schema_file_ != nullptr) {
609 return overlay_schema_file_->Read();
610 }
611 return schema_file_->Read();
612 }
613
614 // TODO(cassiewang): Consider removing this definition of SetSchema if it's not
615 // needed by production code. It's currently being used by our tests, but maybe
616 // it's trivial to change our test code to also use the
617 // SetSchema(SchemaProto&& new_schema)
618 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)619 SchemaStore::SetSchema(const SchemaProto& new_schema,
620 bool ignore_errors_and_delete_documents,
621 bool allow_circular_schema_definitions) {
622 return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents,
623 allow_circular_schema_definitions);
624 }
625
626 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)627 SchemaStore::SetSchema(SchemaProto&& new_schema,
628 bool ignore_errors_and_delete_documents,
629 bool allow_circular_schema_definitions) {
630 ICING_ASSIGN_OR_RETURN(
631 SchemaUtil::DependentMap new_dependent_map,
632 SchemaUtil::Validate(new_schema, allow_circular_schema_definitions));
633
634 SetSchemaResult result;
635
636 auto schema_proto_or = GetSchema();
637 if (absl_ports::IsNotFound(schema_proto_or.status())) {
638 // We don't have a pre-existing schema, so anything is valid.
639 result.success = true;
640 for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
641 result.schema_types_new_by_name.insert(type_config.schema_type());
642 }
643 } else if (!schema_proto_or.ok()) {
644 // Real error
645 return schema_proto_or.status();
646 } else {
647 // At this point, we're guaranteed that we have a schema.
648 const SchemaProto old_schema = *schema_proto_or.ValueOrDie();
649
650 // Assume we can set the schema unless proven otherwise.
651 result.success = true;
652
653 if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
654 // Same schema as before. No need to update anything
655 return result;
656 }
657
658 // Different schema, track the differences and see if we can still write it
659 SchemaUtil::SchemaDelta schema_delta =
660 SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
661 new_dependent_map);
662
663 result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
664 result.schema_types_changed_fully_compatible_by_name =
665 std::move(schema_delta.schema_types_changed_fully_compatible);
666 result.schema_types_index_incompatible_by_name =
667 std::move(schema_delta.schema_types_index_incompatible);
668 result.schema_types_join_incompatible_by_name =
669 std::move(schema_delta.schema_types_join_incompatible);
670
671 for (const auto& schema_type : schema_delta.schema_types_deleted) {
672 // We currently don't support deletions, so mark this as not possible.
673 // This will change once we allow force-set schemas.
674 result.success = false;
675
676 result.schema_types_deleted_by_name.emplace(schema_type);
677
678 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
679 GetSchemaTypeId(schema_type));
680 result.schema_types_deleted_by_id.emplace(schema_type_id);
681 }
682
683 for (const auto& schema_type : schema_delta.schema_types_incompatible) {
684 // We currently don't support incompatible schemas, so mark this as
685 // not possible. This will change once we allow force-set schemas.
686 result.success = false;
687
688 result.schema_types_incompatible_by_name.emplace(schema_type);
689
690 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
691 GetSchemaTypeId(schema_type));
692 result.schema_types_incompatible_by_id.emplace(schema_type_id);
693 }
694
695 // SchemaTypeIds changing is fine, we can update the DocumentStore
696 result.old_schema_type_ids_changed =
697 SchemaTypeIdsChanged(old_schema, new_schema);
698 }
699
700 // We can force set the schema if the caller has told us to ignore any errors
701 result.success = result.success || ignore_errors_and_delete_documents;
702
703 if (result.success) {
704 ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(new_schema)));
705 has_schema_successfully_set_ = true;
706 }
707
708 return result;
709 }
710
ApplySchemaChange(SchemaProto new_schema)711 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
712 SchemaProto new_schema) {
713 // We need to ensure that we either 1) successfully set the schema and
714 // update all derived data structures or 2) fail and leave the schema store
715 // unchanged.
716 // So, first, we create an empty temporary directory to build a new schema
717 // store in.
718 std::string temp_schema_store_dir_path = base_dir_ + "_temp";
719 if (!filesystem_->DeleteDirectoryRecursively(
720 temp_schema_store_dir_path.c_str())) {
721 ICING_LOG(ERROR) << "Recursively deleting "
722 << temp_schema_store_dir_path.c_str();
723 return absl_ports::InternalError(
724 "Unable to delete temp directory to prepare to build new schema "
725 "store.");
726 }
727
728 DestructibleDirectory temp_schema_store_dir(
729 filesystem_, std::move(temp_schema_store_dir_path));
730 if (!temp_schema_store_dir.is_valid()) {
731 return absl_ports::InternalError(
732 "Unable to create temp directory to build new schema store.");
733 }
734
735 // Then we create our new schema store with the new schema.
736 ICING_ASSIGN_OR_RETURN(
737 std::unique_ptr<SchemaStore> new_schema_store,
738 SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
739 std::move(new_schema)));
740
741 // Then we swap the new schema file + new derived files with the old files.
742 if (!filesystem_->SwapFiles(base_dir_.c_str(),
743 temp_schema_store_dir.dir().c_str())) {
744 return absl_ports::InternalError(
745 "Unable to apply new schema due to failed swap!");
746 }
747
748 std::string old_base_dir = std::move(base_dir_);
749 *this = std::move(*new_schema_store);
750
751 // After the std::move, the filepaths saved in this instance and in the
752 // schema_file_ instance will still be the one from temp_schema_store_dir
753 // even though they now point to files that are within old_base_dir.
754 // Manually set them to the correct paths.
755 base_dir_ = std::move(old_base_dir);
756 schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
757 if (overlay_schema_file_ != nullptr) {
758 overlay_schema_file_->SetSwappedFilepath(
759 MakeOverlaySchemaFilename(base_dir_));
760 }
761
762 return libtextclassifier3::Status::OK;
763 }
764
765 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const766 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
767 ICING_RETURN_IF_ERROR(CheckSchemaSet());
768 const auto& type_config_iter =
769 type_config_map_.find(std::string(schema_type));
770 if (type_config_iter == type_config_map_.end()) {
771 return absl_ports::NotFoundError(
772 absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
773 }
774 return &type_config_iter->second;
775 }
776
GetSchemaTypeId(std::string_view schema_type) const777 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
778 std::string_view schema_type) const {
779 ICING_RETURN_IF_ERROR(CheckSchemaSet());
780 return schema_type_mapper_->Get(schema_type);
781 }
782
GetSchemaType(SchemaTypeId schema_type_id) const783 libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType(
784 SchemaTypeId schema_type_id) const {
785 ICING_RETURN_IF_ERROR(CheckSchemaSet());
786 if (const auto it = reverse_schema_type_mapper_.find(schema_type_id);
787 it == reverse_schema_type_mapper_.end()) {
788 return absl_ports::InvalidArgumentError("Invalid schema type id");
789 } else {
790 return &it->second;
791 }
792 }
793
794 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const795 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
796 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
797 GetSchemaTypeId(schema_type));
798 auto iter = schema_subtype_id_map_.find(schema_type_id);
799 if (iter == schema_subtype_id_map_.end()) {
800 // This should never happen, unless there is an inconsistency or IO error.
801 return absl_ports::InternalError(absl_ports::StrCat(
802 "Schema type '", schema_type, "' is not found in the subtype map."));
803 }
804 return &iter->second;
805 }
806
807 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const808 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
809 SectionId section_id) const {
810 ICING_RETURN_IF_ERROR(CheckSchemaSet());
811 return schema_type_manager_->section_manager().GetSectionMetadata(
812 schema_type_id, section_id);
813 }
814
ExtractSections(const DocumentProto & document) const815 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
816 const DocumentProto& document) const {
817 ICING_RETURN_IF_ERROR(CheckSchemaSet());
818 return schema_type_manager_->section_manager().ExtractSections(document);
819 }
820
821 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const822 SchemaStore::GetJoinablePropertyMetadata(
823 SchemaTypeId schema_type_id, const std::string& property_path) const {
824 ICING_RETURN_IF_ERROR(CheckSchemaSet());
825 return schema_type_manager_->joinable_property_manager()
826 .GetJoinablePropertyMetadata(schema_type_id, property_path);
827 }
828
829 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const830 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
831 ICING_RETURN_IF_ERROR(CheckSchemaSet());
832 return schema_type_manager_->joinable_property_manager()
833 .ExtractJoinableProperties(document);
834 }
835
PersistToDisk()836 libtextclassifier3::Status SchemaStore::PersistToDisk() {
837 if (!has_schema_successfully_set_) {
838 return libtextclassifier3::Status::OK;
839 }
840 ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
841 // Write the header
842 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
843 header_->set_checksum(checksum.Get());
844 return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
845 }
846
GetStorageInfo() const847 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
848 SchemaStoreStorageInfoProto storage_info;
849 int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
850 storage_info.set_schema_store_size(
851 Filesystem::SanitizeFileSize(directory_size));
852 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
853 storage_info.set_num_schema_types(schema->types_size());
854 int total_sections = 0;
855 int num_types_sections_exhausted = 0;
856 for (const SchemaTypeConfigProto& type : schema->types()) {
857 auto sections_list_or =
858 schema_type_manager_->section_manager().GetMetadataList(
859 type.schema_type());
860 if (!sections_list_or.ok()) {
861 continue;
862 }
863 total_sections += sections_list_or.ValueOrDie()->size();
864 if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
865 ++num_types_sections_exhausted;
866 }
867 }
868
869 storage_info.set_num_total_sections(total_sections);
870 storage_info.set_num_schema_types_sections_exhausted(
871 num_types_sections_exhausted);
872 return storage_info;
873 }
874
875 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const876 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
877 return schema_type_manager_->section_manager().GetMetadataList(schema_type);
878 }
879
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const880 bool SchemaStore::IsPropertyDefinedInSchema(
881 SchemaTypeId schema_type_id, const std::string& property_path) const {
882 auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
883 if (schema_name_itr == reverse_schema_type_mapper_.end()) {
884 return false;
885 }
886 const std::string* current_type_name = &schema_name_itr->second;
887
888 std::vector<std::string_view> property_path_parts =
889 property_util::SplitPropertyPathExpr(property_path);
890 for (int i = 0; i < property_path_parts.size(); ++i) {
891 auto type_config_itr = type_config_map_.find(*current_type_name);
892 if (type_config_itr == type_config_map_.end()) {
893 return false;
894 }
895 std::string_view property_name = property_path_parts.at(i);
896 const PropertyConfigProto* selected_property = nullptr;
897 for (const PropertyConfigProto& property :
898 type_config_itr->second.properties()) {
899 if (property.property_name() == property_name) {
900 selected_property = &property;
901 break;
902 }
903 }
904 if (selected_property == nullptr) {
905 return false;
906 }
907 if (i == property_path_parts.size() - 1) {
908 // We've found a property at the final part of the path.
909 return true;
910 }
911 if (selected_property->data_type() !=
912 PropertyConfigProto::DataType::DOCUMENT) {
913 // If this isn't final part of the path, but this property isn't a
914 // document, so we know that this path doesn't exist.
915 return false;
916 }
917 current_type_name = &selected_property->schema_type();
918 }
919
920 // We should never reach this point.
921 return false;
922 }
923
GetDebugInfo() const924 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
925 const {
926 SchemaDebugInfoProto debug_info;
927 if (has_schema_successfully_set_) {
928 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
929 *debug_info.mutable_schema() = *schema;
930 }
931 ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
932 debug_info.set_crc(crc.Get());
933 return debug_info;
934 }
935
936 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const937 SchemaStore::ExpandTypePropertyMasks(
938 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
939 const {
940 std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
941 for (const TypePropertyMask& type_field_mask : type_property_masks) {
942 if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
943 ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
944 /*paths=*/{}};
945 entry.paths.insert(type_field_mask.paths().begin(),
946 type_field_mask.paths().end());
947 result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
948 } else {
949 auto schema_type_ids_or =
950 GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
951 // If we can't find the SchemaTypeIds, just throw it away
952 if (!schema_type_ids_or.ok()) {
953 continue;
954 }
955 const std::unordered_set<SchemaTypeId>* schema_type_ids =
956 schema_type_ids_or.ValueOrDie();
957 for (SchemaTypeId schema_type_id : *schema_type_ids) {
958 auto schema_type_name_iter =
959 reverse_schema_type_mapper_.find(schema_type_id);
960 if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
961 // This should never happen, unless there is an inconsistency or IO
962 // error.
963 ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
964 continue;
965 }
966
967 auto iter = result_map.find(schema_type_id);
968 if (iter == result_map.end()) {
969 ExpandedTypePropertyMask entry{schema_type_name_iter->second,
970 /*paths=*/{}};
971 iter = result_map.insert({schema_type_id, std::move(entry)}).first;
972 }
973 iter->second.paths.insert(type_field_mask.paths().begin(),
974 type_field_mask.paths().end());
975 }
976 }
977 }
978 std::vector<ExpandedTypePropertyMask> result;
979 result.reserve(result_map.size());
980 for (auto& entry : result_map) {
981 result.push_back(std::move(entry.second));
982 }
983 return result;
984 }
985
986 } // namespace lib
987 } // namespace icing
988