// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #include "icing/schema/schema-store.h"
16 
17 #include <algorithm>
18 #include <cinttypes>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/destructible-directory.h"
34 #include "icing/file/file-backed-proto.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/version-util.h"
37 #include "icing/proto/debug.pb.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/logging.pb.h"
40 #include "icing/proto/schema.pb.h"
41 #include "icing/proto/search.pb.h"
42 #include "icing/proto/storage.pb.h"
43 #include "icing/schema/backup-schema-producer.h"
44 #include "icing/schema/joinable-property.h"
45 #include "icing/schema/property-util.h"
46 #include "icing/schema/schema-type-manager.h"
47 #include "icing/schema/schema-util.h"
48 #include "icing/schema/section.h"
49 #include "icing/store/document-filter-data.h"
50 #include "icing/store/dynamic-trie-key-mapper.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/logging.h"
53 #include "icing/util/status-macros.h"
54 
55 namespace icing {
56 namespace lib {
57 
58 namespace {
59 
// Names of the files that the schema store keeps inside its base directory.
// Each name is appended to the base directory by the matching Make*Filename()
// helper defined in this anonymous namespace.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Giving
// each array 128KiB for storage means the entire DynamicTrieKeyMapper requires
// 384KiB. This value is passed as the size limit whenever the schema type
// mapper is created.
constexpr int32_t kSchemaTypeMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
69 
MakeHeaderFilename(const std::string & base_dir)70 std::string MakeHeaderFilename(const std::string& base_dir) {
71   return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
72 }
73 
MakeSchemaFilename(const std::string & base_dir)74 std::string MakeSchemaFilename(const std::string& base_dir) {
75   return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
76 }
77 
MakeOverlaySchemaFilename(const std::string & base_dir)78 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
79   return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
80 }
81 
MakeSchemaTypeMapperFilename(const std::string & base_dir)82 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
83   return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
84 }
85 
86 // Assuming that SchemaTypeIds are assigned to schema types based on their order
87 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
88 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)89 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
90     const SchemaProto& old_schema, const SchemaProto& new_schema) {
91   std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
92 
93   std::unordered_map<std::string, int> old_types_and_index;
94   for (int i = 0; i < old_schema.types_size(); ++i) {
95     old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
96   }
97 
98   std::unordered_map<std::string, int> new_types_and_index;
99   for (int i = 0; i < new_schema.types_size(); ++i) {
100     new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
101   }
102 
103   for (const auto& old_type_index : old_types_and_index) {
104     const auto& iter = new_types_and_index.find(old_type_index.first);
105     // We only care if the type exists in both the old and new schema. If the
106     // type has been deleted, then it'll be captured in
107     // SetSchemaResult.schema_types_deleted*. If the type has been added in the
108     // new schema then we also don't care because nothing needs to be updated.
109     if (iter != new_types_and_index.end()) {
110       // Since the SchemaTypeId of the schema type is just the index of it in
111       // the SchemaProto, compare the index and save it if it's not the same
112       if (old_type_index.second != iter->second) {
113         old_schema_type_ids_changed.emplace(old_type_index.second);
114       }
115     }
116   }
117 
118   return old_schema_type_ids_changed;
119 }
120 
121 }  // namespace
122 
123 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,const std::string & path)124 SchemaStore::Header::Read(const Filesystem* filesystem,
125                           const std::string& path) {
126   Header header;
127   ScopedFd sfd(filesystem->OpenForRead(path.c_str()));
128   if (!sfd.is_valid()) {
129     return absl_ports::NotFoundError("SchemaStore header doesn't exist");
130   }
131 
132   // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
133   int64_t file_size = filesystem->GetFileSize(sfd.get());
134   if (file_size == sizeof(LegacyHeader)) {
135     LegacyHeader legacy_header;
136     if (!filesystem->Read(path.c_str(), &legacy_header,
137                           sizeof(legacy_header))) {
138       return absl_ports::InternalError(
139           absl_ports::StrCat("Couldn't read: ", path));
140     }
141     if (legacy_header.magic != Header::kMagic) {
142       return absl_ports::InternalError(
143           absl_ports::StrCat("Invalid header kMagic for file: ", path));
144     }
145     header.set_checksum(legacy_header.checksum);
146   } else if (file_size == sizeof(Header)) {
147     if (!filesystem->Read(path.c_str(), &header, sizeof(header))) {
148       return absl_ports::InternalError(
149           absl_ports::StrCat("Couldn't read: ", path));
150     }
151     if (header.magic() != Header::kMagic) {
152       return absl_ports::InternalError(
153           absl_ports::StrCat("Invalid header kMagic for file: ", path));
154     }
155   } else {
156     int legacy_header_size = sizeof(LegacyHeader);
157     int header_size = sizeof(Header);
158     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
159         "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
160         legacy_header_size, header_size));
161   }
162   return header;
163 }
164 
Write(const Filesystem * filesystem,const std::string & path)165 libtextclassifier3::Status SchemaStore::Header::Write(
166     const Filesystem* filesystem, const std::string& path) {
167   ScopedFd scoped_fd(filesystem->OpenForWrite(path.c_str()));
168   // This should overwrite the header.
169   if (!scoped_fd.is_valid() ||
170       !filesystem->Write(scoped_fd.get(), this, sizeof(*this)) ||
171       !filesystem->DataSync(scoped_fd.get())) {
172     return absl_ports::InternalError(
173         absl_ports::StrCat("Failed to write SchemaStore header: ", path));
174   }
175   return libtextclassifier3::Status::OK;
176 }
177 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,InitializeStatsProto * initialize_stats)178 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
179     const Filesystem* filesystem, const std::string& base_dir,
180     const Clock* clock, InitializeStatsProto* initialize_stats) {
181   ICING_RETURN_ERROR_IF_NULL(filesystem);
182   ICING_RETURN_ERROR_IF_NULL(clock);
183 
184   if (!filesystem->DirectoryExists(base_dir.c_str())) {
185     return absl_ports::FailedPreconditionError(
186         "Schema store base directory does not exist!");
187   }
188   std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
189       new SchemaStore(filesystem, base_dir, clock));
190   ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
191   return schema_store;
192 }
193 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,SchemaProto schema)194 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
195     const Filesystem* filesystem, const std::string& base_dir,
196     const Clock* clock, SchemaProto schema) {
197   ICING_RETURN_ERROR_IF_NULL(filesystem);
198   ICING_RETURN_ERROR_IF_NULL(clock);
199 
200   if (!filesystem->DirectoryExists(base_dir.c_str())) {
201     return absl_ports::FailedPreconditionError(
202         "Schema store base directory does not exist!");
203   }
204   std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
205       new SchemaStore(filesystem, base_dir, clock));
206   ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
207   return schema_store;
208 }
209 
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)210 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
211     const Filesystem* filesystem, const std::string& base_dir, Header& header) {
212   std::string header_filename = MakeHeaderFilename(base_dir);
213   if (header.overlay_created()) {
214     header.SetOverlayInfo(
215         /*overlay_created=*/false,
216         /*min_overlay_version_compatibility=*/ std::numeric_limits<
217             int32_t>::max());
218     ICING_RETURN_IF_ERROR(header.Write(filesystem, header_filename));
219   }
220   std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
221   if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
222     return absl_ports::InternalError(
223         "Unable to delete stale schema overlay file.");
224   }
225   return libtextclassifier3::Status::OK;
226 }
227 
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)228 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
229     const Filesystem* filesystem, const std::string& base_dir,
230     version_util::StateChange version_state_change, int32_t new_version) {
231   if (!filesystem->DirectoryExists(base_dir.c_str())) {
232     // Situations when schema store directory doesn't exist:
233     // - Initializing new Icing instance: don't have to do anything now. The
234     //   directory will be created later.
235     // - Lose schema store: there is nothing we can do now. The logic will be
236     //   handled later by initializing.
237     //
238     // Therefore, just simply return OK here.
239     return libtextclassifier3::Status::OK;
240   }
241 
242   std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
243   if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
244     // The overlay doesn't exist. So there should be nothing particularly
245     // interesting to worry about.
246     return libtextclassifier3::Status::OK;
247   }
248 
249   std::string header_filename = MakeHeaderFilename(base_dir);
250   libtextclassifier3::StatusOr<Header> header_or;
251   switch (version_state_change) {
252     // No necessary actions for normal upgrades or no version change. The data
253     // that was produced by the previous version is fully compatible with this
254     // version and there's no stale data for us to clean up.
255     // The same is true for a normal rollforward. A normal rollforward implies
256     // that the previous version was one that understood the concept of the
257     // overlay schema and would have already discarded it if it was unusable.
258     case version_util::StateChange::kVersionZeroUpgrade:
259       // fallthrough
260     case version_util::StateChange::kUpgrade:
261       // fallthrough
262     case version_util::StateChange::kRollForward:
263       // fallthrough
264     case version_util::StateChange::kCompatible:
265       return libtextclassifier3::Status::OK;
266     case version_util::StateChange::kVersionZeroRollForward:
267       // We've rolled forward. The schema overlay file, if it exists, is
268       // possibly stale. We must throw it out.
269       header_or = Header::Read(filesystem, header_filename);
270       if (!header_or.ok()) {
271         return header_or.status();
272       }
273       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
274                                                header_or.ValueOrDie());
275     case version_util::StateChange::kRollBack:
276       header_or = Header::Read(filesystem, header_filename);
277       if (!header_or.ok()) {
278         return header_or.status();
279       }
280       if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
281           new_version) {
282         // We've been rolled back, but the overlay schema claims that it
283         // supports this version. So we can safely return.
284         return libtextclassifier3::Status::OK;
285       }
286       // We've been rolled back to a version that the overlay schema doesn't
287       // support. We must throw it out.
288       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
289                                                header_or.ValueOrDie());
290     case version_util::StateChange::kUndetermined:
291       // It's not clear what version we're on, but the base schema should always
292       // be safe to use. Throw out the overlay.
293       header_or = Header::Read(filesystem, header_filename);
294       if (!header_or.ok()) {
295         return header_or.status();
296       }
297       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
298                                                header_or.ValueOrDie());
299   }
300   return libtextclassifier3::Status::OK;
301 }
302 
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)303 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
304     const Filesystem* filesystem, const std::string& base_dir) {
305   // Schema type mapper
306   return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
307       *filesystem, MakeSchemaTypeMapperFilename(base_dir));
308 }
309 
SchemaStore(const Filesystem * filesystem,std::string base_dir,const Clock * clock)310 SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
311                          const Clock* clock)
312     : filesystem_(filesystem),
313       base_dir_(std::move(base_dir)),
314       clock_(clock),
315       schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
316           *filesystem, MakeSchemaFilename(base_dir_))) {}
317 
~SchemaStore()318 SchemaStore::~SchemaStore() {
319   if (has_schema_successfully_set_ && schema_file_ != nullptr &&
320       schema_type_mapper_ != nullptr && schema_type_manager_ != nullptr) {
321     if (!PersistToDisk().ok()) {
322       ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
323     }
324   }
325 }
326 
Initialize(SchemaProto new_schema)327 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
328   ICING_RETURN_IF_ERROR(LoadSchema());
329   if (!absl_ports::IsNotFound(GetSchema().status())) {
330     return absl_ports::FailedPreconditionError(
331         "Incorrectly tried to initialize schema store with a new schema, when "
332         "one is already set!");
333   }
334   ICING_RETURN_IF_ERROR(schema_file_->Write(
335       std::make_unique<SchemaProto>(std::move(new_schema))));
336   return InitializeInternal(/*create_overlay_if_necessary=*/true,
337                             /*initialize_stats=*/nullptr);
338 }
339 
Initialize(InitializeStatsProto * initialize_stats)340 libtextclassifier3::Status SchemaStore::Initialize(
341     InitializeStatsProto* initialize_stats) {
342   ICING_RETURN_IF_ERROR(LoadSchema());
343   auto schema_proto_or = GetSchema();
344   if (absl_ports::IsNotFound(schema_proto_or.status())) {
345     // Don't have an existing schema proto, that's fine
346     return libtextclassifier3::Status::OK;
347   } else if (!schema_proto_or.ok()) {
348     // Real error when trying to read the existing schema
349     return schema_proto_or.status();
350   }
351   return InitializeInternal(/*create_overlay_if_necessary=*/false,
352                             initialize_stats);
353 }
354 
LoadSchema()355 libtextclassifier3::Status SchemaStore::LoadSchema() {
356   libtextclassifier3::StatusOr<Header> header_or =
357       Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
358   bool header_exists = false;
359   if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
360     return header_or.status();
361   } else if (!header_or.ok()) {
362     header_ = std::make_unique<Header>();
363   } else {
364     header_exists = true;
365     header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
366   }
367 
368   std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
369   bool overlay_schema_file_exists =
370       filesystem_->FileExists(overlay_schema_filename.c_str());
371 
372   libtextclassifier3::Status base_schema_state = schema_file_->Read().status();
373   if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
374     return base_schema_state;
375   }
376 
377   // There are three valid cases:
378   // 1. Everything is missing. This is an empty schema store.
379   if (!base_schema_state.ok() && !overlay_schema_file_exists &&
380       !header_exists) {
381     return libtextclassifier3::Status::OK;
382   }
383 
384   // 2. There never was a overlay schema. The header exists, the base schema
385   //    exists and the header says the overlay schema shouldn't exist
386   if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
387       !header_->overlay_created()) {
388     // Nothing else to do. Just return safely.
389     return libtextclassifier3::Status::OK;
390   }
391 
392   // 3. There is an overlay schema and a base schema and a header. The header
393   // says that the overlay schema should exist.
394   if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
395       header_->overlay_created()) {
396     overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
397         *filesystem_, MakeOverlaySchemaFilename(base_dir_));
398     return libtextclassifier3::Status::OK;
399   }
400 
401   // Something has gone wrong. We've lost part of the schema ground truth.
402   // Return an error.
403   bool overlay_created = header_->overlay_created();
404   bool base_schema_exists = base_schema_state.ok();
405   return absl_ports::InternalError(IcingStringUtil::StringPrintf(
406       "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
407       "base schema exists: %d, overlay_schema_exists: %d",
408       header_exists, overlay_created, base_schema_exists,
409       overlay_schema_file_exists));
410 }
411 
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)412 libtextclassifier3::Status SchemaStore::InitializeInternal(
413     bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
414   if (!InitializeDerivedFiles().ok()) {
415     ICING_VLOG(3)
416         << "Couldn't find derived files or failed to initialize them, "
417            "regenerating derived files for SchemaStore.";
418     std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
419     if (initialize_stats != nullptr) {
420       initialize_stats->set_schema_store_recovery_cause(
421           InitializeStatsProto::IO_ERROR);
422     }
423     ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
424     if (initialize_stats != nullptr) {
425       initialize_stats->set_schema_store_recovery_latency_ms(
426           regenerate_timer->GetElapsedMilliseconds());
427     }
428   }
429 
430   if (initialize_stats != nullptr) {
431     initialize_stats->set_num_schema_types(type_config_map_.size());
432   }
433   has_schema_successfully_set_ = true;
434 
435   return libtextclassifier3::Status::OK;
436 }
437 
InitializeDerivedFiles()438 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
439   ICING_ASSIGN_OR_RETURN(
440       schema_type_mapper_,
441       DynamicTrieKeyMapper<SchemaTypeId>::Create(
442           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
443           kSchemaTypeMapperMaxSize));
444 
445   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
446   if (checksum.Get() != header_->checksum()) {
447     return absl_ports::InternalError(
448         "Combined checksum of SchemaStore was inconsistent");
449   }
450 
451   ICING_RETURN_IF_ERROR(BuildInMemoryCache());
452   return libtextclassifier3::Status::OK;
453 }
454 
RegenerateDerivedFiles(bool create_overlay_if_necessary)455 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
456     bool create_overlay_if_necessary) {
457   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
458 
459   ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
460 
461   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
462     // Assign a SchemaTypeId to the type
463     ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
464         type_config.schema_type(), schema_type_mapper_->num_keys()));
465   }
466   ICING_RETURN_IF_ERROR(BuildInMemoryCache());
467 
468   if (create_overlay_if_necessary) {
469     ICING_ASSIGN_OR_RETURN(
470         BackupSchemaProducer producer,
471         BackupSchemaProducer::Create(*schema_proto,
472                                      schema_type_manager_->section_manager()));
473 
474     if (producer.is_backup_necessary()) {
475       SchemaProto base_schema = std::move(producer).Produce();
476 
477       // The overlay schema should be written to the overlay file location.
478       overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
479           *filesystem_, MakeOverlaySchemaFilename(base_dir_));
480       auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
481       ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
482 
483       // The base schema should be written to the original file
484       auto base_schema_ptr =
485           std::make_unique<SchemaProto>(std::move(base_schema));
486       ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr)));
487 
488       // LINT.IfChange(min_overlay_version_compatibility)
489       // Although the current version is 4, the schema is compatible with
490       // version 1, so min_overlay_version_compatibility should be 1.
491       int32_t min_overlay_version_compatibility = version_util::kVersionOne;
492       // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion)
493       header_->SetOverlayInfo(
494           /*overlay_created=*/true, min_overlay_version_compatibility);
495       // Rebuild in memory data - references to the old schema will be invalid
496       // now.
497       ICING_RETURN_IF_ERROR(BuildInMemoryCache());
498     }
499   }
500 
501   // Write the header
502   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
503   header_->set_checksum(checksum.Get());
504   return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
505 }
506 
BuildInMemoryCache()507 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
508   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
509   ICING_ASSIGN_OR_RETURN(
510       SchemaUtil::InheritanceMap inheritance_map,
511       SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
512 
513   reverse_schema_type_mapper_.clear();
514   type_config_map_.clear();
515   schema_subtype_id_map_.clear();
516   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
517     std::string_view type_name = type_config.schema_type();
518     ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
519                            schema_type_mapper_->Get(type_name));
520 
521     // Build reverse_schema_type_mapper_
522     reverse_schema_type_mapper_.insert({type_id, std::string(type_name)});
523 
524     // Build type_config_map_
525     type_config_map_.insert({std::string(type_name), type_config});
526 
527     // Build schema_subtype_id_map_
528     std::unordered_set<SchemaTypeId>& subtype_id_set =
529         schema_subtype_id_map_[type_id];
530     // Find all child types
531     auto child_types_names = inheritance_map.find(type_name);
532     if (child_types_names != inheritance_map.end()) {
533       subtype_id_set.reserve(child_types_names->second.size() + 1);
534       for (const auto& [child_type_name, is_direct_child] :
535            child_types_names->second) {
536         ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
537                                schema_type_mapper_->Get(child_type_name));
538         subtype_id_set.insert(child_type_id);
539       }
540     }
541     // Every type is a subtype of itself.
542     subtype_id_set.insert(type_id);
543   }
544 
545   // Build schema_type_manager_
546   ICING_ASSIGN_OR_RETURN(
547       schema_type_manager_,
548       SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
549   return libtextclassifier3::Status::OK;
550 }
551 
ResetSchemaTypeMapper()552 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
553   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
554   schema_type_mapper_.reset();
555   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
556   // that can support error logging.
557   libtextclassifier3::Status status =
558       DynamicTrieKeyMapper<SchemaTypeId>::Delete(
559           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
560   if (!status.ok()) {
561     ICING_LOG(ERROR) << status.error_message()
562                      << "Failed to delete old schema_type mapper";
563     return status;
564   }
565   ICING_ASSIGN_OR_RETURN(
566       schema_type_mapper_,
567       DynamicTrieKeyMapper<SchemaTypeId>::Create(
568           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
569           kSchemaTypeMapperMaxSize));
570 
571   return libtextclassifier3::Status::OK;
572 }
573 
ComputeChecksum() const574 libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
575   // Base schema checksum
576   auto schema_proto_or = schema_file_->Read();
577   if (absl_ports::IsNotFound(schema_proto_or.status())) {
578     return Crc32();
579   }
580   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, schema_proto_or);
581   Crc32 schema_checksum;
582   schema_checksum.Append(schema_proto->SerializeAsString());
583 
584   Crc32 overlay_schema_checksum;
585   if (overlay_schema_file_ != nullptr) {
586     auto schema_proto_or = schema_file_->Read();
587     if (schema_proto_or.ok()) {
588       ICING_ASSIGN_OR_RETURN(schema_proto, schema_proto_or);
589       overlay_schema_checksum.Append(schema_proto->SerializeAsString());
590     }
591   }
592 
593   ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
594                          schema_type_mapper_->ComputeChecksum());
595 
596   Crc32 total_checksum;
597   total_checksum.Append(std::to_string(schema_checksum.Get()));
598   if (overlay_schema_file_ != nullptr) {
599     total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
600   }
601   total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
602 
603   return total_checksum;
604 }
605 
GetSchema() const606 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
607     const {
608   if (overlay_schema_file_ != nullptr) {
609     return overlay_schema_file_->Read();
610   }
611   return schema_file_->Read();
612 }
613 
614 // TODO(cassiewang): Consider removing this definition of SetSchema if it's not
615 // needed by production code. It's currently being used by our tests, but maybe
616 // it's trivial to change our test code to also use the
617 // SetSchema(SchemaProto&& new_schema)
618 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)619 SchemaStore::SetSchema(const SchemaProto& new_schema,
620                        bool ignore_errors_and_delete_documents,
621                        bool allow_circular_schema_definitions) {
622   return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents,
623                    allow_circular_schema_definitions);
624 }
625 
626 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)627 SchemaStore::SetSchema(SchemaProto&& new_schema,
628                        bool ignore_errors_and_delete_documents,
629                        bool allow_circular_schema_definitions) {
630   ICING_ASSIGN_OR_RETURN(
631       SchemaUtil::DependentMap new_dependent_map,
632       SchemaUtil::Validate(new_schema, allow_circular_schema_definitions));
633 
634   SetSchemaResult result;
635 
636   auto schema_proto_or = GetSchema();
637   if (absl_ports::IsNotFound(schema_proto_or.status())) {
638     // We don't have a pre-existing schema, so anything is valid.
639     result.success = true;
640     for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
641       result.schema_types_new_by_name.insert(type_config.schema_type());
642     }
643   } else if (!schema_proto_or.ok()) {
644     // Real error
645     return schema_proto_or.status();
646   } else {
647     // At this point, we're guaranteed that we have a schema.
648     const SchemaProto old_schema = *schema_proto_or.ValueOrDie();
649 
650     // Assume we can set the schema unless proven otherwise.
651     result.success = true;
652 
653     if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
654       // Same schema as before. No need to update anything
655       return result;
656     }
657 
658     // Different schema, track the differences and see if we can still write it
659     SchemaUtil::SchemaDelta schema_delta =
660         SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
661                                               new_dependent_map);
662 
663     result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
664     result.schema_types_changed_fully_compatible_by_name =
665         std::move(schema_delta.schema_types_changed_fully_compatible);
666     result.schema_types_index_incompatible_by_name =
667         std::move(schema_delta.schema_types_index_incompatible);
668     result.schema_types_join_incompatible_by_name =
669         std::move(schema_delta.schema_types_join_incompatible);
670 
671     for (const auto& schema_type : schema_delta.schema_types_deleted) {
672       // We currently don't support deletions, so mark this as not possible.
673       // This will change once we allow force-set schemas.
674       result.success = false;
675 
676       result.schema_types_deleted_by_name.emplace(schema_type);
677 
678       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
679                              GetSchemaTypeId(schema_type));
680       result.schema_types_deleted_by_id.emplace(schema_type_id);
681     }
682 
683     for (const auto& schema_type : schema_delta.schema_types_incompatible) {
684       // We currently don't support incompatible schemas, so mark this as
685       // not possible. This will change once we allow force-set schemas.
686       result.success = false;
687 
688       result.schema_types_incompatible_by_name.emplace(schema_type);
689 
690       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
691                              GetSchemaTypeId(schema_type));
692       result.schema_types_incompatible_by_id.emplace(schema_type_id);
693     }
694 
695     // SchemaTypeIds changing is fine, we can update the DocumentStore
696     result.old_schema_type_ids_changed =
697         SchemaTypeIdsChanged(old_schema, new_schema);
698   }
699 
700   // We can force set the schema if the caller has told us to ignore any errors
701   result.success = result.success || ignore_errors_and_delete_documents;
702 
703   if (result.success) {
704     ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(new_schema)));
705     has_schema_successfully_set_ = true;
706   }
707 
708   return result;
709 }
710 
ApplySchemaChange(SchemaProto new_schema)711 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
712     SchemaProto new_schema) {
713   // We need to ensure that we either 1) successfully set the schema and
714   // update all derived data structures or 2) fail and leave the schema store
715   // unchanged.
716   // So, first, we create an empty temporary directory to build a new schema
717   // store in.
718   std::string temp_schema_store_dir_path = base_dir_ + "_temp";
719   if (!filesystem_->DeleteDirectoryRecursively(
720           temp_schema_store_dir_path.c_str())) {
721     ICING_LOG(ERROR) << "Recursively deleting "
722                      << temp_schema_store_dir_path.c_str();
723     return absl_ports::InternalError(
724         "Unable to delete temp directory to prepare to build new schema "
725         "store.");
726   }
727 
728   DestructibleDirectory temp_schema_store_dir(
729       filesystem_, std::move(temp_schema_store_dir_path));
730   if (!temp_schema_store_dir.is_valid()) {
731     return absl_ports::InternalError(
732         "Unable to create temp directory to build new schema store.");
733   }
734 
735   // Then we create our new schema store with the new schema.
736   ICING_ASSIGN_OR_RETURN(
737       std::unique_ptr<SchemaStore> new_schema_store,
738       SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
739                           std::move(new_schema)));
740 
741   // Then we swap the new schema file + new derived files with the old files.
742   if (!filesystem_->SwapFiles(base_dir_.c_str(),
743                               temp_schema_store_dir.dir().c_str())) {
744     return absl_ports::InternalError(
745         "Unable to apply new schema due to failed swap!");
746   }
747 
748   std::string old_base_dir = std::move(base_dir_);
749   *this = std::move(*new_schema_store);
750 
751   // After the std::move, the filepaths saved in this instance and in the
752   // schema_file_ instance will still be the one from temp_schema_store_dir
753   // even though they now point to files that are within old_base_dir.
754   // Manually set them to the correct paths.
755   base_dir_ = std::move(old_base_dir);
756   schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
757   if (overlay_schema_file_ != nullptr) {
758     overlay_schema_file_->SetSwappedFilepath(
759         MakeOverlaySchemaFilename(base_dir_));
760   }
761 
762   return libtextclassifier3::Status::OK;
763 }
764 
765 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const766 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
767   ICING_RETURN_IF_ERROR(CheckSchemaSet());
768   const auto& type_config_iter =
769       type_config_map_.find(std::string(schema_type));
770   if (type_config_iter == type_config_map_.end()) {
771     return absl_ports::NotFoundError(
772         absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
773   }
774   return &type_config_iter->second;
775 }
776 
GetSchemaTypeId(std::string_view schema_type) const777 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
778     std::string_view schema_type) const {
779   ICING_RETURN_IF_ERROR(CheckSchemaSet());
780   return schema_type_mapper_->Get(schema_type);
781 }
782 
GetSchemaType(SchemaTypeId schema_type_id) const783 libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType(
784       SchemaTypeId schema_type_id) const {
785   ICING_RETURN_IF_ERROR(CheckSchemaSet());
786   if (const auto it = reverse_schema_type_mapper_.find(schema_type_id);
787       it == reverse_schema_type_mapper_.end()) {
788     return absl_ports::InvalidArgumentError("Invalid schema type id");
789   } else {
790     return &it->second;
791   }
792 }
793 
794 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const795 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
796   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
797                          GetSchemaTypeId(schema_type));
798   auto iter = schema_subtype_id_map_.find(schema_type_id);
799   if (iter == schema_subtype_id_map_.end()) {
800     // This should never happen, unless there is an inconsistency or IO error.
801     return absl_ports::InternalError(absl_ports::StrCat(
802         "Schema type '", schema_type, "' is not found in the subtype map."));
803   }
804   return &iter->second;
805 }
806 
807 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const808 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
809                                 SectionId section_id) const {
810   ICING_RETURN_IF_ERROR(CheckSchemaSet());
811   return schema_type_manager_->section_manager().GetSectionMetadata(
812       schema_type_id, section_id);
813 }
814 
ExtractSections(const DocumentProto & document) const815 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
816     const DocumentProto& document) const {
817   ICING_RETURN_IF_ERROR(CheckSchemaSet());
818   return schema_type_manager_->section_manager().ExtractSections(document);
819 }
820 
821 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const822 SchemaStore::GetJoinablePropertyMetadata(
823     SchemaTypeId schema_type_id, const std::string& property_path) const {
824   ICING_RETURN_IF_ERROR(CheckSchemaSet());
825   return schema_type_manager_->joinable_property_manager()
826       .GetJoinablePropertyMetadata(schema_type_id, property_path);
827 }
828 
829 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const830 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
831   ICING_RETURN_IF_ERROR(CheckSchemaSet());
832   return schema_type_manager_->joinable_property_manager()
833       .ExtractJoinableProperties(document);
834 }
835 
PersistToDisk()836 libtextclassifier3::Status SchemaStore::PersistToDisk() {
837   if (!has_schema_successfully_set_) {
838     return libtextclassifier3::Status::OK;
839   }
840   ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
841   // Write the header
842   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
843   header_->set_checksum(checksum.Get());
844   return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
845 }
846 
GetStorageInfo() const847 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
848   SchemaStoreStorageInfoProto storage_info;
849   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
850   storage_info.set_schema_store_size(
851       Filesystem::SanitizeFileSize(directory_size));
852   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
853   storage_info.set_num_schema_types(schema->types_size());
854   int total_sections = 0;
855   int num_types_sections_exhausted = 0;
856   for (const SchemaTypeConfigProto& type : schema->types()) {
857     auto sections_list_or =
858         schema_type_manager_->section_manager().GetMetadataList(
859             type.schema_type());
860     if (!sections_list_or.ok()) {
861       continue;
862     }
863     total_sections += sections_list_or.ValueOrDie()->size();
864     if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
865       ++num_types_sections_exhausted;
866     }
867   }
868 
869   storage_info.set_num_total_sections(total_sections);
870   storage_info.set_num_schema_types_sections_exhausted(
871       num_types_sections_exhausted);
872   return storage_info;
873 }
874 
875 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const876 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
877   return schema_type_manager_->section_manager().GetMetadataList(schema_type);
878 }
879 
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const880 bool SchemaStore::IsPropertyDefinedInSchema(
881     SchemaTypeId schema_type_id, const std::string& property_path) const {
882   auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
883   if (schema_name_itr == reverse_schema_type_mapper_.end()) {
884     return false;
885   }
886   const std::string* current_type_name = &schema_name_itr->second;
887 
888   std::vector<std::string_view> property_path_parts =
889       property_util::SplitPropertyPathExpr(property_path);
890   for (int i = 0; i < property_path_parts.size(); ++i) {
891     auto type_config_itr = type_config_map_.find(*current_type_name);
892     if (type_config_itr == type_config_map_.end()) {
893       return false;
894     }
895     std::string_view property_name = property_path_parts.at(i);
896     const PropertyConfigProto* selected_property = nullptr;
897     for (const PropertyConfigProto& property :
898          type_config_itr->second.properties()) {
899       if (property.property_name() == property_name) {
900         selected_property = &property;
901         break;
902       }
903     }
904     if (selected_property == nullptr) {
905       return false;
906     }
907     if (i == property_path_parts.size() - 1) {
908       // We've found a property at the final part of the path.
909       return true;
910     }
911     if (selected_property->data_type() !=
912         PropertyConfigProto::DataType::DOCUMENT) {
913       // If this isn't final part of the path, but this property isn't a
914       // document, so we know that this path doesn't exist.
915       return false;
916     }
917     current_type_name = &selected_property->schema_type();
918   }
919 
920   // We should never reach this point.
921   return false;
922 }
923 
GetDebugInfo() const924 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
925     const {
926   SchemaDebugInfoProto debug_info;
927   if (has_schema_successfully_set_) {
928     ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
929     *debug_info.mutable_schema() = *schema;
930   }
931   ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
932   debug_info.set_crc(crc.Get());
933   return debug_info;
934 }
935 
936 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const937 SchemaStore::ExpandTypePropertyMasks(
938     const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
939     const {
940   std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
941   for (const TypePropertyMask& type_field_mask : type_property_masks) {
942     if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
943       ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
944                                      /*paths=*/{}};
945       entry.paths.insert(type_field_mask.paths().begin(),
946                          type_field_mask.paths().end());
947       result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
948     } else {
949       auto schema_type_ids_or =
950           GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
951       // If we can't find the SchemaTypeIds, just throw it away
952       if (!schema_type_ids_or.ok()) {
953         continue;
954       }
955       const std::unordered_set<SchemaTypeId>* schema_type_ids =
956           schema_type_ids_or.ValueOrDie();
957       for (SchemaTypeId schema_type_id : *schema_type_ids) {
958         auto schema_type_name_iter =
959             reverse_schema_type_mapper_.find(schema_type_id);
960         if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
961           // This should never happen, unless there is an inconsistency or IO
962           // error.
963           ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
964           continue;
965         }
966 
967         auto iter = result_map.find(schema_type_id);
968         if (iter == result_map.end()) {
969           ExpandedTypePropertyMask entry{schema_type_name_iter->second,
970                                          /*paths=*/{}};
971           iter = result_map.insert({schema_type_id, std::move(entry)}).first;
972         }
973         iter->second.paths.insert(type_field_mask.paths().begin(),
974                                   type_field_mask.paths().end());
975       }
976     }
977   }
978   std::vector<ExpandedTypePropertyMask> result;
979   result.reserve(result_map.size());
980   for (auto& entry : result_map) {
981     result.push_back(std::move(entry.second));
982   }
983   return result;
984 }
985 
986 }  // namespace lib
987 }  // namespace icing
988