• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/schema/schema-store.h"
16 
17 #include <algorithm>
18 #include <cinttypes>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/destructible-directory.h"
34 #include "icing/file/file-backed-proto.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/version-util.h"
37 #include "icing/proto/debug.pb.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/logging.pb.h"
40 #include "icing/proto/schema.pb.h"
41 #include "icing/proto/search.pb.h"
42 #include "icing/proto/storage.pb.h"
43 #include "icing/schema/backup-schema-producer.h"
44 #include "icing/schema/joinable-property.h"
45 #include "icing/schema/property-util.h"
46 #include "icing/schema/schema-type-manager.h"
47 #include "icing/schema/schema-util.h"
48 #include "icing/schema/section.h"
49 #include "icing/store/document-filter-data.h"
50 #include "icing/store/dynamic-trie-key-mapper.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/logging.h"
53 #include "icing/util/status-macros.h"
54 
55 namespace icing {
56 namespace lib {
57 
58 namespace {
59 
// File names that get joined onto the schema store's base directory by the
// Make*Filename() helpers.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Each
// array is given 128KiB of storage, so the whole DynamicTrieKeyMapper needs
// 3 * 128KiB = 384KiB.
constexpr int32_t kSchemaTypeMapperArrayCount = 3;
constexpr int32_t kSchemaTypeMapperBytesPerArray = 128 * 1024;  // 128 KiB
constexpr int32_t kSchemaTypeMapperMaxSize =
    kSchemaTypeMapperArrayCount * kSchemaTypeMapperBytesPerArray;  // 384 KiB
69 
MakeHeaderFilename(const std::string & base_dir)70 std::string MakeHeaderFilename(const std::string& base_dir) {
71   return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
72 }
73 
MakeSchemaFilename(const std::string & base_dir)74 std::string MakeSchemaFilename(const std::string& base_dir) {
75   return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
76 }
77 
MakeOverlaySchemaFilename(const std::string & base_dir)78 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
79   return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
80 }
81 
MakeSchemaTypeMapperFilename(const std::string & base_dir)82 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
83   return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
84 }
85 
86 // Assuming that SchemaTypeIds are assigned to schema types based on their order
87 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
88 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)89 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
90     const SchemaProto& old_schema, const SchemaProto& new_schema) {
91   std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
92 
93   std::unordered_map<std::string, int> old_types_and_index;
94   for (int i = 0; i < old_schema.types_size(); ++i) {
95     old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
96   }
97 
98   std::unordered_map<std::string, int> new_types_and_index;
99   for (int i = 0; i < new_schema.types_size(); ++i) {
100     new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
101   }
102 
103   for (const auto& old_type_index : old_types_and_index) {
104     const auto& iter = new_types_and_index.find(old_type_index.first);
105     // We only care if the type exists in both the old and new schema. If the
106     // type has been deleted, then it'll be captured in
107     // SetSchemaResult.schema_types_deleted*. If the type has been added in the
108     // new schema then we also don't care because nothing needs to be updated.
109     if (iter != new_types_and_index.end()) {
110       // Since the SchemaTypeId of the schema type is just the index of it in
111       // the SchemaProto, compare the index and save it if it's not the same
112       if (old_type_index.second != iter->second) {
113         old_schema_type_ids_changed.emplace(old_type_index.second);
114       }
115     }
116   }
117 
118   return old_schema_type_ids_changed;
119 }
120 
121 }  // namespace
122 
123 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,const std::string & path)124 SchemaStore::Header::Read(const Filesystem* filesystem,
125                           const std::string& path) {
126   Header header;
127   ScopedFd sfd(filesystem->OpenForRead(path.c_str()));
128   if (!sfd.is_valid()) {
129     return absl_ports::NotFoundError("SchemaStore header doesn't exist");
130   }
131 
132   // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
133   int64_t file_size = filesystem->GetFileSize(sfd.get());
134   if (file_size == sizeof(LegacyHeader)) {
135     LegacyHeader legacy_header;
136     if (!filesystem->Read(path.c_str(), &legacy_header,
137                           sizeof(legacy_header))) {
138       return absl_ports::InternalError(
139           absl_ports::StrCat("Couldn't read: ", path));
140     }
141     if (legacy_header.magic != Header::kMagic) {
142       return absl_ports::InternalError(
143           absl_ports::StrCat("Invalid header kMagic for file: ", path));
144     }
145     header.set_checksum(legacy_header.checksum);
146   } else if (file_size == sizeof(Header)) {
147     if (!filesystem->Read(path.c_str(), &header, sizeof(header))) {
148       return absl_ports::InternalError(
149           absl_ports::StrCat("Couldn't read: ", path));
150     }
151     if (header.magic() != Header::kMagic) {
152       return absl_ports::InternalError(
153           absl_ports::StrCat("Invalid header kMagic for file: ", path));
154     }
155   } else {
156     int legacy_header_size = sizeof(LegacyHeader);
157     int header_size = sizeof(Header);
158     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
159         "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
160         legacy_header_size, header_size));
161   }
162   return header;
163 }
164 
Write(const Filesystem * filesystem,const std::string & path)165 libtextclassifier3::Status SchemaStore::Header::Write(
166     const Filesystem* filesystem, const std::string& path) {
167   ScopedFd scoped_fd(filesystem->OpenForWrite(path.c_str()));
168   // This should overwrite the header.
169   if (!scoped_fd.is_valid() ||
170       !filesystem->Write(scoped_fd.get(), this, sizeof(*this)) ||
171       !filesystem->DataSync(scoped_fd.get())) {
172     return absl_ports::InternalError(
173         absl_ports::StrCat("Failed to write SchemaStore header: ", path));
174   }
175   return libtextclassifier3::Status::OK;
176 }
177 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,InitializeStatsProto * initialize_stats)178 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
179     const Filesystem* filesystem, const std::string& base_dir,
180     const Clock* clock, InitializeStatsProto* initialize_stats) {
181   ICING_RETURN_ERROR_IF_NULL(filesystem);
182   ICING_RETURN_ERROR_IF_NULL(clock);
183 
184   if (!filesystem->DirectoryExists(base_dir.c_str())) {
185     return absl_ports::FailedPreconditionError(
186         "Schema store base directory does not exist!");
187   }
188   std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
189       new SchemaStore(filesystem, base_dir, clock));
190   ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
191   return schema_store;
192 }
193 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,SchemaProto schema)194 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
195     const Filesystem* filesystem, const std::string& base_dir,
196     const Clock* clock, SchemaProto schema) {
197   ICING_RETURN_ERROR_IF_NULL(filesystem);
198   ICING_RETURN_ERROR_IF_NULL(clock);
199 
200   if (!filesystem->DirectoryExists(base_dir.c_str())) {
201     return absl_ports::FailedPreconditionError(
202         "Schema store base directory does not exist!");
203   }
204   std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
205       new SchemaStore(filesystem, base_dir, clock));
206   ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
207   return schema_store;
208 }
209 
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)210 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
211     const Filesystem* filesystem, const std::string& base_dir, Header& header) {
212   std::string header_filename = MakeHeaderFilename(base_dir);
213   if (header.overlay_created()) {
214     header.SetOverlayInfo(
215         /*overlay_created=*/false,
216         /*min_overlay_version_compatibility=*/ std::numeric_limits<
217             int32_t>::max());
218     ICING_RETURN_IF_ERROR(header.Write(filesystem, header_filename));
219   }
220   std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
221   if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
222     return absl_ports::InternalError(
223         "Unable to delete stale schema overlay file.");
224   }
225   return libtextclassifier3::Status::OK;
226 }
227 
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)228 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
229     const Filesystem* filesystem, const std::string& base_dir,
230     version_util::StateChange version_state_change, int32_t new_version) {
231   if (!filesystem->DirectoryExists(base_dir.c_str())) {
232     // Situations when schema store directory doesn't exist:
233     // - Initializing new Icing instance: don't have to do anything now. The
234     //   directory will be created later.
235     // - Lose schema store: there is nothing we can do now. The logic will be
236     //   handled later by initializing.
237     //
238     // Therefore, just simply return OK here.
239     return libtextclassifier3::Status::OK;
240   }
241 
242   std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
243   if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
244     // The overlay doesn't exist. So there should be nothing particularly
245     // interesting to worry about.
246     return libtextclassifier3::Status::OK;
247   }
248 
249   std::string header_filename = MakeHeaderFilename(base_dir);
250   libtextclassifier3::StatusOr<Header> header_or;
251   switch (version_state_change) {
252     // No necessary actions for normal upgrades or no version change. The data
253     // that was produced by the previous version is fully compatible with this
254     // version and there's no stale data for us to clean up.
255     // The same is true for a normal rollforward. A normal rollforward implies
256     // that the previous version was one that understood the concept of the
257     // overlay schema and would have already discarded it if it was unusable.
258     case version_util::StateChange::kVersionZeroUpgrade:
259       // fallthrough
260     case version_util::StateChange::kUpgrade:
261       // fallthrough
262     case version_util::StateChange::kRollForward:
263       // fallthrough
264     case version_util::StateChange::kCompatible:
265       return libtextclassifier3::Status::OK;
266     case version_util::StateChange::kVersionZeroRollForward:
267       // We've rolled forward. The schema overlay file, if it exists, is
268       // possibly stale. We must throw it out.
269       header_or = Header::Read(filesystem, header_filename);
270       if (!header_or.ok()) {
271         return header_or.status();
272       }
273       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
274                                                header_or.ValueOrDie());
275     case version_util::StateChange::kRollBack:
276       header_or = Header::Read(filesystem, header_filename);
277       if (!header_or.ok()) {
278         return header_or.status();
279       }
280       if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
281           new_version) {
282         // We've been rolled back, but the overlay schema claims that it
283         // supports this version. So we can safely return.
284         return libtextclassifier3::Status::OK;
285       }
286       // We've been rolled back to a version that the overlay schema doesn't
287       // support. We must throw it out.
288       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
289                                                header_or.ValueOrDie());
290     case version_util::StateChange::kUndetermined:
291       // It's not clear what version we're on, but the base schema should always
292       // be safe to use. Throw out the overlay.
293       header_or = Header::Read(filesystem, header_filename);
294       if (!header_or.ok()) {
295         return header_or.status();
296       }
297       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
298                                                header_or.ValueOrDie());
299   }
300   return libtextclassifier3::Status::OK;
301 }
302 
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)303 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
304     const Filesystem* filesystem, const std::string& base_dir) {
305   // Schema type mapper
306   return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
307       *filesystem, MakeSchemaTypeMapperFilename(base_dir));
308 }
309 
SchemaStore(const Filesystem * filesystem,std::string base_dir,const Clock * clock)310 SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
311                          const Clock* clock)
312     : filesystem_(filesystem),
313       base_dir_(std::move(base_dir)),
314       clock_(clock),
315       schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
316           *filesystem, MakeSchemaFilename(base_dir_))) {}
317 
~SchemaStore()318 SchemaStore::~SchemaStore() {
319   if (has_schema_successfully_set_ && schema_file_ != nullptr &&
320       schema_type_mapper_ != nullptr && schema_type_manager_ != nullptr) {
321     if (!PersistToDisk().ok()) {
322       ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
323     }
324   }
325 }
326 
Initialize(SchemaProto new_schema)327 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
328   ICING_RETURN_IF_ERROR(LoadSchema());
329   if (!absl_ports::IsNotFound(GetSchema().status())) {
330     return absl_ports::FailedPreconditionError(
331         "Incorrectly tried to initialize schema store with a new schema, when "
332         "one is already set!");
333   }
334   ICING_RETURN_IF_ERROR(schema_file_->Write(
335       std::make_unique<SchemaProto>(std::move(new_schema))));
336   return InitializeInternal(/*create_overlay_if_necessary=*/true,
337                             /*initialize_stats=*/nullptr);
338 }
339 
Initialize(InitializeStatsProto * initialize_stats)340 libtextclassifier3::Status SchemaStore::Initialize(
341     InitializeStatsProto* initialize_stats) {
342   ICING_RETURN_IF_ERROR(LoadSchema());
343   auto schema_proto_or = GetSchema();
344   if (absl_ports::IsNotFound(schema_proto_or.status())) {
345     // Don't have an existing schema proto, that's fine
346     return libtextclassifier3::Status::OK;
347   } else if (!schema_proto_or.ok()) {
348     // Real error when trying to read the existing schema
349     return schema_proto_or.status();
350   }
351   return InitializeInternal(/*create_overlay_if_necessary=*/false,
352                             initialize_stats);
353 }
354 
LoadSchema()355 libtextclassifier3::Status SchemaStore::LoadSchema() {
356   libtextclassifier3::StatusOr<Header> header_or =
357       Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
358   bool header_exists = false;
359   if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
360     return header_or.status();
361   } else if (!header_or.ok()) {
362     header_ = std::make_unique<Header>();
363   } else {
364     header_exists = true;
365     header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
366   }
367 
368   std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
369   bool overlay_schema_file_exists =
370       filesystem_->FileExists(overlay_schema_filename.c_str());
371 
372   libtextclassifier3::Status base_schema_state = schema_file_->Read().status();
373   if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
374     return base_schema_state;
375   }
376 
377   // There are three valid cases:
378   // 1. Everything is missing. This is an empty schema store.
379   if (!base_schema_state.ok() && !overlay_schema_file_exists &&
380       !header_exists) {
381     return libtextclassifier3::Status::OK;
382   }
383 
384   // 2. There never was a overlay schema. The header exists, the base schema
385   //    exists and the header says the overlay schema shouldn't exist
386   if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
387       !header_->overlay_created()) {
388     // Nothing else to do. Just return safely.
389     return libtextclassifier3::Status::OK;
390   }
391 
392   // 3. There is an overlay schema and a base schema and a header. The header
393   // says that the overlay schema should exist.
394   if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
395       header_->overlay_created()) {
396     overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
397         *filesystem_, MakeOverlaySchemaFilename(base_dir_));
398     return libtextclassifier3::Status::OK;
399   }
400 
401   // Something has gone wrong. We've lost part of the schema ground truth.
402   // Return an error.
403   bool overlay_created = header_->overlay_created();
404   bool base_schema_exists = base_schema_state.ok();
405   return absl_ports::InternalError(IcingStringUtil::StringPrintf(
406       "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
407       "base schema exists: %d, overlay_schema_exists: %d",
408       header_exists, overlay_created, base_schema_exists,
409       overlay_schema_file_exists));
410 }
411 
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)412 libtextclassifier3::Status SchemaStore::InitializeInternal(
413     bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
414   if (!InitializeDerivedFiles().ok()) {
415     ICING_VLOG(3)
416         << "Couldn't find derived files or failed to initialize them, "
417            "regenerating derived files for SchemaStore.";
418     std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
419     if (initialize_stats != nullptr) {
420       initialize_stats->set_schema_store_recovery_cause(
421           InitializeStatsProto::IO_ERROR);
422     }
423     ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
424     if (initialize_stats != nullptr) {
425       initialize_stats->set_schema_store_recovery_latency_ms(
426           regenerate_timer->GetElapsedMilliseconds());
427     }
428   }
429 
430   if (initialize_stats != nullptr) {
431     initialize_stats->set_num_schema_types(type_config_map_.size());
432   }
433   has_schema_successfully_set_ = true;
434 
435   return libtextclassifier3::Status::OK;
436 }
437 
InitializeDerivedFiles()438 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
439   ICING_ASSIGN_OR_RETURN(
440       schema_type_mapper_,
441       DynamicTrieKeyMapper<SchemaTypeId>::Create(
442           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
443           kSchemaTypeMapperMaxSize));
444 
445   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
446   if (checksum.Get() != header_->checksum()) {
447     return absl_ports::InternalError(
448         "Combined checksum of SchemaStore was inconsistent");
449   }
450 
451   BuildInMemoryCache();
452   return libtextclassifier3::Status::OK;
453 }
454 
RegenerateDerivedFiles(bool create_overlay_if_necessary)455 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
456     bool create_overlay_if_necessary) {
457   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
458 
459   ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
460 
461   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
462     // Assign a SchemaTypeId to the type
463     ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
464         type_config.schema_type(), schema_type_mapper_->num_keys()));
465   }
466   BuildInMemoryCache();
467 
468   if (create_overlay_if_necessary) {
469     ICING_ASSIGN_OR_RETURN(
470         BackupSchemaProducer producer,
471         BackupSchemaProducer::Create(*schema_proto,
472                                      schema_type_manager_->section_manager()));
473 
474     if (producer.is_backup_necessary()) {
475       SchemaProto base_schema = std::move(producer).Produce();
476 
477       // The overlay schema should be written to the overlay file location.
478       overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
479           *filesystem_, MakeOverlaySchemaFilename(base_dir_));
480       auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
481       ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
482 
483       // The base schema should be written to the original file
484       auto base_schema_ptr =
485           std::make_unique<SchemaProto>(std::move(base_schema));
486       ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr)));
487 
488       header_->SetOverlayInfo(
489           /*overlay_created=*/true,
490           /*min_overlay_version_compatibility=*/version_util::kVersionOne);
491       // Rebuild in memory data - references to the old schema will be invalid
492       // now.
493       BuildInMemoryCache();
494     }
495   }
496 
497   // Write the header
498   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
499   header_->set_checksum(checksum.Get());
500   return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
501 }
502 
BuildInMemoryCache()503 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
504   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
505   ICING_ASSIGN_OR_RETURN(
506       SchemaUtil::InheritanceMap inheritance_map,
507       SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
508 
509   reverse_schema_type_mapper_.clear();
510   type_config_map_.clear();
511   schema_subtype_id_map_.clear();
512   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
513     std::string_view type_name = type_config.schema_type();
514     ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
515                            schema_type_mapper_->Get(type_name));
516 
517     // Build reverse_schema_type_mapper_
518     reverse_schema_type_mapper_.insert({type_id, std::string(type_name)});
519 
520     // Build type_config_map_
521     type_config_map_.insert({std::string(type_name), type_config});
522 
523     // Build schema_subtype_id_map_
524     std::unordered_set<SchemaTypeId>& subtype_id_set =
525         schema_subtype_id_map_[type_id];
526     // Find all child types
527     auto child_types_names = inheritance_map.find(type_name);
528     if (child_types_names != inheritance_map.end()) {
529       subtype_id_set.reserve(child_types_names->second.size() + 1);
530       for (const auto& [child_type_name, is_direct_child] :
531            child_types_names->second) {
532         ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
533                                schema_type_mapper_->Get(child_type_name));
534         subtype_id_set.insert(child_type_id);
535       }
536     }
537     // Every type is a subtype of itself.
538     subtype_id_set.insert(type_id);
539   }
540 
541   // Build schema_type_manager_
542   ICING_ASSIGN_OR_RETURN(
543       schema_type_manager_,
544       SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
545   return libtextclassifier3::Status::OK;
546 }
547 
ResetSchemaTypeMapper()548 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
549   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
550   schema_type_mapper_.reset();
551   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
552   // that can support error logging.
553   libtextclassifier3::Status status =
554       DynamicTrieKeyMapper<SchemaTypeId>::Delete(
555           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
556   if (!status.ok()) {
557     ICING_LOG(ERROR) << status.error_message()
558                      << "Failed to delete old schema_type mapper";
559     return status;
560   }
561   ICING_ASSIGN_OR_RETURN(
562       schema_type_mapper_,
563       DynamicTrieKeyMapper<SchemaTypeId>::Create(
564           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
565           kSchemaTypeMapperMaxSize));
566 
567   return libtextclassifier3::Status::OK;
568 }
569 
ComputeChecksum() const570 libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
571   // Base schema checksum
572   auto schema_proto_or = schema_file_->Read();
573   if (absl_ports::IsNotFound(schema_proto_or.status())) {
574     return Crc32();
575   }
576   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, schema_proto_or);
577   Crc32 schema_checksum;
578   schema_checksum.Append(schema_proto->SerializeAsString());
579 
580   Crc32 overlay_schema_checksum;
581   if (overlay_schema_file_ != nullptr) {
582     auto schema_proto_or = schema_file_->Read();
583     if (schema_proto_or.ok()) {
584       ICING_ASSIGN_OR_RETURN(schema_proto, schema_proto_or);
585       overlay_schema_checksum.Append(schema_proto->SerializeAsString());
586     }
587   }
588 
589   ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
590                          schema_type_mapper_->ComputeChecksum());
591 
592   Crc32 total_checksum;
593   total_checksum.Append(std::to_string(schema_checksum.Get()));
594   if (overlay_schema_file_ != nullptr) {
595     total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
596   }
597   total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
598 
599   return total_checksum;
600 }
601 
GetSchema() const602 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
603     const {
604   if (overlay_schema_file_ != nullptr) {
605     return overlay_schema_file_->Read();
606   }
607   return schema_file_->Read();
608 }
609 
610 // TODO(cassiewang): Consider removing this definition of SetSchema if it's not
611 // needed by production code. It's currently being used by our tests, but maybe
612 // it's trivial to change our test code to also use the
613 // SetSchema(SchemaProto&& new_schema)
614 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)615 SchemaStore::SetSchema(const SchemaProto& new_schema,
616                        bool ignore_errors_and_delete_documents,
617                        bool allow_circular_schema_definitions) {
618   return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents,
619                    allow_circular_schema_definitions);
620 }
621 
622 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)623 SchemaStore::SetSchema(SchemaProto&& new_schema,
624                        bool ignore_errors_and_delete_documents,
625                        bool allow_circular_schema_definitions) {
626   ICING_ASSIGN_OR_RETURN(
627       SchemaUtil::DependentMap new_dependent_map,
628       SchemaUtil::Validate(new_schema, allow_circular_schema_definitions));
629 
630   SetSchemaResult result;
631 
632   auto schema_proto_or = GetSchema();
633   if (absl_ports::IsNotFound(schema_proto_or.status())) {
634     // We don't have a pre-existing schema, so anything is valid.
635     result.success = true;
636     for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
637       result.schema_types_new_by_name.insert(type_config.schema_type());
638     }
639   } else if (!schema_proto_or.ok()) {
640     // Real error
641     return schema_proto_or.status();
642   } else {
643     // At this point, we're guaranteed that we have a schema.
644     const SchemaProto old_schema = *schema_proto_or.ValueOrDie();
645 
646     // Assume we can set the schema unless proven otherwise.
647     result.success = true;
648 
649     if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
650       // Same schema as before. No need to update anything
651       return result;
652     }
653 
654     // Different schema, track the differences and see if we can still write it
655     SchemaUtil::SchemaDelta schema_delta =
656         SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
657                                               new_dependent_map);
658 
659     result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
660     result.schema_types_changed_fully_compatible_by_name =
661         std::move(schema_delta.schema_types_changed_fully_compatible);
662     result.schema_types_index_incompatible_by_name =
663         std::move(schema_delta.schema_types_index_incompatible);
664     result.schema_types_join_incompatible_by_name =
665         std::move(schema_delta.schema_types_join_incompatible);
666 
667     for (const auto& schema_type : schema_delta.schema_types_deleted) {
668       // We currently don't support deletions, so mark this as not possible.
669       // This will change once we allow force-set schemas.
670       result.success = false;
671 
672       result.schema_types_deleted_by_name.emplace(schema_type);
673 
674       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
675                              GetSchemaTypeId(schema_type));
676       result.schema_types_deleted_by_id.emplace(schema_type_id);
677     }
678 
679     for (const auto& schema_type : schema_delta.schema_types_incompatible) {
680       // We currently don't support incompatible schemas, so mark this as
681       // not possible. This will change once we allow force-set schemas.
682       result.success = false;
683 
684       result.schema_types_incompatible_by_name.emplace(schema_type);
685 
686       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
687                              GetSchemaTypeId(schema_type));
688       result.schema_types_incompatible_by_id.emplace(schema_type_id);
689     }
690 
691     // SchemaTypeIds changing is fine, we can update the DocumentStore
692     result.old_schema_type_ids_changed =
693         SchemaTypeIdsChanged(old_schema, new_schema);
694   }
695 
696   // We can force set the schema if the caller has told us to ignore any errors
697   result.success = result.success || ignore_errors_and_delete_documents;
698 
699   if (result.success) {
700     ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(new_schema)));
701     has_schema_successfully_set_ = true;
702   }
703 
704   return result;
705 }
706 
ApplySchemaChange(SchemaProto new_schema)707 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
708     SchemaProto new_schema) {
709   // We need to ensure that we either 1) successfully set the schema and
710   // update all derived data structures or 2) fail and leave the schema store
711   // unchanged.
712   // So, first, we create an empty temporary directory to build a new schema
713   // store in.
714   std::string temp_schema_store_dir_path = base_dir_ + "_temp";
715   if (!filesystem_->DeleteDirectoryRecursively(
716           temp_schema_store_dir_path.c_str())) {
717     ICING_LOG(ERROR) << "Recursively deleting "
718                      << temp_schema_store_dir_path.c_str();
719     return absl_ports::InternalError(
720         "Unable to delete temp directory to prepare to build new schema "
721         "store.");
722   }
723 
724   DestructibleDirectory temp_schema_store_dir(
725       filesystem_, std::move(temp_schema_store_dir_path));
726   if (!temp_schema_store_dir.is_valid()) {
727     return absl_ports::InternalError(
728         "Unable to create temp directory to build new schema store.");
729   }
730 
731   // Then we create our new schema store with the new schema.
732   ICING_ASSIGN_OR_RETURN(
733       std::unique_ptr<SchemaStore> new_schema_store,
734       SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
735                           std::move(new_schema)));
736 
737   // Then we swap the new schema file + new derived files with the old files.
738   if (!filesystem_->SwapFiles(base_dir_.c_str(),
739                               temp_schema_store_dir.dir().c_str())) {
740     return absl_ports::InternalError(
741         "Unable to apply new schema due to failed swap!");
742   }
743 
744   std::string old_base_dir = std::move(base_dir_);
745   *this = std::move(*new_schema_store);
746 
747   // After the std::move, the filepaths saved in this instance and in the
748   // schema_file_ instance will still be the one from temp_schema_store_dir
749   // even though they now point to files that are within old_base_dir.
750   // Manually set them to the correct paths.
751   base_dir_ = std::move(old_base_dir);
752   schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
753   if (overlay_schema_file_ != nullptr) {
754     overlay_schema_file_->SetSwappedFilepath(
755         MakeOverlaySchemaFilename(base_dir_));
756   }
757 
758   return libtextclassifier3::Status::OK;
759 }
760 
761 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const762 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
763   ICING_RETURN_IF_ERROR(CheckSchemaSet());
764   const auto& type_config_iter =
765       type_config_map_.find(std::string(schema_type));
766   if (type_config_iter == type_config_map_.end()) {
767     return absl_ports::NotFoundError(
768         absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
769   }
770   return &type_config_iter->second;
771 }
772 
GetSchemaTypeId(std::string_view schema_type) const773 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
774     std::string_view schema_type) const {
775   ICING_RETURN_IF_ERROR(CheckSchemaSet());
776   return schema_type_mapper_->Get(schema_type);
777 }
778 
779 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const780 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
781   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
782                          GetSchemaTypeId(schema_type));
783   auto iter = schema_subtype_id_map_.find(schema_type_id);
784   if (iter == schema_subtype_id_map_.end()) {
785     // This should never happen, unless there is an inconsistency or IO error.
786     return absl_ports::InternalError(absl_ports::StrCat(
787         "Schema type '", schema_type, "' is not found in the subtype map."));
788   }
789   return &iter->second;
790 }
791 
792 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const793 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
794                                 SectionId section_id) const {
795   ICING_RETURN_IF_ERROR(CheckSchemaSet());
796   return schema_type_manager_->section_manager().GetSectionMetadata(
797       schema_type_id, section_id);
798 }
799 
ExtractSections(const DocumentProto & document) const800 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
801     const DocumentProto& document) const {
802   ICING_RETURN_IF_ERROR(CheckSchemaSet());
803   return schema_type_manager_->section_manager().ExtractSections(document);
804 }
805 
806 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const807 SchemaStore::GetJoinablePropertyMetadata(
808     SchemaTypeId schema_type_id, const std::string& property_path) const {
809   ICING_RETURN_IF_ERROR(CheckSchemaSet());
810   return schema_type_manager_->joinable_property_manager()
811       .GetJoinablePropertyMetadata(schema_type_id, property_path);
812 }
813 
814 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const815 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
816   ICING_RETURN_IF_ERROR(CheckSchemaSet());
817   return schema_type_manager_->joinable_property_manager()
818       .ExtractJoinableProperties(document);
819 }
820 
PersistToDisk()821 libtextclassifier3::Status SchemaStore::PersistToDisk() {
822   if (!has_schema_successfully_set_) {
823     return libtextclassifier3::Status::OK;
824   }
825   ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
826   // Write the header
827   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
828   header_->set_checksum(checksum.Get());
829   return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
830 }
831 
GetStorageInfo() const832 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
833   SchemaStoreStorageInfoProto storage_info;
834   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
835   storage_info.set_schema_store_size(
836       Filesystem::SanitizeFileSize(directory_size));
837   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
838   storage_info.set_num_schema_types(schema->types_size());
839   int total_sections = 0;
840   int num_types_sections_exhausted = 0;
841   for (const SchemaTypeConfigProto& type : schema->types()) {
842     auto sections_list_or =
843         schema_type_manager_->section_manager().GetMetadataList(
844             type.schema_type());
845     if (!sections_list_or.ok()) {
846       continue;
847     }
848     total_sections += sections_list_or.ValueOrDie()->size();
849     if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
850       ++num_types_sections_exhausted;
851     }
852   }
853 
854   storage_info.set_num_total_sections(total_sections);
855   storage_info.set_num_schema_types_sections_exhausted(
856       num_types_sections_exhausted);
857   return storage_info;
858 }
859 
860 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const861 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
862   return schema_type_manager_->section_manager().GetMetadataList(schema_type);
863 }
864 
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const865 bool SchemaStore::IsPropertyDefinedInSchema(
866     SchemaTypeId schema_type_id, const std::string& property_path) const {
867   auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
868   if (schema_name_itr == reverse_schema_type_mapper_.end()) {
869     return false;
870   }
871   const std::string* current_type_name = &schema_name_itr->second;
872 
873   std::vector<std::string_view> property_path_parts =
874       property_util::SplitPropertyPathExpr(property_path);
875   for (int i = 0; i < property_path_parts.size(); ++i) {
876     auto type_config_itr = type_config_map_.find(*current_type_name);
877     if (type_config_itr == type_config_map_.end()) {
878       return false;
879     }
880     std::string_view property_name = property_path_parts.at(i);
881     const PropertyConfigProto* selected_property = nullptr;
882     for (const PropertyConfigProto& property :
883          type_config_itr->second.properties()) {
884       if (property.property_name() == property_name) {
885         selected_property = &property;
886         break;
887       }
888     }
889     if (selected_property == nullptr) {
890       return false;
891     }
892     if (i == property_path_parts.size() - 1) {
893       // We've found a property at the final part of the path.
894       return true;
895     }
896     if (selected_property->data_type() !=
897         PropertyConfigProto::DataType::DOCUMENT) {
898       // If this isn't final part of the path, but this property isn't a
899       // document, so we know that this path doesn't exist.
900       return false;
901     }
902     current_type_name = &selected_property->schema_type();
903   }
904 
905   // We should never reach this point.
906   return false;
907 }
908 
GetDebugInfo() const909 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
910     const {
911   SchemaDebugInfoProto debug_info;
912   if (has_schema_successfully_set_) {
913     ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
914     *debug_info.mutable_schema() = *schema;
915   }
916   ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
917   debug_info.set_crc(crc.Get());
918   return debug_info;
919 }
920 
921 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const922 SchemaStore::ExpandTypePropertyMasks(
923     const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
924     const {
925   std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
926   for (const TypePropertyMask& type_field_mask : type_property_masks) {
927     if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
928       ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
929                                      /*paths=*/{}};
930       entry.paths.insert(type_field_mask.paths().begin(),
931                          type_field_mask.paths().end());
932       result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
933     } else {
934       auto schema_type_ids_or =
935           GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
936       // If we can't find the SchemaTypeIds, just throw it away
937       if (!schema_type_ids_or.ok()) {
938         continue;
939       }
940       const std::unordered_set<SchemaTypeId>* schema_type_ids =
941           schema_type_ids_or.ValueOrDie();
942       for (SchemaTypeId schema_type_id : *schema_type_ids) {
943         auto schema_type_name_iter =
944             reverse_schema_type_mapper_.find(schema_type_id);
945         if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
946           // This should never happen, unless there is an inconsistency or IO
947           // error.
948           ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
949           continue;
950         }
951 
952         auto iter = result_map.find(schema_type_id);
953         if (iter == result_map.end()) {
954           ExpandedTypePropertyMask entry{schema_type_name_iter->second,
955                                          /*paths=*/{}};
956           iter = result_map.insert({schema_type_id, std::move(entry)}).first;
957         }
958         iter->second.paths.insert(type_field_mask.paths().begin(),
959                                   type_field_mask.paths().end());
960       }
961     }
962   }
963   std::vector<ExpandedTypePropertyMask> result;
964   result.reserve(result_map.size());
965   for (auto& entry : result_map) {
966     result.push_back(std::move(entry.second));
967   }
968   return result;
969 }
970 
971 }  // namespace lib
972 }  // namespace icing
973