• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/schema/schema-store.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/file/file-backed-proto.h"
31 #include "icing/file/filesystem.h"
32 #include "icing/proto/document.pb.h"
33 #include "icing/proto/schema.pb.h"
34 #include "icing/schema/schema-util.h"
35 #include "icing/schema/section-manager.h"
36 #include "icing/schema/section.h"
37 #include "icing/store/document-filter-data.h"
38 #include "icing/store/key-mapper.h"
39 #include "icing/util/crc32.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/status-macros.h"
42 
43 namespace icing {
44 namespace lib {
45 
46 namespace {
47 
// Names of the files SchemaStore keeps inside its base directory.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// A KeyMapper stores its data across 3 arrays internally. Giving each array
// 128KiB for storage means the entire KeyMapper requires 384KiB.
constexpr int32_t kKeyMapperArrayCount = 3;
constexpr int32_t kKeyMapperArrayMaxSize = 128 * 1024;  // 128 KiB
constexpr int32_t kSchemaTypeMapperMaxSize =
    kKeyMapperArrayCount * kKeyMapperArrayMaxSize;  // 384 KiB
55 
MakeHeaderFilename(const std::string & base_dir)56 const std::string MakeHeaderFilename(const std::string& base_dir) {
57   return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
58 }
59 
MakeSchemaFilename(const std::string & base_dir)60 const std::string MakeSchemaFilename(const std::string& base_dir) {
61   return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
62 }
63 
MakeSchemaTypeMapperFilename(const std::string & base_dir)64 const std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
65   return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
66 }
67 
68 // Assuming that SchemaTypeIds are assigned to schema types based on their order
69 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
70 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)71 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
72     const SchemaProto& old_schema, const SchemaProto& new_schema) {
73   std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
74 
75   std::unordered_map<std::string, int> old_types_and_index;
76   for (int i = 0; i < old_schema.types_size(); ++i) {
77     old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
78   }
79 
80   std::unordered_map<std::string, int> new_types_and_index;
81   for (int i = 0; i < new_schema.types_size(); ++i) {
82     new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
83   }
84 
85   for (const auto& old_type_index : old_types_and_index) {
86     const auto& iter = new_types_and_index.find(old_type_index.first);
87     // We only care if the type exists in both the old and new schema. If the
88     // type has been deleted, then it'll be captured in
89     // SetSchemaResult.schema_types_deleted*. If the type has been added in the
90     // new schema then we also don't care because nothing needs to be updated.
91     if (iter != new_types_and_index.end()) {
92       // Since the SchemaTypeId of the schema type is just the index of it in
93       // the SchemaProto, compare the index and save it if it's not the same
94       if (old_type_index.second != iter->second) {
95         old_schema_type_ids_changed.emplace(old_type_index.second);
96       }
97     }
98   }
99 
100   return old_schema_type_ids_changed;
101 }
102 
103 }  // namespace
104 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,InitializeStatsProto * initialize_stats)105 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
106     const Filesystem* filesystem, const std::string& base_dir,
107     const Clock* clock, InitializeStatsProto* initialize_stats) {
108   ICING_RETURN_ERROR_IF_NULL(filesystem);
109   ICING_RETURN_ERROR_IF_NULL(clock);
110 
111   std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
112       new SchemaStore(filesystem, base_dir, clock));
113   ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
114   return schema_store;
115 }
116 
SchemaStore(const Filesystem * filesystem,std::string base_dir,const Clock * clock)117 SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
118                          const Clock* clock)
119     : filesystem_(*filesystem),
120       base_dir_(std::move(base_dir)),
121       clock_(*clock),
122       schema_file_(*filesystem, MakeSchemaFilename(base_dir_)) {}
123 
~SchemaStore()124 SchemaStore::~SchemaStore() {
125   if (has_schema_successfully_set_) {
126     if (!PersistToDisk().ok()) {
127       ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
128     }
129   }
130 }
131 
Initialize(InitializeStatsProto * initialize_stats)132 libtextclassifier3::Status SchemaStore::Initialize(
133     InitializeStatsProto* initialize_stats) {
134   auto schema_proto_or = GetSchema();
135   if (absl_ports::IsNotFound(schema_proto_or.status())) {
136     // Don't have an existing schema proto, that's fine
137     return libtextclassifier3::Status::OK;
138   } else if (!schema_proto_or.ok()) {
139     // Real error when trying to read the existing schema
140     return schema_proto_or.status();
141   }
142   has_schema_successfully_set_ = true;
143 
144   if (!InitializeDerivedFiles().ok()) {
145     ICING_VLOG(3)
146         << "Couldn't find derived files or failed to initialize them, "
147            "regenerating derived files for SchemaStore.";
148     std::unique_ptr<Timer> regenerate_timer = clock_.GetNewTimer();
149     if (initialize_stats != nullptr) {
150       initialize_stats->set_schema_store_recovery_cause(
151           InitializeStatsProto::IO_ERROR);
152     }
153     ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
154     if (initialize_stats != nullptr) {
155       initialize_stats->set_schema_store_recovery_latency_ms(
156           regenerate_timer->GetElapsedMilliseconds());
157     }
158   }
159 
160   if (initialize_stats != nullptr) {
161     initialize_stats->set_num_schema_types(type_config_map_.size());
162   }
163 
164   return libtextclassifier3::Status::OK;
165 }
166 
InitializeDerivedFiles()167 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
168   if (!HeaderExists()) {
169     // Without a header, we don't know if things are consistent between each
170     // other so the caller should just regenerate everything from ground truth.
171     return absl_ports::InternalError("SchemaStore header doesn't exist");
172   }
173 
174   SchemaStore::Header header;
175   if (!filesystem_.Read(MakeHeaderFilename(base_dir_).c_str(), &header,
176                         sizeof(header))) {
177     return absl_ports::InternalError(
178         absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
179   }
180 
181   if (header.magic != SchemaStore::Header::kMagic) {
182     return absl_ports::InternalError(absl_ports::StrCat(
183         "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
184   }
185 
186   ICING_ASSIGN_OR_RETURN(
187       schema_type_mapper_,
188       KeyMapper<SchemaTypeId>::Create(filesystem_,
189                                       MakeSchemaTypeMapperFilename(base_dir_),
190                                       kSchemaTypeMapperMaxSize));
191 
192   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
193   if (checksum.Get() != header.checksum) {
194     return absl_ports::InternalError(
195         "Combined checksum of SchemaStore was inconsistent");
196   }
197 
198   // Update our in-memory data structures
199   type_config_map_.clear();
200   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
201   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
202     // Update our type_config_map_
203     type_config_map_.emplace(type_config.schema_type(), type_config);
204   }
205   ICING_ASSIGN_OR_RETURN(
206       section_manager_,
207       SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
208 
209   return libtextclassifier3::Status::OK;
210 }
211 
RegenerateDerivedFiles()212 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles() {
213   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
214 
215   ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
216   type_config_map_.clear();
217 
218   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
219     // Update our type_config_map_
220     type_config_map_.emplace(type_config.schema_type(), type_config);
221 
222     // Assign a SchemaTypeId to the type
223     ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
224         type_config.schema_type(), schema_type_mapper_->num_keys()));
225   }
226 
227   ICING_ASSIGN_OR_RETURN(
228       section_manager_,
229       SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
230 
231   // Write the header
232   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
233   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
234 
235   return libtextclassifier3::Status::OK;
236 }
237 
HeaderExists()238 bool SchemaStore::HeaderExists() {
239   if (!filesystem_.FileExists(MakeHeaderFilename(base_dir_).c_str())) {
240     return false;
241   }
242 
243   int64_t file_size =
244       filesystem_.GetFileSize(MakeHeaderFilename(base_dir_).c_str());
245 
246   // If it's been truncated to size 0 before, we consider it to be a new file
247   return file_size != 0 && file_size != Filesystem::kBadFileSize;
248 }
249 
UpdateHeader(const Crc32 & checksum)250 libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
251   // Write the header
252   SchemaStore::Header header;
253   header.magic = SchemaStore::Header::kMagic;
254   header.checksum = checksum.Get();
255 
256   ScopedFd scoped_fd(
257       filesystem_.OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
258   // This should overwrite the header.
259   if (!scoped_fd.is_valid() ||
260       !filesystem_.Write(scoped_fd.get(), &header, sizeof(header)) ||
261       !filesystem_.DataSync(scoped_fd.get())) {
262     return absl_ports::InternalError(absl_ports::StrCat(
263         "Failed to write SchemaStore header: ", MakeHeaderFilename(base_dir_)));
264   }
265   return libtextclassifier3::Status::OK;
266 }
267 
ResetSchemaTypeMapper()268 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
269   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
270   schema_type_mapper_.reset();
271   // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
272   // that can support error logging.
273   libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete(
274       filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
275   if (!status.ok()) {
276     ICING_LOG(ERROR) << status.error_message()
277                      << "Failed to delete old schema_type mapper";
278     return status;
279   }
280   ICING_ASSIGN_OR_RETURN(
281       schema_type_mapper_,
282       KeyMapper<SchemaTypeId>::Create(filesystem_,
283                                       MakeSchemaTypeMapperFilename(base_dir_),
284                                       kSchemaTypeMapperMaxSize));
285 
286   return libtextclassifier3::Status::OK;
287 }
288 
ComputeChecksum() const289 libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
290   Crc32 total_checksum;
291   if (!has_schema_successfully_set_) {
292     // Nothing to checksum
293     return total_checksum;
294   }
295   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
296   Crc32 schema_checksum;
297   schema_checksum.Append(schema_proto->SerializeAsString());
298 
299   Crc32 schema_type_mapper_checksum = schema_type_mapper_->ComputeChecksum();
300 
301   total_checksum.Append(std::to_string(schema_checksum.Get()));
302   total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
303 
304   return total_checksum;
305 }
306 
GetSchema() const307 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
308     const {
309   return schema_file_.Read();
310 }
311 
312 // TODO(cassiewang): Consider removing this definition of SetSchema if it's not
313 // needed by production code. It's currently being used by our tests, but maybe
314 // it's trivial to change our test code to also use the
315 // SetSchema(SchemaProto&& new_schema)
316 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents)317 SchemaStore::SetSchema(const SchemaProto& new_schema,
318                        bool ignore_errors_and_delete_documents) {
319   return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
320 }
321 
322 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents)323 SchemaStore::SetSchema(SchemaProto&& new_schema,
324                        bool ignore_errors_and_delete_documents) {
325   ICING_ASSIGN_OR_RETURN(SchemaUtil::DependencyMap new_dependency_map,
326                          SchemaUtil::Validate(new_schema));
327 
328   SetSchemaResult result;
329 
330   auto schema_proto_or = GetSchema();
331   if (absl_ports::IsNotFound(schema_proto_or.status())) {
332     // We don't have a pre-existing schema, so anything is valid.
333     result.success = true;
334   } else if (!schema_proto_or.ok()) {
335     // Real error
336     return schema_proto_or.status();
337   } else {
338     // At this point, we're guaranteed that we have a schema.
339     const SchemaProto old_schema = *schema_proto_or.ValueOrDie();
340 
341     // Assume we can set the schema unless proven otherwise.
342     result.success = true;
343 
344     if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
345       // Same schema as before. No need to update anything
346       return result;
347     }
348 
349     // Different schema, track the differences and see if we can still write it
350     SchemaUtil::SchemaDelta schema_delta =
351         SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
352                                               new_dependency_map);
353 
354     // An incompatible index is fine, we can just reindex
355     result.index_incompatible = schema_delta.index_incompatible;
356 
357     for (const auto& schema_type : schema_delta.schema_types_deleted) {
358       // We currently don't support deletions, so mark this as not possible.
359       // This will change once we allow force-set schemas.
360       result.success = false;
361 
362       result.schema_types_deleted_by_name.emplace(schema_type);
363 
364       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
365                              GetSchemaTypeId(schema_type));
366       result.schema_types_deleted_by_id.emplace(schema_type_id);
367     }
368 
369     for (const auto& schema_type : schema_delta.schema_types_incompatible) {
370       // We currently don't support incompatible schemas, so mark this as
371       // not possible. This will change once we allow force-set schemas.
372       result.success = false;
373 
374       result.schema_types_incompatible_by_name.emplace(schema_type);
375 
376       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
377                              GetSchemaTypeId(schema_type));
378       result.schema_types_incompatible_by_id.emplace(schema_type_id);
379     }
380 
381     // SchemaTypeIds changing is fine, we can update the DocumentStore
382     result.old_schema_type_ids_changed =
383         SchemaTypeIdsChanged(old_schema, new_schema);
384   }
385 
386   // We can force set the schema if the caller has told us to ignore any errors
387   result.success = result.success || ignore_errors_and_delete_documents;
388 
389   if (result.success) {
390     // Write the schema (and potentially overwrite a previous schema)
391     ICING_RETURN_IF_ERROR(
392         schema_file_.Write(std::make_unique<SchemaProto>(new_schema)));
393     has_schema_successfully_set_ = true;
394 
395     ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
396   }
397 
398   return result;
399 }
400 
401 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const402 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
403   ICING_RETURN_IF_ERROR(CheckSchemaSet());
404   const auto& type_config_iter =
405       type_config_map_.find(std::string(schema_type));
406   if (type_config_iter == type_config_map_.end()) {
407     return absl_ports::NotFoundError(
408         absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
409   }
410   return &type_config_iter->second;
411 }
412 
GetSchemaTypeId(std::string_view schema_type) const413 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
414     std::string_view schema_type) const {
415   ICING_RETURN_IF_ERROR(CheckSchemaSet());
416   return schema_type_mapper_->Get(schema_type);
417 }
418 
419 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetStringSectionContent(const DocumentProto & document,std::string_view section_path) const420 SchemaStore::GetStringSectionContent(const DocumentProto& document,
421                                      std::string_view section_path) const {
422   ICING_RETURN_IF_ERROR(CheckSchemaSet());
423   return section_manager_->GetStringSectionContent(document, section_path);
424 }
425 
426 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetStringSectionContent(const DocumentProto & document,SectionId section_id) const427 SchemaStore::GetStringSectionContent(const DocumentProto& document,
428                                      SectionId section_id) const {
429   ICING_RETURN_IF_ERROR(CheckSchemaSet());
430   return section_manager_->GetStringSectionContent(document, section_id);
431 }
432 
433 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const434 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
435                                 SectionId section_id) const {
436   ICING_RETURN_IF_ERROR(CheckSchemaSet());
437   return section_manager_->GetSectionMetadata(schema_type_id, section_id);
438 }
439 
ExtractSections(const DocumentProto & document) const440 libtextclassifier3::StatusOr<std::vector<Section>> SchemaStore::ExtractSections(
441     const DocumentProto& document) const {
442   ICING_RETURN_IF_ERROR(CheckSchemaSet());
443   return section_manager_->ExtractSections(document);
444 }
445 
PersistToDisk()446 libtextclassifier3::Status SchemaStore::PersistToDisk() {
447   if (!has_schema_successfully_set_) {
448     return libtextclassifier3::Status::OK;
449   }
450   ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
451   // Write the header
452   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
453   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
454 
455   return libtextclassifier3::Status::OK;
456 }
457 
GetStorageInfo() const458 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
459   SchemaStoreStorageInfoProto storage_info;
460   int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str());
461   if (directory_size != Filesystem::kBadFileSize) {
462     storage_info.set_schema_store_size(directory_size);
463   } else {
464     storage_info.set_schema_store_size(-1);
465   }
466   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
467   storage_info.set_num_schema_types(schema->types_size());
468   int total_sections = 0;
469   int num_types_sections_exhausted = 0;
470   for (const SchemaTypeConfigProto& type : schema->types()) {
471     auto sections_list_or =
472         section_manager_->GetMetadataList(type.schema_type());
473     if (!sections_list_or.ok()) {
474       continue;
475     }
476     total_sections += sections_list_or.ValueOrDie()->size();
477     if (sections_list_or.ValueOrDie()->size() == kMaxSectionId + 1) {
478       ++num_types_sections_exhausted;
479     }
480   }
481 
482   storage_info.set_num_total_sections(total_sections);
483   storage_info.set_num_schema_types_sections_exhausted(
484       num_types_sections_exhausted);
485   return storage_info;
486 }
487 
488 }  // namespace lib
489 }  // namespace icing
490