1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/schema-store.h"
16
#include <algorithm>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/file/file-backed-proto.h"
31 #include "icing/file/filesystem.h"
32 #include "icing/proto/document.pb.h"
33 #include "icing/proto/schema.pb.h"
34 #include "icing/schema/schema-util.h"
35 #include "icing/schema/section-manager.h"
36 #include "icing/schema/section.h"
37 #include "icing/store/document-filter-data.h"
38 #include "icing/store/key-mapper.h"
39 #include "icing/util/crc32.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/status-macros.h"
42
43 namespace icing {
44 namespace lib {
45
46 namespace {
47
// Names of the files SchemaStore keeps inside its base directory.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// A KeyMapper stores its data across 3 arrays internally. Each array is given
// 128KiB of storage, so the whole KeyMapper requires 384KiB.
constexpr int32_t kKeyMapperArrayCount = 3;
constexpr int32_t kKeyMapperArrayMaxSize = 128 * 1024;  // 128 KiB
constexpr int32_t kSchemaTypeMapperMaxSize =
    kKeyMapperArrayCount * kKeyMapperArrayMaxSize;  // 384 KiB
55
MakeHeaderFilename(const std::string & base_dir)56 const std::string MakeHeaderFilename(const std::string& base_dir) {
57 return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
58 }
59
MakeSchemaFilename(const std::string & base_dir)60 const std::string MakeSchemaFilename(const std::string& base_dir) {
61 return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
62 }
63
MakeSchemaTypeMapperFilename(const std::string & base_dir)64 const std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
65 return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
66 }
67
68 // Assuming that SchemaTypeIds are assigned to schema types based on their order
69 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
70 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)71 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
72 const SchemaProto& old_schema, const SchemaProto& new_schema) {
73 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
74
75 std::unordered_map<std::string, int> old_types_and_index;
76 for (int i = 0; i < old_schema.types_size(); ++i) {
77 old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
78 }
79
80 std::unordered_map<std::string, int> new_types_and_index;
81 for (int i = 0; i < new_schema.types_size(); ++i) {
82 new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
83 }
84
85 for (const auto& old_type_index : old_types_and_index) {
86 const auto& iter = new_types_and_index.find(old_type_index.first);
87 // We only care if the type exists in both the old and new schema. If the
88 // type has been deleted, then it'll be captured in
89 // SetSchemaResult.schema_types_deleted*. If the type has been added in the
90 // new schema then we also don't care because nothing needs to be updated.
91 if (iter != new_types_and_index.end()) {
92 // Since the SchemaTypeId of the schema type is just the index of it in
93 // the SchemaProto, compare the index and save it if it's not the same
94 if (old_type_index.second != iter->second) {
95 old_schema_type_ids_changed.emplace(old_type_index.second);
96 }
97 }
98 }
99
100 return old_schema_type_ids_changed;
101 }
102
103 } // namespace
104
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,InitializeStatsProto * initialize_stats)105 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
106 const Filesystem* filesystem, const std::string& base_dir,
107 const Clock* clock, InitializeStatsProto* initialize_stats) {
108 ICING_RETURN_ERROR_IF_NULL(filesystem);
109 ICING_RETURN_ERROR_IF_NULL(clock);
110
111 std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
112 new SchemaStore(filesystem, base_dir, clock));
113 ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
114 return schema_store;
115 }
116
// Initializes the members from the given arguments. `filesystem` and `clock`
// are dereferenced unconditionally here, so both must be non-null (Create()
// checks this before constructing).
SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
                         const Clock* clock)
    : filesystem_(*filesystem),
      base_dir_(std::move(base_dir)),
      clock_(*clock),
      // Uses the member base_dir_ because the base_dir parameter has been
      // moved from above. NOTE(review): this relies on base_dir_ being
      // declared before schema_file_ in the class - confirm in the header.
      schema_file_(*filesystem, MakeSchemaFilename(base_dir_)) {}
123
~SchemaStore()124 SchemaStore::~SchemaStore() {
125 if (has_schema_successfully_set_) {
126 if (!PersistToDisk().ok()) {
127 ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
128 }
129 }
130 }
131
Initialize(InitializeStatsProto * initialize_stats)132 libtextclassifier3::Status SchemaStore::Initialize(
133 InitializeStatsProto* initialize_stats) {
134 auto schema_proto_or = GetSchema();
135 if (absl_ports::IsNotFound(schema_proto_or.status())) {
136 // Don't have an existing schema proto, that's fine
137 return libtextclassifier3::Status::OK;
138 } else if (!schema_proto_or.ok()) {
139 // Real error when trying to read the existing schema
140 return schema_proto_or.status();
141 }
142 has_schema_successfully_set_ = true;
143
144 if (!InitializeDerivedFiles().ok()) {
145 ICING_VLOG(3)
146 << "Couldn't find derived files or failed to initialize them, "
147 "regenerating derived files for SchemaStore.";
148 std::unique_ptr<Timer> regenerate_timer = clock_.GetNewTimer();
149 if (initialize_stats != nullptr) {
150 initialize_stats->set_schema_store_recovery_cause(
151 InitializeStatsProto::IO_ERROR);
152 }
153 ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
154 if (initialize_stats != nullptr) {
155 initialize_stats->set_schema_store_recovery_latency_ms(
156 regenerate_timer->GetElapsedMilliseconds());
157 }
158 }
159
160 if (initialize_stats != nullptr) {
161 initialize_stats->set_num_schema_types(type_config_map_.size());
162 }
163
164 return libtextclassifier3::Status::OK;
165 }
166
InitializeDerivedFiles()167 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
168 if (!HeaderExists()) {
169 // Without a header, we don't know if things are consistent between each
170 // other so the caller should just regenerate everything from ground truth.
171 return absl_ports::InternalError("SchemaStore header doesn't exist");
172 }
173
174 SchemaStore::Header header;
175 if (!filesystem_.Read(MakeHeaderFilename(base_dir_).c_str(), &header,
176 sizeof(header))) {
177 return absl_ports::InternalError(
178 absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
179 }
180
181 if (header.magic != SchemaStore::Header::kMagic) {
182 return absl_ports::InternalError(absl_ports::StrCat(
183 "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
184 }
185
186 ICING_ASSIGN_OR_RETURN(
187 schema_type_mapper_,
188 KeyMapper<SchemaTypeId>::Create(filesystem_,
189 MakeSchemaTypeMapperFilename(base_dir_),
190 kSchemaTypeMapperMaxSize));
191
192 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
193 if (checksum.Get() != header.checksum) {
194 return absl_ports::InternalError(
195 "Combined checksum of SchemaStore was inconsistent");
196 }
197
198 // Update our in-memory data structures
199 type_config_map_.clear();
200 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
201 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
202 // Update our type_config_map_
203 type_config_map_.emplace(type_config.schema_type(), type_config);
204 }
205 ICING_ASSIGN_OR_RETURN(
206 section_manager_,
207 SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
208
209 return libtextclassifier3::Status::OK;
210 }
211
RegenerateDerivedFiles()212 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles() {
213 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
214
215 ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
216 type_config_map_.clear();
217
218 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
219 // Update our type_config_map_
220 type_config_map_.emplace(type_config.schema_type(), type_config);
221
222 // Assign a SchemaTypeId to the type
223 ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
224 type_config.schema_type(), schema_type_mapper_->num_keys()));
225 }
226
227 ICING_ASSIGN_OR_RETURN(
228 section_manager_,
229 SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
230
231 // Write the header
232 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
233 ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
234
235 return libtextclassifier3::Status::OK;
236 }
237
HeaderExists()238 bool SchemaStore::HeaderExists() {
239 if (!filesystem_.FileExists(MakeHeaderFilename(base_dir_).c_str())) {
240 return false;
241 }
242
243 int64_t file_size =
244 filesystem_.GetFileSize(MakeHeaderFilename(base_dir_).c_str());
245
246 // If it's been truncated to size 0 before, we consider it to be a new file
247 return file_size != 0 && file_size != Filesystem::kBadFileSize;
248 }
249
UpdateHeader(const Crc32 & checksum)250 libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
251 // Write the header
252 SchemaStore::Header header;
253 header.magic = SchemaStore::Header::kMagic;
254 header.checksum = checksum.Get();
255
256 ScopedFd scoped_fd(
257 filesystem_.OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
258 // This should overwrite the header.
259 if (!scoped_fd.is_valid() ||
260 !filesystem_.Write(scoped_fd.get(), &header, sizeof(header)) ||
261 !filesystem_.DataSync(scoped_fd.get())) {
262 return absl_ports::InternalError(absl_ports::StrCat(
263 "Failed to write SchemaStore header: ", MakeHeaderFilename(base_dir_)));
264 }
265 return libtextclassifier3::Status::OK;
266 }
267
ResetSchemaTypeMapper()268 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
269 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
270 schema_type_mapper_.reset();
271 // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
272 // that can support error logging.
273 libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete(
274 filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
275 if (!status.ok()) {
276 ICING_LOG(ERROR) << status.error_message()
277 << "Failed to delete old schema_type mapper";
278 return status;
279 }
280 ICING_ASSIGN_OR_RETURN(
281 schema_type_mapper_,
282 KeyMapper<SchemaTypeId>::Create(filesystem_,
283 MakeSchemaTypeMapperFilename(base_dir_),
284 kSchemaTypeMapperMaxSize));
285
286 return libtextclassifier3::Status::OK;
287 }
288
ComputeChecksum() const289 libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
290 Crc32 total_checksum;
291 if (!has_schema_successfully_set_) {
292 // Nothing to checksum
293 return total_checksum;
294 }
295 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
296 Crc32 schema_checksum;
297 schema_checksum.Append(schema_proto->SerializeAsString());
298
299 Crc32 schema_type_mapper_checksum = schema_type_mapper_->ComputeChecksum();
300
301 total_checksum.Append(std::to_string(schema_checksum.Get()));
302 total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
303
304 return total_checksum;
305 }
306
// Returns the result of reading the schema file. Callers in this file treat a
// NOT_FOUND status from this call as "no schema has been set yet" and any
// other non-OK status as a real read error.
libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
    const {
  return schema_file_.Read();
}
311
// Copying overload: makes a copy of `new_schema` and forwards it, together
// with `ignore_errors_and_delete_documents`, to the SchemaProto&& overload.
//
// TODO(cassiewang): Consider removing this definition of SetSchema if it's not
// needed by production code. It's currently being used by our tests, but maybe
// it's trivial to change our test code to also use the
// SetSchema(SchemaProto&& new_schema)
libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SchemaStore::SetSchema(const SchemaProto& new_schema,
                       bool ignore_errors_and_delete_documents) {
  return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
}
321
322 libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents)323 SchemaStore::SetSchema(SchemaProto&& new_schema,
324 bool ignore_errors_and_delete_documents) {
325 ICING_ASSIGN_OR_RETURN(SchemaUtil::DependencyMap new_dependency_map,
326 SchemaUtil::Validate(new_schema));
327
328 SetSchemaResult result;
329
330 auto schema_proto_or = GetSchema();
331 if (absl_ports::IsNotFound(schema_proto_or.status())) {
332 // We don't have a pre-existing schema, so anything is valid.
333 result.success = true;
334 } else if (!schema_proto_or.ok()) {
335 // Real error
336 return schema_proto_or.status();
337 } else {
338 // At this point, we're guaranteed that we have a schema.
339 const SchemaProto old_schema = *schema_proto_or.ValueOrDie();
340
341 // Assume we can set the schema unless proven otherwise.
342 result.success = true;
343
344 if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
345 // Same schema as before. No need to update anything
346 return result;
347 }
348
349 // Different schema, track the differences and see if we can still write it
350 SchemaUtil::SchemaDelta schema_delta =
351 SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
352 new_dependency_map);
353
354 // An incompatible index is fine, we can just reindex
355 result.index_incompatible = schema_delta.index_incompatible;
356
357 for (const auto& schema_type : schema_delta.schema_types_deleted) {
358 // We currently don't support deletions, so mark this as not possible.
359 // This will change once we allow force-set schemas.
360 result.success = false;
361
362 result.schema_types_deleted_by_name.emplace(schema_type);
363
364 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
365 GetSchemaTypeId(schema_type));
366 result.schema_types_deleted_by_id.emplace(schema_type_id);
367 }
368
369 for (const auto& schema_type : schema_delta.schema_types_incompatible) {
370 // We currently don't support incompatible schemas, so mark this as
371 // not possible. This will change once we allow force-set schemas.
372 result.success = false;
373
374 result.schema_types_incompatible_by_name.emplace(schema_type);
375
376 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
377 GetSchemaTypeId(schema_type));
378 result.schema_types_incompatible_by_id.emplace(schema_type_id);
379 }
380
381 // SchemaTypeIds changing is fine, we can update the DocumentStore
382 result.old_schema_type_ids_changed =
383 SchemaTypeIdsChanged(old_schema, new_schema);
384 }
385
386 // We can force set the schema if the caller has told us to ignore any errors
387 result.success = result.success || ignore_errors_and_delete_documents;
388
389 if (result.success) {
390 // Write the schema (and potentially overwrite a previous schema)
391 ICING_RETURN_IF_ERROR(
392 schema_file_.Write(std::make_unique<SchemaProto>(new_schema)));
393 has_schema_successfully_set_ = true;
394
395 ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
396 }
397
398 return result;
399 }
400
401 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const402 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
403 ICING_RETURN_IF_ERROR(CheckSchemaSet());
404 const auto& type_config_iter =
405 type_config_map_.find(std::string(schema_type));
406 if (type_config_iter == type_config_map_.end()) {
407 return absl_ports::NotFoundError(
408 absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
409 }
410 return &type_config_iter->second;
411 }
412
// Looks up the SchemaTypeId stored for `schema_type` in the schema type
// mapper.
//
// Returns:
//   The error from CheckSchemaSet() if it reports one
//   Otherwise whatever schema_type_mapper_->Get() returns
libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
    std::string_view schema_type) const {
  ICING_RETURN_IF_ERROR(CheckSchemaSet());
  return schema_type_mapper_->Get(schema_type);
}
418
// Forwards to SectionManager::GetStringSectionContent for the section
// identified by `section_path` in `document`.
//
// Returns:
//   The error from CheckSchemaSet() if it reports one
//   Otherwise whatever the section manager returns
libtextclassifier3::StatusOr<std::vector<std::string_view>>
SchemaStore::GetStringSectionContent(const DocumentProto& document,
                                     std::string_view section_path) const {
  ICING_RETURN_IF_ERROR(CheckSchemaSet());
  return section_manager_->GetStringSectionContent(document, section_path);
}
425
// Forwards to SectionManager::GetStringSectionContent for the section
// identified by `section_id` in `document`.
//
// Returns:
//   The error from CheckSchemaSet() if it reports one
//   Otherwise whatever the section manager returns
libtextclassifier3::StatusOr<std::vector<std::string_view>>
SchemaStore::GetStringSectionContent(const DocumentProto& document,
                                     SectionId section_id) const {
  ICING_RETURN_IF_ERROR(CheckSchemaSet());
  return section_manager_->GetStringSectionContent(document, section_id);
}
432
// Forwards to SectionManager::GetSectionMetadata for the given schema type id
// and section id.
//
// Returns:
//   The error from CheckSchemaSet() if it reports one
//   Otherwise whatever the section manager returns
libtextclassifier3::StatusOr<const SectionMetadata*>
SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
                                SectionId section_id) const {
  ICING_RETURN_IF_ERROR(CheckSchemaSet());
  return section_manager_->GetSectionMetadata(schema_type_id, section_id);
}
439
// Forwards to SectionManager::ExtractSections for `document`.
//
// Returns:
//   The error from CheckSchemaSet() if it reports one
//   Otherwise whatever the section manager returns
libtextclassifier3::StatusOr<std::vector<Section>> SchemaStore::ExtractSections(
    const DocumentProto& document) const {
  ICING_RETURN_IF_ERROR(CheckSchemaSet());
  return section_manager_->ExtractSections(document);
}
445
PersistToDisk()446 libtextclassifier3::Status SchemaStore::PersistToDisk() {
447 if (!has_schema_successfully_set_) {
448 return libtextclassifier3::Status::OK;
449 }
450 ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
451 // Write the header
452 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
453 ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
454
455 return libtextclassifier3::Status::OK;
456 }
457
GetStorageInfo() const458 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
459 SchemaStoreStorageInfoProto storage_info;
460 int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str());
461 if (directory_size != Filesystem::kBadFileSize) {
462 storage_info.set_schema_store_size(directory_size);
463 } else {
464 storage_info.set_schema_store_size(-1);
465 }
466 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
467 storage_info.set_num_schema_types(schema->types_size());
468 int total_sections = 0;
469 int num_types_sections_exhausted = 0;
470 for (const SchemaTypeConfigProto& type : schema->types()) {
471 auto sections_list_or =
472 section_manager_->GetMetadataList(type.schema_type());
473 if (!sections_list_or.ok()) {
474 continue;
475 }
476 total_sections += sections_list_or.ValueOrDie()->size();
477 if (sections_list_or.ValueOrDie()->size() == kMaxSectionId + 1) {
478 ++num_types_sections_exhausted;
479 }
480 }
481
482 storage_info.set_num_total_sections(total_sections);
483 storage_info.set_num_schema_types_sections_exhausted(
484 num_types_sections_exhausted);
485 return storage_info;
486 }
487
488 } // namespace lib
489 } // namespace icing
490