1 // Copyright (C) 2023 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_H_ 16 #define ICING_INDEX_NUMERIC_INTEGER_INDEX_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <unordered_map> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/file/file-backed-proto.h" 27 #include "icing/file/filesystem.h" 28 #include "icing/file/memory-mapped-file.h" 29 #include "icing/index/numeric/integer-index-storage.h" 30 #include "icing/index/numeric/numeric-index.h" 31 #include "icing/index/numeric/posting-list-integer-index-serializer.h" 32 #include "icing/index/numeric/wildcard-property-storage.pb.h" 33 #include "icing/schema/schema-store.h" 34 #include "icing/store/document-id.h" 35 #include "icing/store/document-store.h" 36 #include "icing/util/crc32.h" 37 38 namespace icing { 39 namespace lib { 40 41 // IntegerIndex: a wrapper class for managing IntegerIndexStorage (a lower level 42 // persistent storage class for indexing and searching contents of integer type 43 // sections in documents) instances for different property paths. 44 // We separate indexable integer data from different properties into different 45 // storages, and IntegerIndex manages and handles indexable integer data 46 // appropriately to their corresponding IntegerIndexStorage instance according 47 // to the given property path. 48 class IntegerIndex : public NumericIndex<int64_t> { 49 public: 50 using PropertyToStorageMapType = 51 std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>; 52 53 // Maximum number of individual property storages that this index will allow 54 // before falling back to placing hits for any new properties into the 55 // 'wildcard' storage. 56 static constexpr int kMaxPropertyStorages = 32; 57 58 static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = 59 IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit; 60 61 struct Info { 62 static constexpr int32_t kMagic = 0x5d8a1e8a; 63 64 int32_t magic; 65 DocumentId last_added_document_id; 66 int32_t num_data_threshold_for_bucket_split; 67 ComputeChecksumInfo68 Crc32 ComputeChecksum() const { 69 return Crc32( 70 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 71 } 72 } __attribute__((packed)); 73 static_assert(sizeof(Info) == 12, ""); 74 75 // Metadata file layout: <Crcs><Info> 76 static constexpr int32_t kCrcsMetadataFileOffset = 0; 77 static constexpr int32_t kInfoMetadataFileOffset = 78 static_cast<int32_t>(sizeof(Crcs)); 79 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 80 static_assert(kMetadataFileSize == 24, ""); 81 82 static constexpr WorkingPathType kWorkingPathType = 83 WorkingPathType::kDirectory; 84 static constexpr std::string_view kFilePrefix = "integer_index"; 85 86 // Creates a new IntegerIndex instance to index integers. If any of the 87 // underlying file is missing, then delete the whole working_path and 88 // (re)initialize with new ones. Otherwise initialize and create the instance 89 // by existing files. 90 // 91 // filesystem: Object to make system level calls 92 // working_path: Specifies the working path for PersistentStorage. 93 // IntegerIndex uses working path as working directory and all 94 // related files will be stored under this directory. See 95 // PersistentStorage for more details about the concept of 96 // working_path. 97 // num_data_threshold_for_bucket_split: see IntegerIndexStorage::Options for 98 // more details. 99 // pre_mapping_fbv: flag indicating whether memory map max possible file size 100 // for underlying FileBackedVector before growing the actual 101 // file size. 102 // 103 // Returns: 104 // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 105 // checksum. 106 // - INTERNAL_ERROR on I/O errors. 107 // - Any FileBackedVector/MemoryMappedFile errors. 108 static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> Create( 109 const Filesystem& filesystem, std::string working_path, 110 int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv); 111 112 // Deletes IntegerIndex under working_path. 113 // 114 // Returns: 115 // - OK on success 116 // - INTERNAL_ERROR on I/O error Discard(const Filesystem & filesystem,const std::string & working_path)117 static libtextclassifier3::Status Discard(const Filesystem& filesystem, 118 const std::string& working_path) { 119 return PersistentStorage::Discard(filesystem, working_path, 120 kWorkingPathType); 121 } 122 123 ~IntegerIndex() override; 124 125 // Returns an Editor instance for adding new records into integer index for a 126 // given property, DocumentId and SectionId. See Editor for more details. Edit(std::string_view property_path,DocumentId document_id,SectionId section_id)127 std::unique_ptr<typename NumericIndex<int64_t>::Editor> Edit( 128 std::string_view property_path, DocumentId document_id, 129 SectionId section_id) override { 130 return std::make_unique<Editor>(property_path, document_id, section_id, 131 *this, num_data_threshold_for_bucket_split_, 132 pre_mapping_fbv_); 133 } 134 135 // Returns a DocHitInfoIterator for iterating through all docs which have the 136 // specified (integer) property contents in range [query_key_lower, 137 // query_key_upper]. 138 // When iterating through all relevant doc hits, it: 139 // - Merges multiple SectionIds of doc hits with same DocumentId into a single 140 // SectionIdMask and constructs DocHitInfo. 141 // - Returns DocHitInfo in descending DocumentId order. 142 // 143 // Returns: 144 // - On success: a DocHitInfoIterator instance 145 // - NOT_FOUND_ERROR if the given property_path doesn't exist 146 // - Any IntegerIndexStorage errors 147 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( 148 std::string_view property_path, int64_t key_lower, int64_t key_upper, 149 const DocumentStore& document_store, const SchemaStore& schema_store, 150 int64_t current_time_ms) const override; 151 152 // Reduces internal file sizes by reclaiming space and ids of deleted 153 // documents. Integer index will convert all data (hits) to the new document 154 // ids and regenerate all index files. If all data in a property path are 155 // completely deleted, then the underlying storage will be discarded as well. 156 // 157 // - document_id_old_to_new: a map for converting old document id to new 158 // document id. 159 // - new_last_added_document_id: will be used to update the last added 160 // document id in the integer index. 161 // 162 // Returns: 163 // - OK on success 164 // - INTERNAL_ERROR on IO error 165 libtextclassifier3::Status Optimize( 166 const std::vector<DocumentId>& document_id_old_to_new, 167 DocumentId new_last_added_document_id) override; 168 169 // Clears all integer index data by discarding all existing storages, and set 170 // last_added_document_id to kInvalidDocumentId. 171 // 172 // Returns: 173 // - OK on success 174 // - INTERNAL_ERROR on I/O error 175 libtextclassifier3::Status Clear() override; 176 last_added_document_id()177 DocumentId last_added_document_id() const override { 178 return info().last_added_document_id; 179 } 180 set_last_added_document_id(DocumentId document_id)181 void set_last_added_document_id(DocumentId document_id) override { 182 SetInfoDirty(); 183 184 Info& info_ref = info(); 185 if (info_ref.last_added_document_id == kInvalidDocumentId || 186 document_id > info_ref.last_added_document_id) { 187 info_ref.last_added_document_id = document_id; 188 } 189 } 190 num_property_indices()191 int num_property_indices() const override { 192 return property_to_storage_map_.size() + 193 ((wildcard_index_storage_ == nullptr) ? 0 : 1); 194 } 195 196 private: 197 class Editor : public NumericIndex<int64_t>::Editor { 198 public: Editor(std::string_view property_path,DocumentId document_id,SectionId section_id,IntegerIndex & integer_index,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)199 explicit Editor(std::string_view property_path, DocumentId document_id, 200 SectionId section_id, IntegerIndex& integer_index, 201 int32_t num_data_threshold_for_bucket_split, 202 bool pre_mapping_fbv) 203 : NumericIndex<int64_t>::Editor(property_path, document_id, section_id), 204 integer_index_(integer_index), 205 num_data_threshold_for_bucket_split_( 206 num_data_threshold_for_bucket_split), 207 pre_mapping_fbv_(pre_mapping_fbv) {} 208 209 ~Editor() override = default; 210 BufferKey(int64_t key)211 libtextclassifier3::Status BufferKey(int64_t key) override { 212 seen_keys_.push_back(key); 213 return libtextclassifier3::Status::OK; 214 } 215 216 libtextclassifier3::Status IndexAllBufferedKeys() && override; 217 218 private: 219 // Vector for caching all seen keys. Since IntegerIndexStorage::AddKeys 220 // sorts and dedupes keys, we can just simply use vector here and move it to 221 // AddKeys(). 222 std::vector<int64_t> seen_keys_; 223 224 IntegerIndex& integer_index_; // Does not own. 225 226 int32_t num_data_threshold_for_bucket_split_; 227 228 // Flag indicating whether memory map max possible file size for underlying 229 // FileBackedVector before growing the actual file size. 230 bool pre_mapping_fbv_; 231 }; 232 IntegerIndex(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,PropertyToStorageMapType && property_to_storage_map,std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> wildcard_property_storage,std::unordered_set<std::string> wildcard_properties_set,std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)233 explicit IntegerIndex( 234 const Filesystem& filesystem, std::string&& working_path, 235 std::unique_ptr<PostingListIntegerIndexSerializer> 236 posting_list_serializer, 237 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, 238 PropertyToStorageMapType&& property_to_storage_map, 239 std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> 240 wildcard_property_storage, 241 std::unordered_set<std::string> wildcard_properties_set, 242 std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage, 243 int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) 244 : NumericIndex<int64_t>(filesystem, std::move(working_path), 245 kWorkingPathType), 246 posting_list_serializer_(std::move(posting_list_serializer)), 247 metadata_mmapped_file_(std::move(metadata_mmapped_file)), 248 property_to_storage_map_(std::move(property_to_storage_map)), 249 wildcard_property_storage_(std::move(wildcard_property_storage)), 250 wildcard_properties_set_(std::move(wildcard_properties_set)), 251 wildcard_index_storage_(std::move(wildcard_index_storage)), 252 num_data_threshold_for_bucket_split_( 253 num_data_threshold_for_bucket_split), 254 pre_mapping_fbv_(pre_mapping_fbv), 255 is_info_dirty_(false), 256 is_storage_dirty_(false) {} 257 258 static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> 259 InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, 260 int32_t num_data_threshold_for_bucket_split, 261 bool pre_mapping_fbv); 262 263 static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> 264 InitializeExistingFiles(const Filesystem& filesystem, 265 std::string&& working_path, 266 int32_t num_data_threshold_for_bucket_split, 267 bool pre_mapping_fbv); 268 269 // Adds the property path to the list of properties using wildcard storage. 270 // This will both update the in-memory list (wildcard_properties_set_) and 271 // the persistent list (wilcard_property_storage_). 272 // 273 // RETURNS: 274 // - OK on success 275 // - INTERNAL_ERROR if unable to successfully persist updated properties 276 // list in wildcard_property_storage_. 277 libtextclassifier3::Status AddPropertyToWildcardStorage( 278 const std::string& property_path); 279 280 // Transfers integer index data from the current integer index to 281 // new_integer_index. 282 // 283 // Returns: 284 // - OK on success 285 // - INTERNAL_ERROR on I/O error. This could potentially leave the storages 286 // in an invalid state and the caller should handle it properly (e.g. 287 // discard and rebuild) 288 libtextclassifier3::Status TransferIndex( 289 const std::vector<DocumentId>& document_id_old_to_new, 290 IntegerIndex* new_integer_index) const; 291 292 // Transfers integer index data from old_storage to new_integer_index. 293 // 294 // Returns: 295 // - OK on success 296 // - INTERNAL_ERROR on I/O error. This could potentially leave the storages 297 // in an invalid state and the caller should handle it properly (e.g. 298 // discard and rebuild) 299 libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> 300 TransferIntegerIndexStorage( 301 const std::vector<DocumentId>& document_id_old_to_new, 302 const IntegerIndexStorage* old_storage, const std::string& property_path, 303 IntegerIndex* new_integer_index) const; 304 305 // Transfers the persistent and in-memory list of properties using the 306 // wildcard storage from old_storage to new_integer_index. 307 // 308 // RETURNS: 309 // - OK on success 310 // - INTERNAL_ERROR if unable to successfully persist updated properties 311 // list in new_integer_index. 312 libtextclassifier3::Status TransferWildcardStorage( 313 IntegerIndex* new_integer_index) const; 314 315 // Flushes contents of all storages to underlying files. 316 // 317 // Returns: 318 // - OK on success 319 // - INTERNAL_ERROR on I/O error 320 libtextclassifier3::Status PersistStoragesToDisk(bool force) override; 321 322 // Flushes contents of metadata file. 323 // 324 // Returns: 325 // - OK on success 326 // - INTERNAL_ERROR on I/O error 327 libtextclassifier3::Status PersistMetadataToDisk(bool force) override; 328 329 // Computes and returns Info checksum. 330 // 331 // Returns: 332 // - Crc of the Info on success 333 libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; 334 335 // Computes and returns all storages checksum. Checksums of (storage_crc, 336 // property_path) for all existing property paths will be combined together by 337 // XOR. 338 // 339 // Returns: 340 // - Crc of all storages on success 341 // - INTERNAL_ERROR if any data inconsistency 342 libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( 343 bool force) override; 344 crcs()345 Crcs& crcs() override { 346 return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + 347 kCrcsMetadataFileOffset); 348 } 349 crcs()350 const Crcs& crcs() const override { 351 return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() + 352 kCrcsMetadataFileOffset); 353 } 354 info()355 Info& info() { 356 return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + 357 kInfoMetadataFileOffset); 358 } 359 info()360 const Info& info() const { 361 return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + 362 kInfoMetadataFileOffset); 363 } 364 SetInfoDirty()365 void SetInfoDirty() { is_info_dirty_ = true; } 366 // When storage is dirty, we have to set info dirty as well. So just expose 367 // SetDirty to set both. SetDirty()368 void SetDirty() { 369 is_info_dirty_ = true; 370 is_storage_dirty_ = true; 371 } 372 is_info_dirty()373 bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()374 bool is_storage_dirty() const { return is_storage_dirty_; } 375 376 std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer_; 377 378 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; 379 380 // Property path to integer index storage map. 381 PropertyToStorageMapType property_to_storage_map_; 382 383 // Persistent list of properties that have added content to 384 // wildcard_index_storage_. 385 std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> 386 wildcard_property_storage_; 387 388 // In-memory list of properties that have added content to 389 // wildcard_index_storage_. 390 std::unordered_set<std::string> wildcard_properties_set_; 391 392 // The index storage that is used once we have already created 393 // kMaxPropertyStorages in property_to_storage_map. 394 std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_; 395 396 int32_t num_data_threshold_for_bucket_split_; 397 398 // Flag indicating whether memory map max possible file size for underlying 399 // FileBackedVector before growing the actual file size. 400 bool pre_mapping_fbv_; 401 402 bool is_info_dirty_; 403 bool is_storage_dirty_; 404 }; 405 406 } // namespace lib 407 } // namespace icing 408 409 #endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_H_ 410