1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_ 16 #define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <vector> 22 23 #include "icing/text_classifier/lib3/utils/base/status.h" 24 #include "icing/text_classifier/lib3/utils/base/statusor.h" 25 #include "icing/file/filesystem.h" 26 #include "icing/file/posting_list/flash-index-storage-header.h" 27 #include "icing/file/posting_list/index-block.h" 28 #include "icing/file/posting_list/posting-list-identifier.h" 29 #include "icing/file/posting_list/posting-list-used.h" 30 #include "icing/proto/debug.pb.h" 31 #include "icing/store/document-id.h" 32 33 namespace icing { 34 namespace lib { 35 36 // PostingListHolder: group PostingListUsed, id, and some other useful info for 37 // callers. 38 struct PostingListHolder { 39 // PostingListUsed owns an in-memory posting list data buffer. The data being 40 // interpreted is initialized via PRead from the storage. As such, we should 41 // sync it to disk after modifying it. 42 PostingListUsed posting_list; 43 44 // The PostingListIdentifier, which identifies both the block index and the 45 // posting list index on that block, is also returned for convenience. 46 PostingListIdentifier id; 47 48 // Next block index is also returned for convenience. If PostingListUsed is a 49 // max-sized posting list, then the caller has to use this value to handle 50 // chained max-sized posting list blocks. 51 uint32_t next_block_index; 52 PostingListHolderPostingListHolder53 explicit PostingListHolder(PostingListUsed&& posting_list_in, 54 PostingListIdentifier id_in, 55 uint32_t next_block_index_in) 56 : posting_list(std::move(posting_list_in)), 57 id(id_in), 58 next_block_index(next_block_index_in) {} 59 }; 60 61 // The FlashIndexStorage class manages the actual file that makes up blocks for 62 // posting lists. It allocates IndexBlocks as needed and maintains freelists to 63 // prevent excessive block fragmentation. 64 // 65 // It maintains two types of free lists: 66 // 1. On-disk, Header free list - This free list is stored in the Header 67 // block. There is a free list for every possible posting list size. Each 68 // entry for a posting list size contains the block_index of the 69 // IndexBlock that starts the free list chain. Each IndexBlock in the free 70 // list chain stores the index of the next IndexBlock in the chain. 71 // 2. In-memory free list - Like the Header free list, there is a free list of 72 // every possible posting list size. This free list contains not just the 73 // block_index of the available IndexBlock, but also the posting_list_index 74 // of the available PostingListUsed within the IndexBlock. This is because, 75 // unlike the Header free list, PostingListUseds are not actually freed 76 // when added to this free list. 77 // 78 // Whether or not the in-memory free list is used can be chosen via the 79 // in_memory param to the Create factory function. 80 // 81 // The advantage of using the in-memory free list is that it reduces the amount 82 // of flash writes made while editing the index (because actually freeing the 83 // PostingLists would require writing to that flash block). The disadvantage is 84 // that it introduces code complexity and potentially leaks blocks if power is 85 // lost or if FlashIndexStorage is destroyed before emptying the free list. 86 class FlashIndexStorage { 87 public: 88 // Creates a FlashIndexStorage at index_filename. in_memory determines whether 89 // or not the FlashIndexStorage maintains an in-memory freelist in order to 90 // avoid writes to the on-disk freelist. 91 // 92 // RETURNS: 93 // - On success, a valid instance of FlashIndexStorage 94 // - FAILED_PRECONDITION_ERROR if filesystem or serializer is null 95 // - INTERNAL_ERROR if unable to create a new header or read the existing 96 // one from disk. 97 static libtextclassifier3::StatusOr<FlashIndexStorage> Create( 98 std::string index_filename, const Filesystem* filesystem, 99 PostingListSerializer* serializer, bool in_memory = true); 100 101 // Reads magic from existing file header. We need this during Icing 102 // initialization phase to determine the version. 103 // 104 // RETURNS: 105 // - On success, a valid magic 106 // - FAILED_PRECONDITION_ERROR if filesystem is null 107 // - NOT_FOUND_ERROR if the flash index file doesn't exist 108 // - INTERNAL_ERROR on I/O error 109 static libtextclassifier3::StatusOr<int> ReadHeaderMagic( 110 const Filesystem* filesystem, const std::string& index_filename); 111 112 FlashIndexStorage(FlashIndexStorage&&) = default; 113 FlashIndexStorage(const FlashIndexStorage&) = delete; 114 FlashIndexStorage& operator=(FlashIndexStorage&&) = default; 115 FlashIndexStorage& operator=(const FlashIndexStorage&) = delete; 116 117 ~FlashIndexStorage(); 118 119 // Selects block size to use. 120 static uint32_t SelectBlockSize(); 121 122 // Retrieves the PostingList referred to by PostingListIdentifier. This 123 // posting list must have been previously allocated by a prior call to 124 // AllocatePostingList. 125 // 126 // RETURNS: 127 // - On success, a valid instance of PostingListHolder containing the 128 // requested PostingListUsed. 129 // - Any IndexBlock errors 130 libtextclassifier3::StatusOr<PostingListHolder> GetPostingList( 131 PostingListIdentifier id) const; 132 133 // Allocates and returns a PostingListHolder containing a PostingListUsed that 134 // can fit min_posting_list_bytes. 135 // 136 // RETURNS: 137 // - On success, a valid instance of PostingListHolder containing the 138 // requested PostingListUsed. 139 // - INVALID_ARGUMENT_ERROR if min_posting_list_bytes > 140 // max_posting_list_bytes() 141 // - RESOURCE_EXHAUSTED_ERROR if unable to grow the index to create a 142 // PostingListUsed of the requested size. 143 // - Any IndexBlock errors 144 libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList( 145 uint32_t min_posting_list_bytes); 146 147 // Allocates a new IndexBlock with a single max-sized PostingListUsed. This 148 // chains index blocks by setting the next_block_index field of this new 149 // block's header to be prev_block_index and returns a PostingListHolder 150 // containing a max-sized PostingListUsed. 151 // 152 // RETURNS: 153 // - On success, a valid instance of PostingListHolder containing the 154 // requested PostingListUsed. 155 // - RESOURCE_EXHAUSTED_ERROR if unable to grow the index to create a 156 // PostingListUsed of max size 157 // - Any IndexBlock errors 158 libtextclassifier3::StatusOr<PostingListHolder> 159 AllocateAndChainMaxSizePostingList(uint32_t prev_block_index); 160 161 // Frees the PostingListUsed that this holder holds. 162 // 163 // RETURNS: 164 // - OK on success 165 // - Any IndexBlock errors 166 libtextclassifier3::Status FreePostingList(PostingListHolder&& holder); 167 168 // Writes back the PostingListUsed that this holder holds to disk. 169 // 170 // RETURNS: 171 // - OK on success 172 // - Any IndexBlock errors 173 libtextclassifier3::Status WritePostingListToDisk( 174 const PostingListHolder& holder); 175 176 // Discards all existing data by deleting the existing file and 177 // re-initializing a new one. 178 // 179 // RETURNS: 180 // - OK on success 181 // - INTERNAL_ERROR if unable to delete existing files or initialize a new 182 // file with header 183 libtextclassifier3::Status Reset(); 184 185 // Used to track the largest docid indexed in the index. get_last_indexed_docid()186 DocumentId get_last_indexed_docid() const { 187 return header_block_->header()->last_indexed_docid; 188 } set_last_indexed_docid(DocumentId docid)189 void set_last_indexed_docid(DocumentId docid) { 190 header_block_->header()->last_indexed_docid = docid; 191 } 192 193 // Updates the header and persists all changes to the index to disk. Returns 194 // true on success. 195 bool PersistToDisk(); 196 197 // Returns the size of the index file in bytes. GetDiskUsage()198 int64_t GetDiskUsage() const { 199 return filesystem_->GetDiskUsage(storage_sfd_.get()); 200 } 201 202 // Returns the size of the index file used to contains data. GetElementsSize()203 uint64_t GetElementsSize() const { 204 // Element size is the same as disk size excluding the header block. 205 return GetDiskUsage() - block_size(); 206 } 207 num_blocks()208 int num_blocks() const { return num_blocks_; } 209 210 // Gets the byte size of max sized posting list. max_posting_list_bytes()211 uint32_t max_posting_list_bytes() const { 212 return IndexBlock::CalculateMaxPostingListBytes( 213 block_size(), serializer_->GetDataTypeBytes()); 214 } 215 216 // Info about the index based on the block size. block_size()217 int block_size() const { return header_block_->header()->block_size; } 218 219 // Num blocks starts at 1 since the first block is the header. empty()220 bool empty() const { return num_blocks_ <= 1; } 221 222 // The percentage of the maximum index size that is free. Allocated blocks are 223 // treated as fully used, even if they are only partially used. In this way, 224 // min_free_fraction is a lower bound of available space. min_free_fraction()225 double min_free_fraction() const { 226 return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex; 227 } 228 serializer()229 const PostingListSerializer* serializer() const { return serializer_; } serializer()230 PostingListSerializer* serializer() { return serializer_; } 231 232 // TODO(b/222349894) Convert the string output to a protocol buffer instead. 233 void GetDebugInfo(DebugInfoVerbosity::Code verbosity, std::string* out) const; 234 235 private: FlashIndexStorage(const Filesystem * filesystem,std::string && index_filename,PostingListSerializer * serializer,bool has_in_memory_freelists)236 explicit FlashIndexStorage(const Filesystem* filesystem, 237 std::string&& index_filename, 238 PostingListSerializer* serializer, 239 bool has_in_memory_freelists) 240 : filesystem_(filesystem), 241 index_filename_(std::move(index_filename)), 242 serializer_(serializer), 243 num_blocks_(0), 244 has_in_memory_freelists_(has_in_memory_freelists) {} 245 246 // Init the index from persistence. Create if file does not exist. We do not 247 // erase corrupt files. 248 // 249 // Returns false if unable to create a new header or if the existing one is 250 // corrupt. 251 bool Init(); 252 253 // Create or open the header block. Returns true on success. 254 bool InitHeader(); 255 256 // Create a new header block for an empty index file. 257 bool CreateHeader(); 258 259 // Loads the header stored at the beginning of the index file and validates 260 // the values stored in it. 261 bool OpenHeader(int64_t file_size); 262 263 // Adds the IndexBlock referred to by block_index in the on-disk free list 264 // with index block_info_index. 265 void AddToOnDiskFreeList(uint32_t block_index, int block_info_index, 266 IndexBlock* index_block); 267 268 // Removes the IndexBlock referred to by block_index from the Header free list 269 // with index block_info_index. 270 // 271 // RETURNS: 272 // - OK on success 273 // - Any IndexBlock errors 274 libtextclassifier3::Status RemoveFromOnDiskFreeList(uint32_t block_index, 275 int block_info_index, 276 IndexBlock* index_block); 277 278 // RETURNS: 279 // - On success, a valid PostingListHolder created from the first entry of 280 // the in-memory freelist at block_info_index 281 // - OUT_OF_RANGE_ERROR if in_memory_freelists_ contains 282 // PostingListIdentifier with block_index >= num_blocks_ 283 // - NOT_FOUND_ERROR if there was no entry in the freelist 284 // - Any IndexBlock errors 285 libtextclassifier3::StatusOr<PostingListHolder> 286 GetPostingListFromInMemoryFreeList(int block_info_index); 287 288 // RETURNS: 289 // - On success, a valid PostingListHolder created from the first entry of 290 // the on-disk freelist at block_info_index 291 // - OUT_OF_RANGE_ERROR if header()->index_block_infos[block_info_index] 292 // contains block_index >= num_blocks_ 293 // - NOT_FOUND_ERROR if there was no entry in the freelist 294 // - Any IndexBlock errors 295 libtextclassifier3::StatusOr<PostingListHolder> 296 GetPostingListFromOnDiskFreeList(int block_info_index); 297 298 // Returns: 299 // - On success, a valid PostingListHolder created from a newly allocated 300 // IndexBlock. 301 // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new 302 // IndexBlock. 303 // - Any IndexBlock errors 304 libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList( 305 int block_info_index); 306 307 // Returns: 308 // - On success, a newly created IndexBlock at block_index with posting 309 // lists of size posting_list_size 310 // - OUT_OF_RANGE_ERROR if block_index >= num_blocks_ 311 // - Any IndexBlock errors 312 libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock( 313 uint32_t block_index, uint32_t posting_list_size) const; 314 315 // Returns: 316 // - On success, the IndexBlock that exists at block_index 317 // - OUT_OF_RANGE_ERROR if block_index >= num_blocks_ 318 // - Any IndexBlock errors 319 libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock( 320 uint32_t block_index) const; 321 322 // Add a new block to the end of the file and return its block 323 // index. Returns kInvalidBlockIndex if unable to grow the index file. 324 int GrowIndex(); 325 326 // Return the index into index_block_infos of the smallest posting_list free 327 // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds 328 // the max-sized posting list. 329 int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const; 330 331 // Flushes the in-memory free list to disk. 332 // 333 // RETURNS: 334 // - OK on success 335 // - Any IndexBlock errors 336 libtextclassifier3::Status FlushInMemoryFreeList(); 337 338 const Filesystem* filesystem_; // not owned; can't be null 339 std::string index_filename_; 340 341 PostingListSerializer* serializer_; // not owned; can't be null 342 343 // We open the index file into this fd. 344 ScopedFd storage_sfd_; 345 346 int num_blocks_; // can be inferred from index file size 347 348 std::unique_ptr<HeaderBlock> header_block_; 349 350 // In-memory cache of free posting lists. 351 struct FreeList { 352 // Experimentally determined that high watermark for largest 353 // freelist was ~3500. 354 static constexpr size_t kMaxSize = 4096; 355 356 // Push a new PostingListIdentifier if there is space. 357 void Push(PostingListIdentifier id); 358 359 // Attempt to pop a PostingListIdentifier. 360 // 361 // RETURNS: 362 // - identifier of a free posting list, on success 363 // - NOT_FOUND if there are no free posting lists on this free list. 364 libtextclassifier3::StatusOr<PostingListIdentifier> TryPop(); 365 366 std::string DebugString() const; 367 368 private: 369 std::vector<PostingListIdentifier> free_list_; 370 int free_list_size_high_watermark_ = 0; 371 int num_dropped_free_list_entries_ = 0; 372 }; 373 std::vector<FreeList> in_memory_freelists_; 374 375 bool has_in_memory_freelists_; 376 }; 377 378 } // namespace lib 379 } // namespace icing 380 381 #endif // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_ 382