1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_ 16 #define ICING_INDEX_FLASH_INDEX_STORAGE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/absl_ports/canonical_errors.h" 24 #include "icing/file/filesystem.h" 25 #include "icing/index/main/flash-index-storage-header.h" 26 #include "icing/index/main/index-block.h" 27 #include "icing/index/main/posting-list-free.h" 28 #include "icing/index/main/posting-list-identifier.h" 29 #include "icing/index/main/posting-list-used.h" 30 #include "icing/legacy/core/icing-packed-pod.h" 31 #include "icing/store/document-id.h" 32 33 namespace icing { 34 namespace lib { 35 36 // The PostingListHolder struct exists to group together related PostingListUsed 37 // IndexBlock pairs and their ids. 38 struct PostingListHolder { 39 // PostingListUseds interpret data that they themselves do NOT own. The data 40 // being interpreted is stored on a flash block and its memory mapping is 41 // owned by the IndexBlock. As such, the lifecycle of the PostingListUsed must 42 // NOT exceed the lifecycle of the IndexBlock. 43 PostingListUsed posting_list; 44 IndexBlock block; 45 // The PostingListIdentifier, which identifies both the IndexBlock and the 46 // PostingListUsed, is also returned for convenience. 47 PostingListIdentifier id; 48 }; 49 50 // The FlashIndexStorage class manages the actual file that makes up the index. 51 // It allocates IndexBlocks as needed and maintains freelists to prevent 52 // excessive block fragmentation. 53 // 54 // It maintains two types of free lists: 55 // 1. On-disk, Header free list - This free list is stored in the Header 56 // block. There is a free list for every possible posting list size. Each 57 // entry for a posting list size contains the block_index of the 58 // IndexBlock that starts the free list chain. Each IndexBlock in the free 59 // list chain stores the index of the next IndexBlock in the chain. 60 // 2. In-memory free list - Like the Header free list, there is a free list of 61 // every possible posting list size. This free list contains not just the 62 // block_index of the available IndexBlock, but also the posting_list_index 63 // of the available PostingListUsed within the IndexBlock. This is because, 64 // unlike the Header free list, PostingListUseds are not actually freed 65 // when added to this free list. 66 // 67 // Whether or not the in-memory free list is used can be chosen via the 68 // in_memory param to the Create factory function. 69 // 70 // The advantage of using the in-memory free list is that it reduces the amount 71 // of flash writes made while editing the index (because actually freeing the 72 // PostingLists would require writing to that flash block). The disadvantage is 73 // that it introduces code complexity and potentially leaks blocks if power is 74 // lost or if FlashIndexStorage is destroyed before emptying the free list. 75 class FlashIndexStorage { 76 public: 77 // Creates a FlashIndexStorage at index_filename. in_memory determines whether 78 // or not the FlashIndexStorage maintains an in-memory freelist in order to 79 // avoid writes to the on-disk freelist. 80 // 81 // RETURNS: 82 // - On success, a valid instance of FlashIndexStorage 83 // - INTERNAL error if unable to create a new header or read the existing 84 // one from disk. 85 static libtextclassifier3::StatusOr<FlashIndexStorage> Create( 86 const std::string& index_filename, const Filesystem* filesystem, 87 bool in_memory = true); 88 89 // Retrieve the PostingList referred to by PostingListIdentifier. This posting 90 // list must have been previously allocated by a prior call to 91 // AllocatePostingList. 92 // 93 // RETURNS: 94 // - On success, a valid instance of PostingListHolder containing the 95 // requested PostingListUsed. 96 // - INVALID_ARGUMENT if id.posting_list_index() is out of bounds in the 97 // IndexBlock referred to by id.block_index() 98 // - INTERNAL_ERROR if unable to access the region in file. 99 libtextclassifier3::StatusOr<PostingListHolder> GetPostingList( 100 PostingListIdentifier id) const; 101 102 // Allocates and returns a PostingListHolder containing a PostingListUsed that 103 // can fit min_posting_list_bytes. 104 // 105 // RETURNS: 106 // - On success, a valid instance of PostingListHolder containing the 107 // requested PostingListUsed. 108 // - RESOURCE_EXHAUSTED error if unable to grow the index to create a 109 // PostingListUsed of the requested size. 110 libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList( 111 uint32_t min_posting_list_bytes); 112 113 ~FlashIndexStorage(); 114 FlashIndexStorage(FlashIndexStorage&&) = default; 115 FlashIndexStorage(const FlashIndexStorage&) = delete; 116 FlashIndexStorage& operator=(FlashIndexStorage&&) = default; 117 FlashIndexStorage& operator=(const FlashIndexStorage&) = delete; 118 119 // Free the PostingListUsed that this holder holds. 120 void FreePostingList(PostingListHolder holder); 121 122 // Used to track the largest docid indexed in the index. get_last_indexed_docid()123 DocumentId get_last_indexed_docid() const { 124 return header_block_->header()->last_indexed_docid; 125 } set_last_indexed_docid(DocumentId docid)126 void set_last_indexed_docid(DocumentId docid) { 127 header_block_->header()->last_indexed_docid = docid; 128 } 129 130 // Updates the header and persists all changes to the index to disk. Returns 131 // true on success. 132 bool PersistToDisk(); 133 134 // Returns the size of the index file in bytes. GetDiskUsage()135 int64_t GetDiskUsage() const { 136 return filesystem_->GetDiskUsage(block_fd_.get()); 137 } 138 139 // Returns the size of the index file used to contains hits. GetElementsSize()140 uint64_t GetElementsSize() const { 141 // Element size is the same as disk size excluding the header block. 142 return GetDiskUsage() - block_size(); 143 } 144 num_blocks()145 int num_blocks() const { return num_blocks_; } 146 147 // Info about the index based on the block size. block_size()148 int block_size() const { return header_block_->header()->block_size; } 149 150 // Num blocks starts at 1 since the first block is the header. empty()151 bool empty() const { return num_blocks_ <= 1; } 152 153 // The percentage of the maximum index size that is free. Allocated blocks are 154 // treated as fully used, even if they are only partially used. In this way, 155 // min_free_fraction is a lower bound of available space. min_free_fraction()156 double min_free_fraction() const { 157 return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex; 158 } 159 160 libtextclassifier3::Status Reset(); 161 162 // TODO(b/222349894) Convert the string output to a protocol buffer instead. 163 void GetDebugInfo(int verbosity, std::string* out) const; 164 165 private: 166 FlashIndexStorage(const std::string& index_filename, 167 const Filesystem* filesystem, bool has_in_memory_freelists); 168 169 // Init the index from persistence. Create if file does not exist. We do not 170 // erase corrupt files. 171 // 172 // Returns false if unable to create a new header or if the existing one is 173 // corrupt. 174 bool Init(); 175 176 // Create or open the header block. Returns true on success. 177 bool InitHeader(); 178 179 // Create a new header block for an empty index file. 180 bool CreateHeader(); 181 182 // Loads the header stored at the beginning of the index file and validates 183 // the values stored in it. 184 bool OpenHeader(int64_t file_size); 185 186 // Add the IndexBlock referred to by block_index in the on-disk free list with 187 // index block_info_index. 188 void AddToOnDiskFreeList(uint32_t block_index, int block_info_index, 189 IndexBlock* index_block); 190 191 // Remove the IndexBlock referred to by block_index from the Header free list 192 // with index block_info_index. 193 void RemoveFromOnDiskFreeList(uint32_t block_index, int block_info_index, 194 IndexBlock* index_block); 195 196 // Returns: 197 // - On success, a valid PostingListHolder created from the first entry of 198 // the in-memory freelist at block_info_index 199 // - NOT_FOUND if there was no entry in the freelist 200 // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be 201 // allocated for some reason. 202 libtextclassifier3::StatusOr<PostingListHolder> 203 GetPostingListFromInMemoryFreeList(int block_info_index); 204 205 // Returns: 206 // - On success, a valid PostingListHolder created from the first entry of 207 // the on-disk freelist at block_info_index 208 // - NOT_FOUND if there was no entry in the freelist 209 // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be 210 // allocated for some reason. 211 libtextclassifier3::StatusOr<PostingListHolder> 212 GetPostingListFromOnDiskFreeList(int block_info_index); 213 214 // Returns: 215 // - On success, a valid PostingListHolder created from a newly allocated 216 // IndexBlock. 217 // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new 218 // IndexBlock. 219 libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList( 220 int block_info_index); 221 222 // Returns: 223 // - On success, a newly created IndexBlock at block_index with posting 224 // lists of size posting_list_size 225 // - INTERNAL_ERROR if unable to access the region in file representing the 226 // IndexBlock 227 libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock( 228 int block_index, uint32_t posting_list_size) const; 229 230 // Returns: 231 // - On success, the IndexBlock that exists at block_index 232 // - INTERNAL_ERROR if unable to access the region in file representing the 233 // IndexBlock 234 libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(int block_index) const; 235 236 // Add a new block to the end of the file and return its block 237 // index. Returns kInvalidBlockIndex if unable to grow the index file. 238 int GrowIndex(); 239 240 // Return the index into index_block_infos of the smallest posting_list free 241 // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds 242 // the max-sized posting list. 243 int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const; 244 245 // Flushes the in-memory free list to disk. 246 void FlushInMemoryFreeList(); 247 248 // Underlying filename. 249 std::string index_filename_; 250 251 // We open the index file into this fd. 252 ScopedFd block_fd_; 253 int num_blocks_; // can be inferred from index file size 254 255 std::unique_ptr<HeaderBlock> header_block_; 256 257 // In-memory cache of free posting lists. 258 struct FreeList { 259 // Experimentally determined that high watermark for largest 260 // freelist was ~3500. 261 static constexpr size_t kMaxSize = 4096; 262 263 // Push a new PostingListIdentifier if there is space. 264 void Push(PostingListIdentifier id); 265 266 // Attempt to pop a PostingListIdentifier. 267 // 268 // RETURNS: 269 // - identifier of a free posting list, on success 270 // - NOT_FOUND if there are no free posting lists on this free list. 271 libtextclassifier3::StatusOr<PostingListIdentifier> TryPop(); 272 273 std::string DebugString() const; 274 275 private: 276 std::vector<PostingListIdentifier> free_list_; 277 int free_list_size_high_watermark_ = 0; 278 int num_dropped_free_list_entries_ = 0; 279 }; 280 std::vector<FreeList> in_memory_freelists_; 281 282 const Filesystem* filesystem_; // not owned; can't be null 283 284 bool has_in_memory_freelists_; 285 }; 286 287 } // namespace lib 288 } // namespace icing 289 290 #endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_ 291