• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
16 #define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/file/filesystem.h"
26 #include "icing/file/posting_list/flash-index-storage-header.h"
27 #include "icing/file/posting_list/index-block.h"
28 #include "icing/file/posting_list/posting-list-identifier.h"
29 #include "icing/file/posting_list/posting-list-used.h"
30 #include "icing/proto/debug.pb.h"
31 #include "icing/store/document-id.h"
32 
33 namespace icing {
34 namespace lib {
35 
36 // PostingListHolder: group PostingListUsed, id, and some other useful info for
37 // callers.
38 struct PostingListHolder {
39   // PostingListUsed owns an in-memory posting list data buffer. The data being
40   // interpreted is initialized via PRead from the storage. As such, we should
41   // sync it to disk after modifying it.
42   PostingListUsed posting_list;
43 
44   // The PostingListIdentifier, which identifies both the block index and the
45   // posting list index on that block, is also returned for convenience.
46   PostingListIdentifier id;
47 
48   // Next block index is also returned for convenience. If PostingListUsed is a
49   // max-sized posting list, then the caller has to use this value to handle
50   // chained max-sized posting list blocks.
51   uint32_t next_block_index;
52 
PostingListHolderPostingListHolder53   explicit PostingListHolder(PostingListUsed&& posting_list_in,
54                              PostingListIdentifier id_in,
55                              uint32_t next_block_index_in)
56       : posting_list(std::move(posting_list_in)),
57         id(id_in),
58         next_block_index(next_block_index_in) {}
59 };
60 
61 // The FlashIndexStorage class manages the actual file that makes up blocks for
62 // posting lists. It allocates IndexBlocks as needed and maintains freelists to
63 // prevent excessive block fragmentation.
64 //
65 // It maintains two types of free lists:
66 //   1. On-disk, Header free list - This free list is stored in the Header
67 //      block. There is a free list for every possible posting list size. Each
68 //      entry for a posting list size contains the block_index of the
69 //      IndexBlock that starts the free list chain. Each IndexBlock in the free
70 //      list chain stores the index of the next IndexBlock in the chain.
71 //   2. In-memory free list - Like the Header free list, there is a free list of
72 //      every possible posting list size. This free list contains not just the
73 //      block_index of the available IndexBlock, but also the posting_list_index
74 //      of the available PostingListUsed within the IndexBlock. This is because,
75 //      unlike the Header free list, PostingListUseds are not actually freed
76 //      when added to this free list.
77 //
78 // Whether or not the in-memory free list is used can be chosen via the
79 // in_memory param to the Create factory function.
80 //
81 // The advantage of using the in-memory free list is that it reduces the amount
82 // of flash writes made while editing the index (because actually freeing the
83 // PostingLists would require writing to that flash block). The disadvantage is
84 // that it introduces code complexity and potentially leaks blocks if power is
85 // lost or if FlashIndexStorage is destroyed before emptying the free list.
86 class FlashIndexStorage {
87  public:
88   // Creates a FlashIndexStorage at index_filename. in_memory determines whether
89   // or not the FlashIndexStorage maintains an in-memory freelist in order to
90   // avoid writes to the on-disk freelist.
91   //
92   // RETURNS:
93   //   - On success, a valid instance of FlashIndexStorage
94   //   - FAILED_PRECONDITION_ERROR if filesystem or serializer is null
95   //   - INTERNAL_ERROR if unable to create a new header or read the existing
96   //     one from disk.
97   static libtextclassifier3::StatusOr<FlashIndexStorage> Create(
98       std::string index_filename, const Filesystem* filesystem,
99       PostingListSerializer* serializer, bool in_memory = true);
100 
101   // Reads magic from existing file header. We need this during Icing
102   // initialization phase to determine the version.
103   //
104   // RETURNS:
105   //   - On success, a valid magic
106   //   - FAILED_PRECONDITION_ERROR if filesystem is null
107   //   - NOT_FOUND_ERROR if the flash index file doesn't exist
108   //   - INTERNAL_ERROR on I/O error
109   static libtextclassifier3::StatusOr<int> ReadHeaderMagic(
110       const Filesystem* filesystem, const std::string& index_filename);
111 
112   FlashIndexStorage(FlashIndexStorage&&) = default;
113   FlashIndexStorage(const FlashIndexStorage&) = delete;
114   FlashIndexStorage& operator=(FlashIndexStorage&&) = default;
115   FlashIndexStorage& operator=(const FlashIndexStorage&) = delete;
116 
117   ~FlashIndexStorage();
118 
119   // Selects block size to use.
120   static uint32_t SelectBlockSize();
121 
122   // Retrieves the PostingList referred to by PostingListIdentifier. This
123   // posting list must have been previously allocated by a prior call to
124   // AllocatePostingList.
125   //
126   // RETURNS:
127   //   - On success, a valid instance of PostingListHolder containing the
128   //     requested PostingListUsed.
129   //   - Any IndexBlock errors
130   libtextclassifier3::StatusOr<PostingListHolder> GetPostingList(
131       PostingListIdentifier id) const;
132 
133   // Allocates and returns a PostingListHolder containing a PostingListUsed that
134   // can fit min_posting_list_bytes.
135   //
136   // RETURNS:
137   //   - On success, a valid instance of PostingListHolder containing the
138   //     requested PostingListUsed.
139   //   - INVALID_ARGUMENT_ERROR if min_posting_list_bytes >
140   //     max_posting_list_bytes()
141   //   - RESOURCE_EXHAUSTED_ERROR if unable to grow the index to create a
142   //     PostingListUsed of the requested size.
143   //   - Any IndexBlock errors
144   libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList(
145       uint32_t min_posting_list_bytes);
146 
147   // Allocates a new IndexBlock with a single max-sized PostingListUsed. This
148   // chains index blocks by setting the next_block_index field of this new
149   // block's header to be prev_block_index and returns a PostingListHolder
150   // containing a max-sized PostingListUsed.
151   //
152   // RETURNS:
153   //   - On success, a valid instance of PostingListHolder containing the
154   //     requested PostingListUsed.
155   //   - RESOURCE_EXHAUSTED_ERROR if unable to grow the index to create a
156   //     PostingListUsed of max size
157   //   - Any IndexBlock errors
158   libtextclassifier3::StatusOr<PostingListHolder>
159   AllocateAndChainMaxSizePostingList(uint32_t prev_block_index);
160 
161   // Frees the PostingListUsed that this holder holds.
162   //
163   // RETURNS:
164   //   - OK on success
165   //   - Any IndexBlock errors
166   libtextclassifier3::Status FreePostingList(PostingListHolder&& holder);
167 
168   // Writes back the PostingListUsed that this holder holds to disk.
169   //
170   // RETURNS:
171   //   - OK on success
172   //   - Any IndexBlock errors
173   libtextclassifier3::Status WritePostingListToDisk(
174       const PostingListHolder& holder);
175 
176   // Discards all existing data by deleting the existing file and
177   // re-initializing a new one.
178   //
179   // RETURNS:
180   //   - OK on success
181   //   - INTERNAL_ERROR if unable to delete existing files or initialize a new
182   //     file with header
183   libtextclassifier3::Status Reset();
184 
185   // Used to track the largest docid indexed in the index.
get_last_indexed_docid()186   DocumentId get_last_indexed_docid() const {
187     return header_block_->header()->last_indexed_docid;
188   }
set_last_indexed_docid(DocumentId docid)189   void set_last_indexed_docid(DocumentId docid) {
190     header_block_->header()->last_indexed_docid = docid;
191   }
192 
193   // Updates the header and persists all changes to the index to disk. Returns
194   // true on success.
195   bool PersistToDisk();
196 
197   // Returns the size of the index file in bytes.
GetDiskUsage()198   int64_t GetDiskUsage() const {
199     return filesystem_->GetDiskUsage(storage_sfd_.get());
200   }
201 
202   // Returns the size of the index file used to contains data.
GetElementsSize()203   uint64_t GetElementsSize() const {
204     // Element size is the same as disk size excluding the header block.
205     return GetDiskUsage() - block_size();
206   }
207 
num_blocks()208   int num_blocks() const { return num_blocks_; }
209 
210   // Gets the byte size of max sized posting list.
max_posting_list_bytes()211   uint32_t max_posting_list_bytes() const {
212     return IndexBlock::CalculateMaxPostingListBytes(
213         block_size(), serializer_->GetDataTypeBytes());
214   }
215 
216   // Info about the index based on the block size.
block_size()217   int block_size() const { return header_block_->header()->block_size; }
218 
219   // Num blocks starts at 1 since the first block is the header.
empty()220   bool empty() const { return num_blocks_ <= 1; }
221 
222   // The percentage of the maximum index size that is free. Allocated blocks are
223   // treated as fully used, even if they are only partially used. In this way,
224   // min_free_fraction is a lower bound of available space.
min_free_fraction()225   double min_free_fraction() const {
226     return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex;
227   }
228 
serializer()229   const PostingListSerializer* serializer() const { return serializer_; }
serializer()230   PostingListSerializer* serializer() { return serializer_; }
231 
232   // TODO(b/222349894) Convert the string output to a protocol buffer instead.
233   void GetDebugInfo(DebugInfoVerbosity::Code verbosity, std::string* out) const;
234 
235  private:
FlashIndexStorage(const Filesystem * filesystem,std::string && index_filename,PostingListSerializer * serializer,bool has_in_memory_freelists)236   explicit FlashIndexStorage(const Filesystem* filesystem,
237                              std::string&& index_filename,
238                              PostingListSerializer* serializer,
239                              bool has_in_memory_freelists)
240       : filesystem_(filesystem),
241         index_filename_(std::move(index_filename)),
242         serializer_(serializer),
243         num_blocks_(0),
244         has_in_memory_freelists_(has_in_memory_freelists) {}
245 
246   // Init the index from persistence. Create if file does not exist. We do not
247   // erase corrupt files.
248   //
249   // Returns false if unable to create a new header or if the existing one is
250   // corrupt.
251   bool Init();
252 
253   // Create or open the header block. Returns true on success.
254   bool InitHeader();
255 
256   // Create a new header block for an empty index file.
257   bool CreateHeader();
258 
259   // Loads the header stored at the beginning of the index file and validates
260   // the values stored in it.
261   bool OpenHeader(int64_t file_size);
262 
263   // Adds the IndexBlock referred to by block_index in the on-disk free list
264   // with index block_info_index.
265   void AddToOnDiskFreeList(uint32_t block_index, int block_info_index,
266                            IndexBlock* index_block);
267 
268   // Removes the IndexBlock referred to by block_index from the Header free list
269   // with index block_info_index.
270   //
271   // RETURNS:
272   //   - OK on success
273   //   - Any IndexBlock errors
274   libtextclassifier3::Status RemoveFromOnDiskFreeList(uint32_t block_index,
275                                                       int block_info_index,
276                                                       IndexBlock* index_block);
277 
278   // RETURNS:
279   //   - On success, a valid PostingListHolder created from the first entry of
280   //     the in-memory freelist at block_info_index
281   //   - OUT_OF_RANGE_ERROR if in_memory_freelists_ contains
282   //     PostingListIdentifier with block_index >= num_blocks_
283   //   - NOT_FOUND_ERROR if there was no entry in the freelist
284   //   - Any IndexBlock errors
285   libtextclassifier3::StatusOr<PostingListHolder>
286   GetPostingListFromInMemoryFreeList(int block_info_index);
287 
288   // RETURNS:
289   //   - On success, a valid PostingListHolder created from the first entry of
290   //     the on-disk freelist at block_info_index
291   //   - OUT_OF_RANGE_ERROR if header()->index_block_infos[block_info_index]
292   //     contains block_index >= num_blocks_
293   //   - NOT_FOUND_ERROR if there was no entry in the freelist
294   //   - Any IndexBlock errors
295   libtextclassifier3::StatusOr<PostingListHolder>
296   GetPostingListFromOnDiskFreeList(int block_info_index);
297 
298   // Returns:
299   //   - On success, a valid PostingListHolder created from a newly allocated
300   //     IndexBlock.
301   //   - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new
302   //     IndexBlock.
303   //   - Any IndexBlock errors
304   libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList(
305       int block_info_index);
306 
307   // Returns:
308   //   - On success, a newly created IndexBlock at block_index with posting
309   //     lists of size posting_list_size
310   //   - OUT_OF_RANGE_ERROR if block_index >= num_blocks_
311   //   - Any IndexBlock errors
312   libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock(
313       uint32_t block_index, uint32_t posting_list_size) const;
314 
315   // Returns:
316   //   - On success, the IndexBlock that exists at block_index
317   //   - OUT_OF_RANGE_ERROR if block_index >= num_blocks_
318   //   - Any IndexBlock errors
319   libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(
320       uint32_t block_index) const;
321 
322   // Add a new block to the end of the file and return its block
323   // index. Returns kInvalidBlockIndex if unable to grow the index file.
324   int GrowIndex();
325 
326   // Return the index into index_block_infos of the smallest posting_list free
327   // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds
328   // the max-sized posting list.
329   int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const;
330 
331   // Flushes the in-memory free list to disk.
332   //
333   // RETURNS:
334   //   - OK on success
335   //   - Any IndexBlock errors
336   libtextclassifier3::Status FlushInMemoryFreeList();
337 
338   const Filesystem* filesystem_;  // not owned; can't be null
339   std::string index_filename_;
340 
341   PostingListSerializer* serializer_;  // not owned; can't be null
342 
343   // We open the index file into this fd.
344   ScopedFd storage_sfd_;
345 
346   int num_blocks_;  // can be inferred from index file size
347 
348   std::unique_ptr<HeaderBlock> header_block_;
349 
350   // In-memory cache of free posting lists.
351   struct FreeList {
352     // Experimentally determined that high watermark for largest
353     // freelist was ~3500.
354     static constexpr size_t kMaxSize = 4096;
355 
356     // Push a new PostingListIdentifier if there is space.
357     void Push(PostingListIdentifier id);
358 
359     // Attempt to pop a PostingListIdentifier.
360     //
361     // RETURNS:
362     //  - identifier of a free posting list, on success
363     //  - NOT_FOUND if there are no free posting lists on this free list.
364     libtextclassifier3::StatusOr<PostingListIdentifier> TryPop();
365 
366     std::string DebugString() const;
367 
368    private:
369     std::vector<PostingListIdentifier> free_list_;
370     int free_list_size_high_watermark_ = 0;
371     int num_dropped_free_list_entries_ = 0;
372   };
373   std::vector<FreeList> in_memory_freelists_;
374 
375   bool has_in_memory_freelists_;
376 };
377 
378 }  // namespace lib
379 }  // namespace icing
380 
381 #endif  // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
382