• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_
16 #define ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/file/file-backed-vector.h"
27 #include "icing/file/filesystem.h"
28 #include "icing/file/memory-mapped-file.h"
29 #include "icing/file/persistent-storage.h"
30 #include "icing/file/posting_list/flash-index-storage.h"
31 #include "icing/file/posting_list/posting-list-identifier.h"
32 #include "icing/index/iterator/doc-hit-info-iterator.h"
33 #include "icing/index/numeric/integer-index-data.h"
34 #include "icing/index/numeric/posting-list-integer-index-serializer.h"
35 #include "icing/schema/section.h"
36 #include "icing/store/document-id.h"
37 #include "icing/util/crc32.h"
38 
39 namespace icing {
40 namespace lib {
41 
42 // IntegerIndexStorage: a class for indexing (persistent storage) and searching
43 // contents of integer type sections in documents.
44 // - Accepts new integer contents (a.k.a keys) and adds records (BasicHit, key)
45 //   into the integer index.
46 // - Stores records (BasicHit, key) in posting lists and compresses them.
47 // - Bucketizes these records by key to make range query more efficient and
48 //   manages them with the corresponding posting lists.
49 //   - When a posting list reaches the max size and is full, the mechanism of
50 //     PostingListAccessor is to create another new (max-size) posting list and
51 //     chain them together.
52 //   - It will be inefficient if we store all records in the same PL chain. E.g.
53 //     small range query needs to iterate through the whole PL chain but skips a
54 //     lot of non-relevant records (whose keys don't belong to the query range).
55 //   - Therefore, we implement the splitting mechanism to split a full max-size
56 //     posting list. Also adjust range of the original bucket and add new
57 //     buckets.
58 //   - Ranges of all buckets are disjoint and the union of them is [INT64_MIN,
59 //     INT64_MAX].
60 //   - Buckets should be sorted, so we can do binary search to find the desired
61 //     bucket(s). However, we may split a bucket into several buckets, and the
62 //     cost to insert newly created buckets is high.
63 //   - Thus, we introduce an unsorted bucket array for newly created buckets,
64 //     and merge unsorted buckets into the sorted bucket array only if length of
65 //     the unsorted bucket array exceeds the threshold. This mechanism will
66 //     reduce # of merging events and amortize the overall cost for bucket order
67 //     maintenance.
68 //     Note: some tree data structures (e.g. segment tree, B+ tree) maintain the
69 //     bucket order more efficiently than the sorted/unsorted bucket array
70 //     mechanism, but the implementation is more complicated and doesn't improve
71 //     the performance too much according to our analysis, so currently we
72 //     choose sorted/unsorted bucket array.
73 //   - Then we do binary search on the sorted bucket array and sequential search
74 //     on the unsorted bucket array.
75 class IntegerIndexStorage : public PersistentStorage {
76  public:
77   struct Info {
78     static constexpr int32_t kMagic = 0xc4bf0ccc;
79 
80     int32_t magic;
81     int32_t num_data;
82 
ComputeChecksumInfo83     Crc32 ComputeChecksum() const {
84       return Crc32(
85           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
86     }
87   } __attribute__((packed));
88   static_assert(sizeof(Info) == 8, "");
89 
90   // Bucket
91   class Bucket {
92    public:
93     // Absolute max # of buckets allowed. Since the absolute max file size of
94     // FileBackedVector on 32-bit platform is ~2^28, we can at most have ~13.4M
95     // buckets. To make it power of 2, round it down to 2^23. Also since we're
96     // using FileBackedVector to store buckets, add some static_asserts to
97     // ensure numbers here are compatible with FileBackedVector.
98     static constexpr int32_t kMaxNumBuckets = 1 << 23;
99 
100     explicit Bucket(int64_t key_lower, int64_t key_upper,
101                     PostingListIdentifier posting_list_identifier =
102                         PostingListIdentifier::kInvalid)
key_lower_(key_lower)103         : key_lower_(key_lower),
104           key_upper_(key_upper),
105           posting_list_identifier_(posting_list_identifier) {}
106 
107     bool operator<(const Bucket& other) const {
108       return key_lower_ < other.key_lower_;
109     }
110 
111     // For FileBackedVector
112     bool operator==(const Bucket& other) const {
113       return key_lower_ == other.key_lower_ && key_upper_ == other.key_upper_ &&
114              posting_list_identifier_ == other.posting_list_identifier_;
115     }
116 
key_lower()117     int64_t key_lower() const { return key_lower_; }
118 
key_upper()119     int64_t key_upper() const { return key_upper_; }
120 
set_key_lower(int64_t key_lower)121     void set_key_lower(int64_t key_lower) { key_lower_ = key_lower; }
122 
set_key_upper(int64_t key_upper)123     void set_key_upper(int64_t key_upper) { key_upper_ = key_upper; }
124 
posting_list_identifier()125     PostingListIdentifier posting_list_identifier() const {
126       return posting_list_identifier_;
127     }
set_posting_list_identifier(PostingListIdentifier posting_list_identifier)128     void set_posting_list_identifier(
129         PostingListIdentifier posting_list_identifier) {
130       posting_list_identifier_ = posting_list_identifier;
131     }
132 
133    private:
134     int64_t key_lower_;
135     int64_t key_upper_;
136     PostingListIdentifier posting_list_identifier_;
137   } __attribute__((packed));
138   static_assert(sizeof(Bucket) == 20, "");
139   static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize,
140                 "Bucket type size is inconsistent with FileBackedVector "
141                 "element type size");
142   static_assert(Bucket::kMaxNumBuckets <=
143                     (FileBackedVector<Bucket>::kMaxFileSize -
144                      FileBackedVector<Bucket>::Header::kHeaderSize) /
145                         FileBackedVector<Bucket>::kElementTypeSize,
146                 "Max # of buckets cannot fit into FileBackedVector");
147 
148   struct Options {
OptionsOptions149     explicit Options(bool pre_mapping_fbv_in)
150         : pre_mapping_fbv(pre_mapping_fbv_in) {}
151 
OptionsOptions152     explicit Options(std::vector<Bucket> custom_init_sorted_buckets_in,
153                      std::vector<Bucket> custom_init_unsorted_buckets_in,
154                      bool pre_mapping_fbv_in)
155         : custom_init_sorted_buckets(std::move(custom_init_sorted_buckets_in)),
156           custom_init_unsorted_buckets(
157               std::move(custom_init_unsorted_buckets_in)),
158           pre_mapping_fbv(pre_mapping_fbv_in) {}
159 
160     bool IsValid() const;
161 
HasCustomInitBucketsOptions162     bool HasCustomInitBuckets() const {
163       return !custom_init_sorted_buckets.empty() ||
164              !custom_init_unsorted_buckets.empty();
165     }
166 
167     // Custom buckets when initializing new files. If both are empty, then the
168     // initial bucket is (INT64_MIN, INT64_MAX). Usually we only set them in the
169     // unit test. Note that all buckets in custom_init_sorted_buckets and
170     // custom_init_unsorted_buckets should be disjoint and the range union
171     // should be [INT64_MIN, INT64_MAX].
172     std::vector<Bucket> custom_init_sorted_buckets;
173     std::vector<Bucket> custom_init_unsorted_buckets;
174 
175     // Flag indicating whether memory map max possible file size for underlying
176     // FileBackedVector before growing the actual file size.
177     bool pre_mapping_fbv;
178   };
179 
180   // Metadata file layout: <Crcs><Info>
181   static constexpr int32_t kCrcsMetadataFileOffset = 0;
182   static constexpr int32_t kInfoMetadataFileOffset =
183       static_cast<int32_t>(sizeof(Crcs));
184   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
185   static_assert(kMetadataFileSize == 20, "");
186 
187   static constexpr WorkingPathType kWorkingPathType =
188       WorkingPathType::kDirectory;
189   static constexpr std::string_view kFilePrefix = "integer_index_storage";
190 
191   // # of data threshold for bucket merging during optimization (TransferIndex).
192   // If total # data of adjacent buckets exceed this value, then flush the
193   // accumulated data. Otherwise merge buckets and their data.
194   //
195   // Calculated by: 0.7 * (kMaxPostingListSize / sizeof(IntegerIndexData)),
196   // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)).
197   static constexpr int32_t kNumDataThresholdForBucketMerge = 240;
198 
199   // # of data threshold for bucket splitting during indexing (AddKeys).
200   // When the posting list of a bucket is full, we will try to split data into
201   // multiple buckets according to their keys. In order to achieve good
202   // (amortized) time complexity, we want # of data in new buckets to be at most
203   // half # of elements in a full posting list.
204   //
205   // Calculated by: 0.5 * (kMaxPostingListSize / sizeof(IntegerIndexData)),
206   // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)).
207   static constexpr int32_t kNumDataThresholdForBucketSplit = 170;
208 
209   // Length threshold to sort and merge unsorted buckets into sorted buckets. If
210   // the length of unsorted_buckets exceed the threshold, then call
211   // SortBuckets().
212   static constexpr int32_t kUnsortedBucketsLengthThreshold = 50;
213 
214   // Creates a new IntegerIndexStorage instance to index integers (for a single
215   // property). If any of the underlying file is missing, then delete the whole
216   // working_path and (re)initialize with new ones. Otherwise initialize and
217   // create the instance by existing files.
218   //
219   // filesystem: Object to make system level calls
220   // working_path: Specifies the working path for PersistentStorage.
221   //               IntegerIndexStorage uses working path as working directory
222   //               and all related files will be stored under this directory. It
223   //               takes full ownership and of working_path_, including
224   //               creation/deletion. It is the caller's responsibility to
225   //               specify correct working path and avoid mixing different
226   //               persistent storages together under the same path. Also the
227   //               caller has the ownership for the parent directory of
228   //               working_path_, and it is responsible for parent directory
229   //               creation/deletion. See PersistentStorage for more details
230   //               about the concept of working_path.
231   // options: Options instance.
232   // posting_list_serializer: a PostingListIntegerIndexSerializer instance to
233   //                          serialize/deserialize integer index data to/from
234   //                          posting lists.
235   //
236   // Returns:
237   //   - INVALID_ARGUMENT_ERROR if any value in options is invalid.
238   //   - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
239   //                               checksum.
240   //   - INTERNAL_ERROR on I/O errors.
241   //   - Any FileBackedVector/FlashIndexStorage errors.
242   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
243   Create(const Filesystem& filesystem, std::string working_path,
244          Options options,
245          PostingListIntegerIndexSerializer* posting_list_serializer);
246 
247   // Deletes IntegerIndexStorage under working_path.
248   //
249   // Returns:
250   //   - OK on success
251   //   - INTERNAL_ERROR on I/O error
Discard(const Filesystem & filesystem,const std::string & working_path)252   static libtextclassifier3::Status Discard(const Filesystem& filesystem,
253                                             const std::string& working_path) {
254     return PersistentStorage::Discard(filesystem, working_path,
255                                       kWorkingPathType);
256   }
257 
258   // Delete copy and move constructor/assignment operator.
259   IntegerIndexStorage(const IntegerIndexStorage&) = delete;
260   IntegerIndexStorage& operator=(const IntegerIndexStorage&) = delete;
261 
262   IntegerIndexStorage(IntegerIndexStorage&&) = delete;
263   IntegerIndexStorage& operator=(IntegerIndexStorage&&) = delete;
264 
265   ~IntegerIndexStorage() override;
266 
267   // Batch adds new keys (of the same DocumentId and SectionId) into the integer
268   // index storage.
269   // Note that since we separate different property names into different integer
270   // index storages, it is impossible to have keys in a single document across
271   // multiple sections to add into the same integer index storage.
272   //
273   // Returns:
274   //   - OK on success
275   //   - Any FileBackedVector or PostingList errors
276   libtextclassifier3::Status AddKeys(DocumentId document_id,
277                                      SectionId section_id,
278                                      std::vector<int64_t>&& new_keys);
279 
280   // Returns a DocHitInfoIteratorNumeric<int64_t> (in DocHitInfoIterator
281   // interface type format) for iterating through all docs which have the
282   // specified (integer) property contents in range [query_key_lower,
283   // query_key_upper].
284   // When iterating through all relevant doc hits, it:
285   // - Merges multiple SectionIds of doc hits with same DocumentId into a single
286   //   SectionIdMask and constructs DocHitInfo.
287   // - Returns DocHitInfo in descending DocumentId order.
288   //
289   // Returns:
290   //   - On success: a DocHitInfoIterator(Numeric)
291   //   - INVALID_ARGUMENT_ERROR if query_key_lower > query_key_upper
292   //   - Any FileBackedVector or PostingList errors
293   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
294       int64_t query_key_lower, int64_t query_key_upper) const;
295 
296   // Transfers integer index data from the current storage to new_storage and
297   // optimizes buckets (for new_storage only), i.e. merging adjacent buckets if
298   // total # of data among them are less than or equal to
299   // kNumDataThresholdForBucketMerge.
300   //
301   // REQUIRES: new_storage should be a newly created storage instance, i.e. not
302   //   contain any data. Otherwise, existing data and posting lists won't be
303   //   freed and space will be wasted.
304   //
305   // Returns:
306   //   - OK on success
307   //   - OUT_OF_RANGE_ERROR if sorted buckets length exceeds the limit after
308   //     merging
309   //   - INTERNAL_ERROR on IO error
310   libtextclassifier3::Status TransferIndex(
311       const std::vector<DocumentId>& document_id_old_to_new,
312       IntegerIndexStorage* new_storage) const;
313 
num_data()314   int32_t num_data() const { return info().num_data; }
315 
316  private:
IntegerIndexStorage(const Filesystem & filesystem,std::string && working_path,Options && options,PostingListIntegerIndexSerializer * posting_list_serializer,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,std::unique_ptr<FlashIndexStorage> flash_index_storage)317   explicit IntegerIndexStorage(
318       const Filesystem& filesystem, std::string&& working_path,
319       Options&& options,
320       PostingListIntegerIndexSerializer* posting_list_serializer,
321       std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
322       std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
323       std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
324       std::unique_ptr<FlashIndexStorage> flash_index_storage)
325       : PersistentStorage(filesystem, std::move(working_path),
326                           kWorkingPathType),
327         options_(std::move(options)),
328         posting_list_serializer_(posting_list_serializer),
329         metadata_mmapped_file_(std::move(metadata_mmapped_file)),
330         sorted_buckets_(std::move(sorted_buckets)),
331         unsorted_buckets_(std::move(unsorted_buckets)),
332         flash_index_storage_(std::move(flash_index_storage)) {}
333 
334   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
335   InitializeNewFiles(
336       const Filesystem& filesystem, std::string&& working_path,
337       Options&& options,
338       PostingListIntegerIndexSerializer* posting_list_serializer);
339 
340   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
341   InitializeExistingFiles(
342       const Filesystem& filesystem, std::string&& working_path,
343       Options&& options,
344       PostingListIntegerIndexSerializer* posting_list_serializer);
345 
346   // Flushes data into posting list(s), creates a new bucket with range
347   // [key_lower, key_upper], and appends it into sorted buckets for storage.
348   // It is a helper function for TransferIndex.
349   //
350   // Returns:
351   //   - OK on success
352   //   - INTERNAL_ERROR if fails to write existing data into posting list(s)
353   //   - Any FileBackedVector or PostingList errors
354   static libtextclassifier3::Status FlushDataIntoNewSortedBucket(
355       int64_t key_lower, int64_t key_upper,
356       std::vector<IntegerIndexData>&& data, IntegerIndexStorage* storage);
357 
358   // Flushes contents of all storages to underlying files.
359   //
360   // Returns:
361   //   - OK on success
362   //   - INTERNAL_ERROR on I/O error
363   libtextclassifier3::Status PersistStoragesToDisk() override;
364 
365   // Flushes contents of metadata file.
366   //
367   // Returns:
368   //   - OK on success
369   //   - INTERNAL_ERROR on I/O error
370   libtextclassifier3::Status PersistMetadataToDisk() override;
371 
372   // Computes and returns Info checksum.
373   //
374   // Returns:
375   //   - Crc of the Info on success
376   libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override;
377 
378   // Computes and returns all storages checksum. Checksums of sorted_buckets_,
379   // unsorted_buckets_ will be combined together by XOR.
380   // TODO(b/259744228): implement and include flash_index_storage checksum
381   //
382   // Returns:
383   //   - Crc of all storages on success
384   //   - INTERNAL_ERROR if any data inconsistency
385   libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override;
386 
387   // Helper function to add keys in range [it_start, it_end) into the given
388   // bucket. It handles the bucket and its corresponding posting list(s) to make
389   // searching and indexing efficient.
390   //
391   // When the (single) posting list of the bucket is full:
392   // - If the size of posting list hasn't reached the max size, then just simply
393   //   add a new key into it, and PostingListAccessor mechanism will
394   //   automatically double the size of the posting list.
395   // - Else:
396   //   - If the bucket is splittable (i.e. key_lower < key_upper), then split it
397   //     into several new buckets with new ranges, and split the data (according
398   //     to their keys and the range of new buckets) of the original posting
399   //     list into several new posting lists.
400   //   - Otherwise, just simply add a new key into it, and PostingListAccessor
401   //     mechanism will automatically create a new max size posting list and
402   //     chain them.
403   //
404   // Returns:
405   //   - On success: a vector of new Buckets (to add into the unsorted bucket
406   //     array later)
407   //   - Any FileBackedVector or PostingList errors
408   libtextclassifier3::StatusOr<std::vector<Bucket>>
409   AddKeysIntoBucketAndSplitIfNecessary(
410       DocumentId document_id, SectionId section_id,
411       const std::vector<int64_t>::const_iterator& it_start,
412       const std::vector<int64_t>::const_iterator& it_end,
413       FileBackedVector<Bucket>::MutableView& mutable_bucket);
414 
415   // Merges all unsorted buckets into sorted buckets and clears unsorted
416   // buckets.
417   //
418   // Returns:
419   //   - OK on success
420   //   - OUT_OF_RANGE_ERROR if sorted buckets length exceeds the limit after
421   //     merging
422   //   - Any FileBackedVector errors
423   libtextclassifier3::Status SortBuckets();
424 
crcs()425   Crcs& crcs() override {
426     return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
427                                     kCrcsMetadataFileOffset);
428   }
429 
crcs()430   const Crcs& crcs() const override {
431     return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
432                                           kCrcsMetadataFileOffset);
433   }
434 
info()435   Info& info() {
436     return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
437                                     kInfoMetadataFileOffset);
438   }
439 
info()440   const Info& info() const {
441     return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
442                                           kInfoMetadataFileOffset);
443   }
444 
445   Options options_;
446 
447   PostingListIntegerIndexSerializer* posting_list_serializer_;  // Does not own.
448 
449   std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
450   std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets_;
451   std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets_;
452   std::unique_ptr<FlashIndexStorage> flash_index_storage_;
453 };
454 
455 }  // namespace lib
456 }  // namespace icing
457 
458 #endif  // ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_
459