• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
16 #define ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/file/file-backed-proto.h"
27 #include "icing/file/filesystem.h"
28 #include "icing/file/memory-mapped-file.h"
29 #include "icing/index/numeric/integer-index-storage.h"
30 #include "icing/index/numeric/numeric-index.h"
31 #include "icing/index/numeric/posting-list-integer-index-serializer.h"
32 #include "icing/index/numeric/wildcard-property-storage.pb.h"
33 #include "icing/schema/schema-store.h"
34 #include "icing/store/document-id.h"
35 #include "icing/store/document-store.h"
36 #include "icing/util/crc32.h"
37 
38 namespace icing {
39 namespace lib {
40 
41 // IntegerIndex: a wrapper class for managing IntegerIndexStorage (a lower level
42 // persistent storage class for indexing and searching contents of integer type
43 // sections in documents) instances for different property paths.
44 // We separate indexable integer data from different properties into different
45 // storages, and IntegerIndex manages and handles indexable integer data
46 // appropriately to their corresponding IntegerIndexStorage instance according
47 // to the given property path.
48 class IntegerIndex : public NumericIndex<int64_t> {
49  public:
50   using PropertyToStorageMapType =
51       std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>;
52 
53   // Maximum number of individual property storages that this index will allow
54   // before falling back to placing hits for any new properties into the
55   // 'wildcard' storage.
56   static constexpr int kMaxPropertyStorages = 32;
57 
58   static constexpr int32_t kDefaultNumDataThresholdForBucketSplit =
59       IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit;
60 
61   struct Info {
62     static constexpr int32_t kMagic = 0x5d8a1e8a;
63 
64     int32_t magic;
65     DocumentId last_added_document_id;
66     int32_t num_data_threshold_for_bucket_split;
67 
ComputeChecksumInfo68     Crc32 ComputeChecksum() const {
69       return Crc32(
70           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
71     }
72   } __attribute__((packed));
73   static_assert(sizeof(Info) == 12, "");
74 
75   // Metadata file layout: <Crcs><Info>
76   static constexpr int32_t kCrcsMetadataFileOffset = 0;
77   static constexpr int32_t kInfoMetadataFileOffset =
78       static_cast<int32_t>(sizeof(Crcs));
79   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
80   static_assert(kMetadataFileSize == 24, "");
81 
82   static constexpr WorkingPathType kWorkingPathType =
83       WorkingPathType::kDirectory;
84   static constexpr std::string_view kFilePrefix = "integer_index";
85 
86   // Creates a new IntegerIndex instance to index integers. If any of the
87   // underlying file is missing, then delete the whole working_path and
88   // (re)initialize with new ones. Otherwise initialize and create the instance
89   // by existing files.
90   //
91   // filesystem: Object to make system level calls
92   // working_path: Specifies the working path for PersistentStorage.
93   //               IntegerIndex uses working path as working directory and all
94   //               related files will be stored under this directory. See
95   //               PersistentStorage for more details about the concept of
96   //               working_path.
97   // num_data_threshold_for_bucket_split: see IntegerIndexStorage::Options for
98   //                                      more details.
99   // pre_mapping_fbv: flag indicating whether memory map max possible file size
100   //                  for underlying FileBackedVector before growing the actual
101   //                  file size.
102   //
103   // Returns:
104   //   - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
105   //                               checksum.
106   //   - INTERNAL_ERROR on I/O errors.
107   //   - Any FileBackedVector/MemoryMappedFile errors.
108   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> Create(
109       const Filesystem& filesystem, std::string working_path,
110       int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv);
111 
112   // Deletes IntegerIndex under working_path.
113   //
114   // Returns:
115   //   - OK on success
116   //   - INTERNAL_ERROR on I/O error
Discard(const Filesystem & filesystem,const std::string & working_path)117   static libtextclassifier3::Status Discard(const Filesystem& filesystem,
118                                             const std::string& working_path) {
119     return PersistentStorage::Discard(filesystem, working_path,
120                                       kWorkingPathType);
121   }
122 
123   ~IntegerIndex() override;
124 
125   // Returns an Editor instance for adding new records into integer index for a
126   // given property, DocumentId and SectionId. See Editor for more details.
Edit(std::string_view property_path,DocumentId document_id,SectionId section_id)127   std::unique_ptr<typename NumericIndex<int64_t>::Editor> Edit(
128       std::string_view property_path, DocumentId document_id,
129       SectionId section_id) override {
130     return std::make_unique<Editor>(property_path, document_id, section_id,
131                                     *this, num_data_threshold_for_bucket_split_,
132                                     pre_mapping_fbv_);
133   }
134 
135   // Returns a DocHitInfoIterator for iterating through all docs which have the
136   // specified (integer) property contents in range [query_key_lower,
137   // query_key_upper].
138   // When iterating through all relevant doc hits, it:
139   // - Merges multiple SectionIds of doc hits with same DocumentId into a single
140   //   SectionIdMask and constructs DocHitInfo.
141   // - Returns DocHitInfo in descending DocumentId order.
142   //
143   // Returns:
144   //   - On success: a DocHitInfoIterator instance
145   //   - NOT_FOUND_ERROR if the given property_path doesn't exist
146   //   - Any IntegerIndexStorage errors
147   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
148       std::string_view property_path, int64_t key_lower, int64_t key_upper,
149       const DocumentStore& document_store, const SchemaStore& schema_store,
150       int64_t current_time_ms) const override;
151 
152   // Reduces internal file sizes by reclaiming space and ids of deleted
153   // documents. Integer index will convert all data (hits) to the new document
154   // ids and regenerate all index files. If all data in a property path are
155   // completely deleted, then the underlying storage will be discarded as well.
156   //
157   // - document_id_old_to_new: a map for converting old document id to new
158   //   document id.
159   // - new_last_added_document_id: will be used to update the last added
160   //                               document id in the integer index.
161   //
162   // Returns:
163   //   - OK on success
164   //   - INTERNAL_ERROR on IO error
165   libtextclassifier3::Status Optimize(
166       const std::vector<DocumentId>& document_id_old_to_new,
167       DocumentId new_last_added_document_id) override;
168 
169   // Clears all integer index data by discarding all existing storages, and set
170   // last_added_document_id to kInvalidDocumentId.
171   //
172   // Returns:
173   //   - OK on success
174   //   - INTERNAL_ERROR on I/O error
175   libtextclassifier3::Status Clear() override;
176 
last_added_document_id()177   DocumentId last_added_document_id() const override {
178     return info().last_added_document_id;
179   }
180 
set_last_added_document_id(DocumentId document_id)181   void set_last_added_document_id(DocumentId document_id) override {
182     SetInfoDirty();
183 
184     Info& info_ref = info();
185     if (info_ref.last_added_document_id == kInvalidDocumentId ||
186         document_id > info_ref.last_added_document_id) {
187       info_ref.last_added_document_id = document_id;
188     }
189   }
190 
num_property_indices()191   int num_property_indices() const override {
192     return property_to_storage_map_.size() +
193            ((wildcard_index_storage_ == nullptr) ? 0 : 1);
194   }
195 
196  private:
197   class Editor : public NumericIndex<int64_t>::Editor {
198    public:
Editor(std::string_view property_path,DocumentId document_id,SectionId section_id,IntegerIndex & integer_index,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)199     explicit Editor(std::string_view property_path, DocumentId document_id,
200                     SectionId section_id, IntegerIndex& integer_index,
201                     int32_t num_data_threshold_for_bucket_split,
202                     bool pre_mapping_fbv)
203         : NumericIndex<int64_t>::Editor(property_path, document_id, section_id),
204           integer_index_(integer_index),
205           num_data_threshold_for_bucket_split_(
206               num_data_threshold_for_bucket_split),
207           pre_mapping_fbv_(pre_mapping_fbv) {}
208 
209     ~Editor() override = default;
210 
BufferKey(int64_t key)211     libtextclassifier3::Status BufferKey(int64_t key) override {
212       seen_keys_.push_back(key);
213       return libtextclassifier3::Status::OK;
214     }
215 
216     libtextclassifier3::Status IndexAllBufferedKeys() && override;
217 
218    private:
219     // Vector for caching all seen keys. Since IntegerIndexStorage::AddKeys
220     // sorts and dedupes keys, we can just simply use vector here and move it to
221     // AddKeys().
222     std::vector<int64_t> seen_keys_;
223 
224     IntegerIndex& integer_index_;  // Does not own.
225 
226     int32_t num_data_threshold_for_bucket_split_;
227 
228     // Flag indicating whether memory map max possible file size for underlying
229     // FileBackedVector before growing the actual file size.
230     bool pre_mapping_fbv_;
231   };
232 
IntegerIndex(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,PropertyToStorageMapType && property_to_storage_map,std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> wildcard_property_storage,std::unordered_set<std::string> wildcard_properties_set,std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)233   explicit IntegerIndex(
234       const Filesystem& filesystem, std::string&& working_path,
235       std::unique_ptr<PostingListIntegerIndexSerializer>
236           posting_list_serializer,
237       std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
238       PropertyToStorageMapType&& property_to_storage_map,
239       std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
240           wildcard_property_storage,
241       std::unordered_set<std::string> wildcard_properties_set,
242       std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage,
243       int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv)
244       : NumericIndex<int64_t>(filesystem, std::move(working_path),
245                               kWorkingPathType),
246         posting_list_serializer_(std::move(posting_list_serializer)),
247         metadata_mmapped_file_(std::move(metadata_mmapped_file)),
248         property_to_storage_map_(std::move(property_to_storage_map)),
249         wildcard_property_storage_(std::move(wildcard_property_storage)),
250         wildcard_properties_set_(std::move(wildcard_properties_set)),
251         wildcard_index_storage_(std::move(wildcard_index_storage)),
252         num_data_threshold_for_bucket_split_(
253             num_data_threshold_for_bucket_split),
254         pre_mapping_fbv_(pre_mapping_fbv),
255         is_info_dirty_(false),
256         is_storage_dirty_(false) {}
257 
258   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
259   InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
260                      int32_t num_data_threshold_for_bucket_split,
261                      bool pre_mapping_fbv);
262 
263   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
264   InitializeExistingFiles(const Filesystem& filesystem,
265                           std::string&& working_path,
266                           int32_t num_data_threshold_for_bucket_split,
267                           bool pre_mapping_fbv);
268 
269   // Adds the property path to the list of properties using wildcard storage.
270   // This will both update the in-memory list (wildcard_properties_set_) and
271   // the persistent list (wilcard_property_storage_).
272   //
273   // RETURNS:
274   //   - OK on success
275   //   - INTERNAL_ERROR if unable to successfully persist updated properties
276   //     list in wildcard_property_storage_.
277   libtextclassifier3::Status AddPropertyToWildcardStorage(
278       const std::string& property_path);
279 
280   // Transfers integer index data from the current integer index to
281   // new_integer_index.
282   //
283   // Returns:
284   //   - OK on success
285   //   - INTERNAL_ERROR on I/O error. This could potentially leave the storages
286   //     in an invalid state and the caller should handle it properly (e.g.
287   //     discard and rebuild)
288   libtextclassifier3::Status TransferIndex(
289       const std::vector<DocumentId>& document_id_old_to_new,
290       IntegerIndex* new_integer_index) const;
291 
292   // Transfers integer index data from old_storage to new_integer_index.
293   //
294   // Returns:
295   //   - OK on success
296   //   - INTERNAL_ERROR on I/O error. This could potentially leave the storages
297   //     in an invalid state and the caller should handle it properly (e.g.
298   //     discard and rebuild)
299   libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
300   TransferIntegerIndexStorage(
301       const std::vector<DocumentId>& document_id_old_to_new,
302       const IntegerIndexStorage* old_storage, const std::string& property_path,
303       IntegerIndex* new_integer_index) const;
304 
305   // Transfers the persistent and in-memory list of properties using the
306   // wildcard storage from old_storage to new_integer_index.
307   //
308   // RETURNS:
309   //   - OK on success
310   //   - INTERNAL_ERROR if unable to successfully persist updated properties
311   //     list in new_integer_index.
312   libtextclassifier3::Status TransferWildcardStorage(
313       IntegerIndex* new_integer_index) const;
314 
315   // Flushes contents of all storages to underlying files.
316   //
317   // Returns:
318   //   - OK on success
319   //   - INTERNAL_ERROR on I/O error
320   libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
321 
322   // Flushes contents of metadata file.
323   //
324   // Returns:
325   //   - OK on success
326   //   - INTERNAL_ERROR on I/O error
327   libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
328 
329   // Computes and returns Info checksum.
330   //
331   // Returns:
332   //   - Crc of the Info on success
333   libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
334 
335   // Computes and returns all storages checksum. Checksums of (storage_crc,
336   // property_path) for all existing property paths will be combined together by
337   // XOR.
338   //
339   // Returns:
340   //   - Crc of all storages on success
341   //   - INTERNAL_ERROR if any data inconsistency
342   libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
343       bool force) override;
344 
crcs()345   Crcs& crcs() override {
346     return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
347                                     kCrcsMetadataFileOffset);
348   }
349 
crcs()350   const Crcs& crcs() const override {
351     return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
352                                           kCrcsMetadataFileOffset);
353   }
354 
info()355   Info& info() {
356     return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
357                                     kInfoMetadataFileOffset);
358   }
359 
info()360   const Info& info() const {
361     return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
362                                           kInfoMetadataFileOffset);
363   }
364 
SetInfoDirty()365   void SetInfoDirty() { is_info_dirty_ = true; }
366   // When storage is dirty, we have to set info dirty as well. So just expose
367   // SetDirty to set both.
SetDirty()368   void SetDirty() {
369     is_info_dirty_ = true;
370     is_storage_dirty_ = true;
371   }
372 
is_info_dirty()373   bool is_info_dirty() const { return is_info_dirty_; }
is_storage_dirty()374   bool is_storage_dirty() const { return is_storage_dirty_; }
375 
376   std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer_;
377 
378   std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
379 
380   // Property path to integer index storage map.
381   PropertyToStorageMapType property_to_storage_map_;
382 
383   // Persistent list of properties that have added content to
384   // wildcard_index_storage_.
385   std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
386       wildcard_property_storage_;
387 
388   // In-memory list of properties that have added content to
389   // wildcard_index_storage_.
390   std::unordered_set<std::string> wildcard_properties_set_;
391 
392   // The index storage that is used once we have already created
393   // kMaxPropertyStorages in property_to_storage_map.
394   std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_;
395 
396   int32_t num_data_threshold_for_bucket_split_;
397 
398   // Flag indicating whether memory map max possible file size for underlying
399   // FileBackedVector before growing the actual file size.
400   bool pre_mapping_fbv_;
401 
402   bool is_info_dirty_;
403   bool is_storage_dirty_;
404 };
405 
406 }  // namespace lib
407 }  // namespace icing
408 
409 #endif  // ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
410