• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
16 #define ICING_INDEX_MAIN_MAIN_INDEX_H_
17 
18 #include <memory>
19 
20 #include "icing/text_classifier/lib3/utils/base/status.h"
21 #include "icing/text_classifier/lib3/utils/base/statusor.h"
22 #include "icing/file/filesystem.h"
23 #include "icing/file/posting_list/flash-index-storage.h"
24 #include "icing/index/lite/term-id-hit-pair.h"
25 #include "icing/index/main/posting-list-hit-accessor.h"
26 #include "icing/index/main/posting-list-hit-serializer.h"
27 #include "icing/index/term-id-codec.h"
28 #include "icing/index/term-metadata.h"
29 #include "icing/legacy/index/icing-dynamic-trie.h"
30 #include "icing/legacy/index/icing-filesystem.h"
31 #include "icing/proto/debug.pb.h"
32 #include "icing/proto/scoring.pb.h"
33 #include "icing/proto/storage.pb.h"
34 #include "icing/proto/term.pb.h"
35 #include "icing/store/namespace-id.h"
36 #include "icing/store/suggestion-result-checker.h"
37 #include "icing/util/status-macros.h"
38 
39 namespace icing {
40 namespace lib {
41 
42 class MainIndex {
43  public:
44   // RETURNS:
45   //  - valid instance of MainIndex, on success.
46   //  - INTERNAL error if unable to create the lexicon or flash storage.
47   static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create(
48       const std::string& index_directory, const Filesystem* filesystem,
49       const IcingFilesystem* icing_filesystem);
50 
51   // Reads magic from existing flash index storage file header. We need this
52   // during Icing initialization phase to determine the version.
53   //
54   // RETURNS:
55   //   - On success, a valid magic.
56   //   - NOT_FOUND if the flash index doesn't exist.
57   //   - INTERNAL on I/O error.
58   static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
59       const Filesystem* filesystem, const std::string& index_directory);
60 
61   // Get a PostingListHitAccessor that holds the posting list chain for 'term'.
62   //
63   // RETURNS:
64   //  - On success, a valid PostingListHitAccessor
65   //  - NOT_FOUND if term is not present in the main index.
66   libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
67   GetAccessorForExactTerm(const std::string& term);
68 
69   // Get a PostingListHitAccessor for 'prefix'.
70   //
71   // RETURNS:
72   //  - On success, a result containing a valid PostingListHitAccessor.
73   //  - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
74   //    prefix are present in the main index.
75   struct GetPrefixAccessorResult {
76     // A PostingListHitAccessor that holds the posting list chain for the term
77     // that best represents 'prefix' in the main index.
78     std::unique_ptr<PostingListHitAccessor> accessor;
79     // True if the returned posting list chain is for 'prefix' or false if the
80     // returned posting list chain is for a term for which 'prefix' is a prefix.
81     bool exact;
82 
GetPrefixAccessorResultGetPrefixAccessorResult83     explicit GetPrefixAccessorResult(
84         std::unique_ptr<PostingListHitAccessor> accessor_in, bool exact_in)
85         : accessor(std::move(accessor_in)), exact(exact_in) {}
86   };
87   libtextclassifier3::StatusOr<GetPrefixAccessorResult>
88   GetAccessorForPrefixTerm(const std::string& prefix);
89 
90   // Finds terms with the given prefix in the given result checker. The
91   // input prefix must be normalized, otherwise inaccurate results may be
92   // returned. If scoring_match_type is EXACT, only exact hit will be counted
93   // and it is PREFIX, both prefix and exact hits will be counted. Results are
94   // not sorted specifically and are in lexigraphical order. Number of results
95   // are no more than 'num_to_return'.
96   //
97   // Returns:
98   //   A list of TermMetadata on success
99   //   INTERNAL_ERROR if failed to access term data.
100   libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
101       const std::string& prefix, TermMatchType::Code scoring_match_type,
102       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
103       const SuggestionResultChecker* suggestion_result_checker);
104 
105   struct LexiconMergeOutputs {
106     // Maps from main_lexicon tvi for new branching point to the main_lexicon
107     // tvi for posting list whose hits must be backfilled.
108     std::unordered_map<uint32_t, uint32_t> backfill_map;
109 
110     // Maps from lexicon tvis to main_lexicon tvis.
111     std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;
112 
113     // Maps from main lexicon tvi to the block index. Tvis with no entry do not
114     // have an allocated posting list.
115     std::unordered_map<uint32_t, int> main_tvi_to_block_index;
116 
117     // Maps from the lexicon tvi to the beginning position in
118     // prefix_tvis_buf and the length.
119     std::unordered_map<uint32_t, std::pair<int, int>>
120         other_tvi_to_prefix_main_tvis;
121 
122     // Stores tvis that are mapped to by other_tvi_to_prefix_tvis.
123     std::vector<uint32_t> prefix_tvis_buf;
124   };
125 
126   // Merge the lexicon into the main lexicon and populate the data
127   // structures necessary to translate lite tvis to main tvis, track backfilling
128   // and expanding lite terms to prefix terms.
129   //
130   // RETURNS:
131   //   - OK on success
132   //   - INTERNAL on IO error while writing to the main lexicon.
MergeLexicon(const IcingDynamicTrie & other_lexicon)133   libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
134       const IcingDynamicTrie& other_lexicon) {
135     // Backfill branch points need to be added first so that the backfill_map
136     // can be correctly populated.
137     ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
138                            AddBackfillBranchPoints(other_lexicon));
139     ICING_ASSIGN_OR_RETURN(outputs,
140                            AddTerms(other_lexicon, std::move(outputs)));
141     // Non-backfill branch points need to be added last so that the mapping of
142     // newly added terms to prefix terms can be correctly populated (prefix
143     // terms might be branch points between two new terms or between a
144     // pre-existing term and a new term).
145     ICING_ASSIGN_OR_RETURN(outputs,
146                            AddBranchPoints(other_lexicon, std::move(outputs)));
147     return outputs;
148   }
149 
150   // Add hits to the main index and backfill from existing posting lists to new
151   // backfill branch points.
152   //
153   // The backfill_map maps from main_lexicon tvi for a newly added branching
154   // point to the main_lexicon tvi for the posting list whose hits must be
155   // backfilled. backfill_map should be populated as part of LexiconMergeOutputs
156   // in MergeLexicon and be blindly passed to this function.
157   //
158   // RETURNS:
159   //  - OK on success
160   //  - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
161   //  exceeds the max TermId, is not valid or is not less than pre-existing hits
162   //  in the main index.
163   //  - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
164   //  - RESOURCE_EXHAUSTED error if unable to grow the index
165   libtextclassifier3::Status AddHits(
166       const TermIdCodec& term_id_codec,
167       std::unordered_map<uint32_t, uint32_t>&& backfill_map,
168       std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id);
169 
PersistToDisk()170   libtextclassifier3::Status PersistToDisk() {
171     if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) {
172       return libtextclassifier3::Status::OK;
173     }
174     return absl_ports::InternalError("Unable to sync main index components.");
175   }
176 
last_added_document_id()177   DocumentId last_added_document_id() const {
178     return flash_index_storage_->get_last_indexed_docid();
179   }
180 
Reset()181   libtextclassifier3::Status Reset() {
182     ICING_RETURN_IF_ERROR(flash_index_storage_->Reset());
183     main_lexicon_->Clear();
184     return libtextclassifier3::Status::OK;
185   }
186 
Warm()187   void Warm() { main_lexicon_->Warm(); }
188 
189   // Returns:
190   //  - elements size of lexicon and index, on success
191   //  - INTERNAL on IO error
192   libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
193 
194   // Takes the provided storage_info, populates the fields related to the main
195   // index and returns that storage_info.
196   //
197   // If an IO error occurs while trying to calculate the value for a field, then
198   // that field will be set to -1.
199   IndexStorageInfoProto GetStorageInfo(
200       IndexStorageInfoProto storage_info) const;
201 
202   // Returns debug information for the main index in out.
203   // verbosity = BASIC, simplest debug information - just the lexicon
204   // verbosity = DETAILED, more detailed debug information including raw
205   // postings lists.
206   std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const;
207 
208   // Reduces internal file sizes by reclaiming space of deleted documents.
209   //
210   // This method will update the last_added_docid of the index to the largest
211   // document id that still appears in the index after compaction.
212   //
213   // Returns:
214   //   OK on success
215   //   INTERNAL_ERROR on IO error, this indicates that the index may be in an
216   //                               invalid state and should be cleared.
217   libtextclassifier3::Status Optimize(
218       const std::vector<DocumentId>& document_id_old_to_new);
219 
220  private:
221   explicit MainIndex(const std::string& index_directory,
222                      const Filesystem* filesystem,
223                      const IcingFilesystem* icing_filesystem);
224 
225   libtextclassifier3::Status Init();
226 
227   // Helpers for merging the lexicon
228   // Add all 'backfill' branch points. Backfill branch points are prefix
229   // branch points that are a prefix of terms that existed in the lexicon
230   // to the merge.
231   //
232   // For example, if the main lexicon only contains "foot" and is then merged
233   // with a lite lexicon containing only "fool", then a backfill branch point
234   // for "foo" will be added to contain prefix hits from both the pre-existing
235   // posting list for "foot" and the new posting list for "fool".
236   //
237   // Populates LexiconMergeOutputs.backfill_map
238   //
239   // RETURNS:
240   //   - OK on success
241   //   - INTERNAL on IO error while writing to the main lexicon.
242   libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
243       const IcingDynamicTrie& other_lexicon);
244 
245   // Add all terms from the lexicon.
246   //
247   // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
248   //
249   // RETURNS:
250   //   - OK on success
251   //   - INTERNAL on IO error while writing to the main lexicon.
252   libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
253       const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
254 
255   // Add all branch points for terms added from the lexicon.
256   // For example, if the main lexicon is empty and is then merged with a
257   // lexicon containing only "foot" and "fool", then a branch point for "foo"
258   // will be added to contain prefix hits from both "foot" and "fool".
259   //
260   // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
261   // LexiconMergeOutputs.prefix_tvis_buf;
262   //
263   // RETURNS:
264   //   - OK on success
265   //   - INTERNAL on IO error while writing to the main lexicon.
266   libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
267       const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
268 
269   // Copies all properties from old_tvi in the other lexicon to the new_tvi in
270   // the main lexicon.
271   // Returns true on success, false if an IO error is encountered.
272   bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
273                       const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
274                       uint32_t new_main_tvi);
275 
276   // Add all hits between [hit_elements, hit_elements + len) to main_index,
277   // updating the entry in the main lexicon at trie_value_index to point to the
278   // resulting posting list. Hits are sorted in descending document id order, so
279   // they should be to posting lists in reverse (starting at hit_elements
280   // + len - 1) and working backwards. Therefore, hit_elements must be in sorted
281   // order.
282   //
283   // trie_value_index may point to a valid posting list id if there is a
284   // pre-existing posting list to append to.
285   //
286   // If backfill_posting_list_id is valid, then the hits from the posting list
287   // identified by backfill_posting_list_id should be added to the new posting
288   // list before the hits in hit_elements.
289   //
290   // RETURNS:
291   //  - OK on success
292   //  - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid
293   //  but points out of bounds in the IndexBlock referred to by
294   //  id.block_index(), if one of the hits from [hit_elements,hit_elements+len)
295   //  is not valid, or if one of the hits from [hit_elements,hit_elements+len)
296   //  is not less than the previously added hits.
297   //  - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid
298   //  but points to an invalid block index or if unable to mmap the IndexBlock.
299   //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
300   //  posting list.
301   libtextclassifier3::Status AddHitsForTerm(
302       uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
303       const TermIdHitPair* hit_elements, size_t len);
304 
305   // Adds all prefix hits or hits from prefix sections present on the posting
306   // list identified by backfill_posting_list_id to hit_accum.
307   //
308   // RETURNS:
309   //  - OK, on success
310   //  - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the
311   //  IndexBlock referred to by id.block_index()
312   //  - INTERNAL_ERROR if unable to mmap the block identified by
313   //  backfill_posting_list_id or if the posting list identified by
314   //  backfill_posting_list_id has been corrupted.
315   //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
316   //  posting list.
317   libtextclassifier3::Status AddPrefixBackfillHits(
318       PostingListIdentifier backfill_posting_list_id,
319       PostingListHitAccessor* hit_accum);
320 
321   // Transfer hits from old_pl_accessor to new_index for term.
322   //
323   // Returns:
324   //   largest document id added to the translated posting list, on success
325   //   INTERNAL_ERROR on IO error
326   static libtextclassifier3::StatusOr<DocumentId> TransferAndAddHits(
327       const std::vector<DocumentId>& document_id_old_to_new, const char* term,
328       PostingListHitAccessor& old_pl_accessor, MainIndex* new_index);
329 
330   // Transfer hits from the current main index to new_index.
331   //
332   // Returns:
333   //   OK on success
334   //   INTERNAL_ERROR on IO error
335   libtextclassifier3::Status TransferIndex(
336       const std::vector<DocumentId>& document_id_old_to_new,
337       MainIndex* new_index);
338 
339   std::string base_dir_;
340   const Filesystem* filesystem_;
341   const IcingFilesystem* icing_filesystem_;
342   std::unique_ptr<PostingListHitSerializer> posting_list_hit_serializer_;
343   std::unique_ptr<FlashIndexStorage> flash_index_storage_;
344   std::unique_ptr<IcingDynamicTrie> main_lexicon_;
345 };
346 
347 }  // namespace lib
348 }  // namespace icing
349 
350 #endif  // ICING_INDEX_MAIN_MAIN_INDEX_H_
351