1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/main/main-index-merger.h"
16 
17 #include <cstdint>
18 #include <cstring>
19 #include <memory>
20 #include <unordered_map>
21 
22 #include "icing/absl_ports/canonical_errors.h"
23 #include "icing/file/posting_list/index-block.h"
24 #include "icing/index/lite/term-id-hit-pair.h"
25 #include "icing/index/term-id-codec.h"
26 #include "icing/legacy/core/icing-string-util.h"
27 #include "icing/util/logging.h"
28 #include "icing/util/status-macros.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 namespace {
34 
35 class HitSelector {
36  public:
37   // Returns whether or not term_id_hit_pair has the same term_id, document_id
38   // and section_id as the previously selected hits.
IsEquivalentHit(const TermIdHitPair & term_id_hit_pair)39   bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) {
40     return prev_.term_id() == term_id_hit_pair.term_id() &&
41            prev_.hit().document_id() == term_id_hit_pair.hit().document_id() &&
42            prev_.hit().section_id() == term_id_hit_pair.hit().section_id();
43   }
44 
45   // Merges term_id_hit_pair with previously added hits.
SelectIfBetter(const TermIdHitPair & term_id_hit_pair)46   void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) {
47     if (term_id_hit_pair.hit().is_prefix_hit()) {
48       SelectPrefixHitIfBetter(term_id_hit_pair);
49     } else {
50       SelectExactHitIfBetter(term_id_hit_pair);
51     }
52     prev_ = term_id_hit_pair;
53   }
54 
55   // Adds all valid, selected hits to hits starting at position pos in hits.
56   // Returns the offset in hits after the position of the last added hit.
57   // This function may add between 0-2 hits depending on whether the HitSelector
58   // holds both a valid exact hit and a valid prefix hit, one of those or none.
InsertSelectedHits(size_t pos,std::vector<TermIdHitPair> * hits)59   size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) {
60     // Given the prefix/exact hits for a given term+docid+sectionid, push needed
61     // hits into hits array at offset pos. Return new pos.
62     if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) {
63       (*hits)[pos++] = best_exact_hit_;
64       const Hit& prefix_hit = best_prefix_hit_.hit();
65       // The prefix hit has score equal to the sum of the scores, capped at
66       // kMaxTermFrequency.
67       Hit::TermFrequency final_term_frequency = std::min(
68           static_cast<int>(Hit::kMaxTermFrequency),
69           prefix_hit.term_frequency() + best_exact_hit_.hit().term_frequency());
70       best_prefix_hit_ = TermIdHitPair(
71           best_prefix_hit_.term_id(),
72           Hit(prefix_hit.section_id(), prefix_hit.document_id(),
73               final_term_frequency, prefix_hit.is_in_prefix_section(),
74               prefix_hit.is_prefix_hit()));
75       (*hits)[pos++] = best_prefix_hit_;
76       // Ensure sorted.
77       if (best_prefix_hit_.hit() < best_exact_hit_.hit()) {
78         std::swap((*hits)[pos - 1], (*hits)[pos - 2]);
79       }
80     } else if (best_prefix_hit_.hit().is_valid()) {
81       (*hits)[pos++] = best_prefix_hit_;
82     } else if (best_exact_hit_.hit().is_valid()) {
83       (*hits)[pos++] = best_exact_hit_;
84     }
85 
86     return pos;
87   }
88 
Reset()89   void Reset() {
90     best_prefix_hit_ = TermIdHitPair();
91     best_exact_hit_ = TermIdHitPair();
92     prev_ = TermIdHitPair();
93   }
94 
95  private:
SelectPrefixHitIfBetter(const TermIdHitPair & term_id_hit_pair)96   void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
97     if (!best_prefix_hit_.hit().is_valid()) {
98       best_prefix_hit_ = term_id_hit_pair;
99     } else {
100       const Hit& hit = term_id_hit_pair.hit();
101       // Create a new prefix hit with term_frequency as the sum of the term
102       // frequencies. The term frequency is capped at kMaxTermFrequency.
103       Hit::TermFrequency final_term_frequency = std::min(
104           static_cast<int>(Hit::kMaxTermFrequency),
105           hit.term_frequency() + best_prefix_hit_.hit().term_frequency());
106       best_prefix_hit_ = TermIdHitPair(
107           term_id_hit_pair.term_id(),
108           Hit(hit.section_id(), hit.document_id(), final_term_frequency,
109               best_prefix_hit_.hit().is_in_prefix_section(),
110               best_prefix_hit_.hit().is_prefix_hit()));
111     }
112   }
113 
SelectExactHitIfBetter(const TermIdHitPair & term_id_hit_pair)114   void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
115     if (!best_exact_hit_.hit().is_valid()) {
116       best_exact_hit_ = term_id_hit_pair;
117     } else {
118       const Hit& hit = term_id_hit_pair.hit();
119       // Create a new exact hit with term_frequency as the sum of the term
120       // frequencies. The term frequency is capped at kMaxHitScore.
121       Hit::TermFrequency final_term_frequency = std::min(
122           static_cast<int>(Hit::kMaxTermFrequency),
123           hit.term_frequency() + best_exact_hit_.hit().term_frequency());
124       best_exact_hit_ = TermIdHitPair(
125           term_id_hit_pair.term_id(),
126           Hit(hit.section_id(), hit.document_id(), final_term_frequency,
127               best_exact_hit_.hit().is_in_prefix_section(),
128               best_exact_hit_.hit().is_prefix_hit()));
129     }
130   }
131 
132   TermIdHitPair best_prefix_hit_;
133   TermIdHitPair best_exact_hit_;
134   TermIdHitPair prev_;
135 };
136 
137 class HitComparator {
138  public:
HitComparator(const TermIdCodec & term_id_codec,const std::unordered_map<uint32_t,int> & main_tvi_to_block_index)139   explicit HitComparator(
140       const TermIdCodec& term_id_codec,
141       const std::unordered_map<uint32_t, int>& main_tvi_to_block_index)
142       : term_id_codec_(&term_id_codec),
143         main_tvi_to_block_index_(&main_tvi_to_block_index) {}
144 
operator ()(const TermIdHitPair & lhs,const TermIdHitPair & rhs) const145   bool operator()(const TermIdHitPair& lhs, const TermIdHitPair& rhs) const {
146     // Primary sort by index block. This acheives two things:
147     // 1. It reduces the number of flash writes by grouping together new hits
148     // for terms whose posting lists might share the same index block.
149     // 2. More importantly, this ensures that newly added backfill branch points
150     // will be populated first (because all newly added terms have an invalid
151     // block index of 0) before any new hits are added to the postings lists
152     // that they backfill from.
153     int lhs_index_block = GetIndexBlock(lhs.term_id());
154     int rhs_index_block = GetIndexBlock(rhs.term_id());
155     if (lhs_index_block == rhs_index_block) {
156       // Secondary sort by term_id and hit.
157       return lhs.value() < rhs.value();
158     }
159     return lhs_index_block < rhs_index_block;
160   }
161 
162  private:
GetIndexBlock(uint32_t term_id) const163   int GetIndexBlock(uint32_t term_id) const {
164     auto term_info_or = term_id_codec_->DecodeTermInfo(term_id);
165     if (!term_info_or.ok()) {
166       ICING_LOG(WARNING)
167           << "Unable to decode term-info during merge. This shouldn't happen.";
168       return kInvalidBlockIndex;
169     }
170     TermIdCodec::DecodedTermInfo term_info =
171         std::move(term_info_or).ValueOrDie();
172     auto itr = main_tvi_to_block_index_->find(term_info.tvi);
173     if (itr == main_tvi_to_block_index_->end()) {
174       return kInvalidBlockIndex;
175     }
176     return itr->second;
177   }
178 
179   const TermIdCodec* term_id_codec_;
180   const std::unordered_map<uint32_t, int>* main_tvi_to_block_index_;
181 };
182 
183 // A helper function to dedupe hits stored in hits. Suppose that the lite index
184 // contained a single document with two hits in a single prefix section: "foot"
185 // and "fool". When expanded, there would be four hits:
186 // {"fo", docid0, sectionid0}
187 // {"fo", docid0, sectionid0}
188 // {"foot", docid0, sectionid0}
189 // {"fool", docid0, sectionid0}
190 //
191 // The first two are duplicates of each other. So, this function will dedupe
192 // and shrink hits to be:
193 // {"fo", docid0, sectionid0}
194 // {"foot", docid0, sectionid0}
195 // {"fool", docid0, sectionid0}
196 //
197 // When two or more prefix hits are duplicates, merge into one hit with term
198 // frequency as the sum of the term frequencies. If there is both an exact and
199 // prefix hit for the same term, keep the exact hit as it is, update the prefix
200 // hit so that its term frequency is the sum of the term frequencies.
DedupeHits(std::vector<TermIdHitPair> * hits,const TermIdCodec & term_id_codec,const std::unordered_map<uint32_t,int> & main_tvi_to_block_index)201 void DedupeHits(
202     std::vector<TermIdHitPair>* hits, const TermIdCodec& term_id_codec,
203     const std::unordered_map<uint32_t, int>& main_tvi_to_block_index) {
204   // Now all terms are grouped together and all hits for a term are sorted.
205   // Merge equivalent hits into one.
206   std::sort(hits->begin(), hits->end(),
207             HitComparator(term_id_codec, main_tvi_to_block_index));
208   size_t current_offset = 0;
209   HitSelector hit_selector;
210   for (const TermIdHitPair& term_id_hit_pair : *hits) {
211     if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) {
212       // We've reached a new hit. Insert the previously selected hits that we
213       // had accumulated and reset to add this new hit.
214       current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
215       hit_selector.Reset();
216     }
217     // Update best exact and prefix hit.
218     hit_selector.SelectIfBetter(term_id_hit_pair);
219   }
220 
221   // Push last.
222   current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
223 
224   hits->resize(current_offset);
225 }
226 
227 // Based on experiments with full prefix expansion, the multiplier
228 // is ~4x.
229 constexpr int kAvgPrefixesPerTerm = 4;
230 
231 }  // namespace
232 
233 libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
TranslateAndExpandLiteHits(const LiteIndex & lite_index,const TermIdCodec & term_id_codec,const MainIndex::LexiconMergeOutputs & lexicon_merge_outputs)234 MainIndexMerger::TranslateAndExpandLiteHits(
235     const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
236     const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) {
237   std::vector<TermIdHitPair> hits;
238   if (lite_index.empty()) {
239     return hits;
240   }
241   // Reserve enough space for the average number of prefixes per term and the
242   // terms themselves.
243   hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1));
244 
245   // Translate lite tvis to main tvis.
246   for (const TermIdHitPair& term_id_hit_pair : lite_index) {
247     uint32_t cur_term_id = term_id_hit_pair.term_id();
248     ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
249                            term_id_codec.DecodeTermInfo(cur_term_id));
250     Hit hit(term_id_hit_pair.hit());
251 
252     // 1. Translate and push original.
253     auto itr =
254         lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi);
255     if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) {
256       // b/37273773
257       return absl_ports::InternalError(IcingStringUtil::StringPrintf(
258           "Trying to translate lite tvi %u that was never added to the lexicon",
259           cur_decoded_term.tvi));
260     }
261     ICING_ASSIGN_OR_RETURN(uint32_t term_id,
262                            term_id_codec.EncodeTvi(itr->second, TviType::MAIN));
263     hits.emplace_back(term_id, hit);
264 
265     // 2. Expand hits in prefix sections.
266     if (hit.is_in_prefix_section()) {
267       // Hit was in a prefix section. Push prefixes. Turn on prefix bit.
268       auto itr_prefixes =
269           lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find(
270               cur_decoded_term.tvi);
271       if (itr_prefixes ==
272           lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) {
273         ICING_VLOG(1) << "No necessary prefix expansion for "
274                       << cur_decoded_term.tvi;
275         continue;
276       }
277       // The tvis of all prefixes of this hit's term that appear in the main
278       // lexicon are between [prefix_tvis_buf[offset],
279       // prefix_tvis_buf[offset+len]).
280       size_t offset = itr_prefixes->second.first;
281       size_t len = itr_prefixes->second.second;
282       size_t offset_end_exclusive = offset + len;
283       Hit prefix_hit(hit.section_id(), hit.document_id(), hit.term_frequency(),
284                      /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
285       for (; offset < offset_end_exclusive; ++offset) {
286         // Take the tvi (in the main lexicon) of each prefix term.
287         uint32_t prefix_main_tvi =
288             lexicon_merge_outputs.prefix_tvis_buf[offset];
289         // Convert it to a term_id.
290         ICING_ASSIGN_OR_RETURN(
291             uint32_t prefix_term_id,
292             term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN));
293         // Create add an element for this prefix TermId and prefix Hit to hits.
294         hits.emplace_back(prefix_term_id, prefix_hit);
295       }
296     }
297   }
298   // 3. Remove any duplicate hits.
299   DedupeHits(&hits, term_id_codec,
300              lexicon_merge_outputs.main_tvi_to_block_index);
301   return hits;
302 }
303 
304 }  // namespace lib
305 }  // namespace icing
306