// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/main/main-index-merger.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/posting_list/posting-list-common.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/logging.h"
#include "icing/util/math-util.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

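// Tracks the best exact hit and the best prefix hit seen for a run of
// equivalent hits (same term_id, document_id and section_id) and merges their
// term frequencies when duplicates are encountered.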
class HitSelector {
 public:
  // Returns whether or not term_id_hit_pair has the same term_id, document_id
  // and section_id as the previously selected hits.
  bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) {
    return prev_.term_id() == term_id_hit_pair.term_id() &&
           prev_.hit().document_id() == term_id_hit_pair.hit().document_id() &&
           prev_.hit().section_id() == term_id_hit_pair.hit().section_id();
  }

  // Merges term_id_hit_pair with previously added hits.
  void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (term_id_hit_pair.hit().is_prefix_hit()) {
      SelectPrefixHitIfBetter(term_id_hit_pair);
    } else {
      SelectExactHitIfBetter(term_id_hit_pair);
    }
    prev_ = term_id_hit_pair;
  }

  // Adds all valid, selected hits to hits starting at position pos in hits.
  // Returns the offset in hits after the position of the last added hit.
  // This function may add between 0-2 hits depending on whether the HitSelector
  // holds both a valid exact hit and a valid prefix hit, one of those, or none.
  size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) {
    // Given the prefix/exact hits for a given term+docid+sectionid, push needed
    // hits into hits array at offset pos. Return new pos.
    if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
      const Hit& prefix_hit = best_prefix_hit_.hit();
      // The prefix hit has term frequency equal to the sum of the term
      // frequencies, capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          prefix_hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          best_prefix_hit_.term_id(),
          Hit(prefix_hit.section_id(), prefix_hit.document_id(),
              final_term_frequency, prefix_hit.is_in_prefix_section(),
              prefix_hit.is_prefix_hit(), prefix_hit.is_stemmed_hit()));
      (*hits)[pos++] = best_prefix_hit_;
      // Ensure sorted.
      if (best_prefix_hit_.hit() < best_exact_hit_.hit()) {
        std::swap((*hits)[pos - 1], (*hits)[pos - 2]);
      }
    } else if (best_prefix_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_prefix_hit_;
    } else if (best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
    }

    return pos;
  }

  void Reset() {
    best_prefix_hit_ = TermIdHitPair();
    best_exact_hit_ = TermIdHitPair();
    prev_ = TermIdHitPair();
  }

 private:
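  // Records term_id_hit_pair as the best prefix hit, or folds its term
  // frequency into the existing best prefix hit for this group.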
  void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_prefix_hit_.hit().is_valid()) {
      best_prefix_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new prefix hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_prefix_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_prefix_hit_.hit().is_in_prefix_section(),
              best_prefix_hit_.hit().is_prefix_hit(),
              best_prefix_hit_.hit().is_stemmed_hit()));
    }
  }

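  // Same as SelectPrefixHitIfBetter, but for exact (non-prefix) hits.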
  void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_exact_hit_.hit().is_valid()) {
      best_exact_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new exact hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_exact_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_exact_hit_.hit().is_in_prefix_section(),
              best_exact_hit_.hit().is_prefix_hit(),
              best_exact_hit_.hit().is_stemmed_hit()));
    }
  }

  TermIdHitPair best_prefix_hit_;
  TermIdHitPair best_exact_hit_;
  TermIdHitPair prev_;
};

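// Returns the index of the flash block that holds the posting list for
// term_id, or kInvalidBlockIndex if the term info cannot be decoded or the
// term has no posting list in the main index yet.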
int GetIndexBlock(
    uint32_t term_id,
    const std::unordered_map<uint32_t, int>& main_tvi_to_block_index,
    const TermIdCodec& term_id_codec) {
  auto term_info_or = term_id_codec.DecodeTermInfo(term_id);
  if (!term_info_or.ok()) {
    ICING_LOG(WARNING)
        << "Unable to decode term-info during merge. This shouldn't happen.";
    return kInvalidBlockIndex;
  }
  TermIdCodec::DecodedTermInfo term_info = std::move(term_info_or).ValueOrDie();
  auto itr = main_tvi_to_block_index.find(term_info.tvi);
  if (itr == main_tvi_to_block_index.end()) {
    return kInvalidBlockIndex;
  }
  return itr->second;
}

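// Sorts hits primarily by the index block of each term's posting list and
// secondarily by hit value; the rationale is described in the comparator
// below.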
void SortHits(std::vector<TermIdHitPair>& hits,
              const std::unordered_map<uint32_t, int>& main_tvi_to_block_index,
              const TermIdCodec& term_id_codec) {
  std::vector<int> indices;
  indices.reserve(hits.size());
  std::vector<int> index_blocks;
  index_blocks.reserve(hits.size());
  for (int i = 0; i < hits.size(); ++i) {
    indices.push_back(i);
    index_blocks.push_back(GetIndexBlock(
        hits[i].term_id(), main_tvi_to_block_index, term_id_codec));
  }
  std::sort(indices.begin(), indices.end(), [&](int i, int j) {
    // Primary sort by index block. This achieves two things:
    // 1. It reduces the number of flash writes by grouping together new hits
    // for terms whose posting lists might share the same index block.
    // 2. More importantly, this ensures that newly added backfill branch points
    // will be populated first (because all newly added terms have an invalid
    // block index of 0) before any new hits are added to the postings lists
    // that they backfill from.
    if (index_blocks[i] == index_blocks[j]) {
      return hits[i].value() < hits[j].value();
    }
    return index_blocks[i] < index_blocks[j];
  });

  math_util::ApplyPermutation(indices, hits);
}

// A helper function to dedupe hits stored in hits. Suppose that the lite index
// contained a single document with two hits in a single prefix section: "foot"
// and "fool". When expanded, there would be four hits:
// {"fo", docid0, sectionid0}
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// The first two are duplicates of each other. So, this function will dedupe
// and shrink hits to be:
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// When two or more prefix hits are duplicates, merge them into one hit whose
// term frequency is the sum of their term frequencies. If there is both an
// exact and a prefix hit for the same term, keep the exact hit as it is and
// update the prefix hit so that its term frequency is the sum of the term
// frequencies.
void DedupeHits(
    std::vector<TermIdHitPair>* hits, const TermIdCodec& term_id_codec,
    const std::unordered_map<uint32_t, int>& main_tvi_to_block_index) {
  // Now all terms are grouped together and all hits for a term are sorted.
  // Merge equivalent hits into one.
  SortHits(*hits, main_tvi_to_block_index, term_id_codec);
  size_t current_offset = 0;
  HitSelector hit_selector;
  for (const TermIdHitPair& term_id_hit_pair : *hits) {
    if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) {
      // We've reached a new hit. Insert the previously selected hits that we
      // had accumulated and reset to add this new hit.
      current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
      hit_selector.Reset();
    }
    // Update best exact and prefix hit.
    hit_selector.SelectIfBetter(term_id_hit_pair);
  }

  // Push last.
  current_offset = hit_selector.InsertSelectedHits(current_offset, hits);

  hits->resize(current_offset);
}

// Based on experiments with full prefix expansion, the multiplier
// is ~4x.
constexpr int kAvgPrefixesPerTerm = 4;

}  // namespace

libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
MainIndexMerger::TranslateAndExpandLiteHits(
    const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
    const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) {
  std::vector<TermIdHitPair> hits;
  if (lite_index.empty()) {
    return hits;
  }
  // Reserve enough space for the average number of prefixes per term and the
  // terms themselves.
  hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1));

  // Translate lite tvis to main tvis.
  for (const TermIdHitPair& term_id_hit_pair : lite_index) {
    uint32_t cur_term_id = term_id_hit_pair.term_id();
    ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
                           term_id_codec.DecodeTermInfo(cur_term_id));
    Hit hit(term_id_hit_pair.hit());

    // 1. Translate and push original.
    auto itr =
        lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi);
    if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) {
      // b/37273773
      return absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Trying to translate lite tvi %u that was never added to the lexicon",
          cur_decoded_term.tvi));
    }
    ICING_ASSIGN_OR_RETURN(uint32_t term_id,
                           term_id_codec.EncodeTvi(itr->second, TviType::MAIN));
    hits.emplace_back(term_id, hit);

    // 2. Expand hits in prefix sections.
    if (hit.is_in_prefix_section()) {
      // Hit was in a prefix section. Push prefixes. Turn on prefix bit.
      auto itr_prefixes =
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find(
              cur_decoded_term.tvi);
      if (itr_prefixes ==
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) {
        ICING_VLOG(1) << "No necessary prefix expansion for "
                      << cur_decoded_term.tvi;
        continue;
      }
      // The tvis of all prefixes of this hit's term that appear in the main
      // lexicon are between [prefix_tvis_buf[offset],
      // prefix_tvis_buf[offset+len]).
      size_t offset = itr_prefixes->second.first;
      size_t len = itr_prefixes->second.second;
      size_t offset_end_exclusive = offset + len;
      Hit prefix_hit(hit.section_id(), hit.document_id(), hit.term_frequency(),
                     /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true,
                     /*is_stemmed_hit=*/false);
      for (; offset < offset_end_exclusive; ++offset) {
        // Take the tvi (in the main lexicon) of each prefix term.
        uint32_t prefix_main_tvi =
            lexicon_merge_outputs.prefix_tvis_buf[offset];
        // Convert it to a term_id.
        ICING_ASSIGN_OR_RETURN(
            uint32_t prefix_term_id,
            term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN));
        // Add an element for this prefix TermId and prefix Hit to hits.
        hits.emplace_back(prefix_term_id, prefix_hit);
      }
    }
  }
  // 3. Remove any duplicate hits.
  DedupeHits(&hits, term_id_codec,
             lexicon_merge_outputs.main_tvi_to_block_index);
  return hits;
}

}  // namespace lib
}  // namespace icing