• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/scoring/ranker.h"
16 
17 #include <algorithm>
18 #include <vector>
19 
20 #include "icing/text_classifier/lib3/utils/base/statusor.h"
21 #include "icing/absl_ports/canonical_errors.h"
22 #include "icing/scoring/scored-document-hit.h"
23 #include "icing/util/logging.h"
24 
25 namespace icing {
26 namespace lib {
27 
28 namespace {
29 // For all the heap manipulations in this file, we use a vector to represent the
30 // heap. The element at index 0 is the root node. For any node at index i, its
31 // left child node is at 2 * i + 1, its right child node is at 2 * i + 2.
32 
33 // Helper function to wrap the heapify algorithm, it heapifies the target
34 // subtree node in place.
35 // TODO(b/152934343) refactor the heapify function and making it into a class.
Heapify(std::vector<ScoredDocumentHit> * scored_document_hits,int target_subtree_root_index,const ScoredDocumentHitComparator & scored_document_hit_comparator)36 void Heapify(
37     std::vector<ScoredDocumentHit>* scored_document_hits,
38     int target_subtree_root_index,
39     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
40   const int heap_size = scored_document_hits->size();
41   if (target_subtree_root_index >= heap_size) {
42     return;
43   }
44 
45   // Initializes subtree root as the current best node.
46   int best = target_subtree_root_index;
47   // If we represent a heap in an array/vector, indices of left and right
48   // children can be calculated.
49   const int left = target_subtree_root_index * 2 + 1;
50   const int right = target_subtree_root_index * 2 + 2;
51 
52   // If left child is better than current best
53   if (left < heap_size &&
54       scored_document_hit_comparator(scored_document_hits->at(left),
55                                      scored_document_hits->at(best))) {
56     best = left;
57   }
58 
59   // If right child is better than current best
60   if (right < heap_size &&
61       scored_document_hit_comparator(scored_document_hits->at(right),
62                                      scored_document_hits->at(best))) {
63     best = right;
64   }
65 
66   // If the best is not the subtree root, swap and continue heapifying the lower
67   // level subtree
68   if (best != target_subtree_root_index) {
69     std::swap(scored_document_hits->at(best),
70               scored_document_hits->at(target_subtree_root_index));
71     Heapify(scored_document_hits, best, scored_document_hit_comparator);
72   }
73 }
74 
75 // Heapify the given term vector from top to bottom. Call it after add or
76 // replace an element at the front of the vector.
HeapifyTermDown(std::vector<TermMetadata> & scored_terms,int target_subtree_root_index)77 void HeapifyTermDown(std::vector<TermMetadata>& scored_terms,
78                      int target_subtree_root_index) {
79   int heap_size = scored_terms.size();
80   if (target_subtree_root_index >= heap_size) {
81     return;
82   }
83 
84   // Initializes subtree root as the current minimum node.
85   int min = target_subtree_root_index;
86   // If we represent a heap in an array/vector, indices of left and right
87   // children can be calculated as such.
88   const int left = target_subtree_root_index * 2 + 1;
89   const int right = target_subtree_root_index * 2 + 2;
90 
91   // If left child is smaller than current minimum.
92   if (left < heap_size &&
93       scored_terms.at(left).score < scored_terms.at(min).score) {
94     min = left;
95   }
96 
97   // If right child is smaller than current minimum.
98   if (right < heap_size &&
99       scored_terms.at(right).score < scored_terms.at(min).score) {
100     min = right;
101   }
102 
103   // If the minimum is not the subtree root, swap and continue heapifying the
104   // lower level subtree.
105   if (min != target_subtree_root_index) {
106     std::swap(scored_terms.at(min), scored_terms.at(target_subtree_root_index));
107     HeapifyTermDown(scored_terms, min);
108   }
109 }
110 
111 // Heapify the given term vector from bottom to top. Call it after add an
112 // element at the end of the vector.
HeapifyTermUp(std::vector<TermMetadata> & scored_terms,int target_subtree_child_index)113 void HeapifyTermUp(std::vector<TermMetadata>& scored_terms,
114                    int target_subtree_child_index) {
115   // If we represent a heap in an array/vector, indices of root can be
116   // calculated as such.
117   const int root = (target_subtree_child_index + 1) / 2 - 1;
118 
119   // If the current child is smaller than the root, swap and continue heapifying
120   // the upper level subtree
121   if (root >= 0 && scored_terms.at(target_subtree_child_index).score <
122                        scored_terms.at(root).score) {
123     std::swap(scored_terms.at(root),
124               scored_terms.at(target_subtree_child_index));
125     HeapifyTermUp(scored_terms, root);
126   }
127 }
128 
PopRootTerm(std::vector<TermMetadata> & scored_terms)129 TermMetadata PopRootTerm(std::vector<TermMetadata>& scored_terms) {
130   if (scored_terms.empty()) {
131     // Return an invalid TermMetadata as a sentinel value.
132     return TermMetadata(/*content_in=*/"", /*hit_count_in=*/-1);
133   }
134 
135   // Steps to extract root from heap:
136   // 1. copy out root
137   TermMetadata root = scored_terms.at(0);
138   const size_t last_node_index = scored_terms.size() - 1;
139   // 2. swap root and the last node
140   std::swap(scored_terms.at(0), scored_terms.at(last_node_index));
141   // 3. remove last node
142   scored_terms.pop_back();
143   // 4. heapify root
144   HeapifyTermDown(scored_terms, /*target_subtree_root_index=*/0);
145   return root;
146 }
147 
148 }  // namespace
149 
BuildHeapInPlace(std::vector<ScoredDocumentHit> * scored_document_hits,const ScoredDocumentHitComparator & scored_document_hit_comparator)150 void BuildHeapInPlace(
151     std::vector<ScoredDocumentHit>* scored_document_hits,
152     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
153   const int heap_size = scored_document_hits->size();
154   // Since we use a vector to represent the heap, [size / 2 - 1] is the index
155   // of the parent node of the last node.
156   for (int subtree_root_index = heap_size / 2 - 1; subtree_root_index >= 0;
157        subtree_root_index--) {
158     Heapify(scored_document_hits, subtree_root_index,
159             scored_document_hit_comparator);
160   }
161 }
162 
PushToTermHeap(TermMetadata term,int number_to_return,std::vector<TermMetadata> & scored_terms_heap)163 void PushToTermHeap(TermMetadata term, int number_to_return,
164                     std::vector<TermMetadata>& scored_terms_heap) {
165   if (scored_terms_heap.size() < number_to_return) {
166     scored_terms_heap.push_back(std::move(term));
167     // We insert at end, so we should heapify bottom up.
168     HeapifyTermUp(scored_terms_heap, scored_terms_heap.size() - 1);
169   } else if (scored_terms_heap.at(0).score < term.score) {
170     scored_terms_heap.at(0) = std::move(term);
171     // We insert at root, so we should heapify top down.
172     HeapifyTermDown(scored_terms_heap, /*target_subtree_root_index=*/0);
173   }
174 }
175 
PopNextTopResultFromHeap(std::vector<ScoredDocumentHit> * scored_document_hits_heap,const ScoredDocumentHitComparator & scored_document_hit_comparator)176 libtextclassifier3::StatusOr<ScoredDocumentHit> PopNextTopResultFromHeap(
177     std::vector<ScoredDocumentHit>* scored_document_hits_heap,
178     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
179   if (scored_document_hits_heap->empty()) {
180     // An invalid ScoredDocumentHit
181     return absl_ports::ResourceExhaustedError("Heap is empty");
182   }
183 
184   // Steps to extract root from heap:
185   // 1. copy out root
186   ScoredDocumentHit root = scored_document_hits_heap->at(0);
187   const size_t last_node_index = scored_document_hits_heap->size() - 1;
188   // 2. swap root and the last node
189   std::swap(scored_document_hits_heap->at(0),
190             scored_document_hits_heap->at(last_node_index));
191   // 3. remove last node
192   scored_document_hits_heap->pop_back();
193   // 4. heapify root
194   Heapify(scored_document_hits_heap, /*target_subtree_root_index=*/0,
195           scored_document_hit_comparator);
196   return root;
197 }
198 
PopTopResultsFromHeap(std::vector<ScoredDocumentHit> * scored_document_hits_heap,int num_results,const ScoredDocumentHitComparator & scored_document_hit_comparator)199 std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
200     std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
201     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
202   std::vector<ScoredDocumentHit> scored_document_hit_result;
203   int result_size = std::min(
204       num_results, static_cast<int>(scored_document_hits_heap->size()));
205   while (result_size-- > 0) {
206     libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
207         PopNextTopResultFromHeap(scored_document_hits_heap,
208                                  scored_document_hit_comparator);
209     if (next_best_document_hit_or.ok()) {
210       scored_document_hit_result.push_back(
211           std::move(next_best_document_hit_or).ValueOrDie());
212     } else {
213       ICING_VLOG(1) << next_best_document_hit_or.status().error_message();
214     }
215   }
216   return scored_document_hit_result;
217 }
218 
PopAllTermsFromHeap(std::vector<TermMetadata> & scored_terms_heap)219 std::vector<TermMetadata> PopAllTermsFromHeap(
220     std::vector<TermMetadata>& scored_terms_heap) {
221   std::vector<TermMetadata> top_term_result;
222   top_term_result.reserve(scored_terms_heap.size());
223   while (!scored_terms_heap.empty()) {
224     top_term_result.push_back(PopRootTerm(scored_terms_heap));
225   }
226   return top_term_result;
227 }
228 
229 }  // namespace lib
230 }  // namespace icing
231