• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/scoring/ranker.h"
16 
17 #include <algorithm>
18 #include <vector>
19 
20 #include "icing/text_classifier/lib3/utils/base/statusor.h"
21 #include "icing/absl_ports/canonical_errors.h"
22 #include "icing/scoring/scored-document-hit.h"
23 #include "icing/util/logging.h"
24 
25 namespace icing {
26 namespace lib {
27 
28 namespace {
29 // For all the heap manipulations in this file, we use a vector to represent the
30 // heap. The element at index 0 is the root node. For any node at index i, its
31 // left child node is at 2 * i + 1, its right child node is at 2 * i + 2.
32 
33 // Helper function to wrap the heapify algorithm, it heapifies the target
34 // subtree node in place.
35 // TODO(b/152934343) refactor the heapify function and making it into a class.
Heapify(std::vector<ScoredDocumentHit> * scored_document_hits,int target_subtree_root_index,const ScoredDocumentHitComparator & scored_document_hit_comparator)36 void Heapify(
37     std::vector<ScoredDocumentHit>* scored_document_hits,
38     int target_subtree_root_index,
39     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
40   const int heap_size = scored_document_hits->size();
41   if (target_subtree_root_index >= heap_size) {
42     return;
43   }
44 
45   // Initializes subtree root as the current best node.
46   int best = target_subtree_root_index;
47   // If we represent a heap in an array/vector, indices of left and right
48   // children can be calculated.
49   const int left = target_subtree_root_index * 2 + 1;
50   const int right = target_subtree_root_index * 2 + 2;
51 
52   // If left child is better than current best
53   if (left < heap_size &&
54       scored_document_hit_comparator(scored_document_hits->at(left),
55                                      scored_document_hits->at(best))) {
56     best = left;
57   }
58 
59   // If right child is better than current best
60   if (right < heap_size &&
61       scored_document_hit_comparator(scored_document_hits->at(right),
62                                      scored_document_hits->at(best))) {
63     best = right;
64   }
65 
66   // If the best is not the subtree root, swap and continue heapifying the lower
67   // level subtree
68   if (best != target_subtree_root_index) {
69     std::swap(scored_document_hits->at(best),
70               scored_document_hits->at(target_subtree_root_index));
71     Heapify(scored_document_hits, best, scored_document_hit_comparator);
72   }
73 }
74 
75 // Heapify the given term vector from top to bottom. Call it after add or
76 // replace an element at the front of the vector.
HeapifyTermDown(std::vector<TermMetadata> & scored_terms,int target_subtree_root_index)77 void HeapifyTermDown(std::vector<TermMetadata>& scored_terms,
78                      int target_subtree_root_index) {
79   int heap_size = scored_terms.size();
80   if (target_subtree_root_index >= heap_size) {
81     return;
82   }
83 
84   // Initializes subtree root as the current minimum node.
85   int min = target_subtree_root_index;
86   // If we represent a heap in an array/vector, indices of left and right
87   // children can be calculated as such.
88   const int left = target_subtree_root_index * 2 + 1;
89   const int right = target_subtree_root_index * 2 + 2;
90 
91   // If left child is smaller than current minimum.
92   if (left < heap_size &&
93       scored_terms.at(left).hit_count < scored_terms.at(min).hit_count) {
94     min = left;
95   }
96 
97   // If right child is smaller than current minimum.
98   if (right < heap_size &&
99       scored_terms.at(right).hit_count < scored_terms.at(min).hit_count) {
100     min = right;
101   }
102 
103   // If the minimum is not the subtree root, swap and continue heapifying the
104   // lower level subtree.
105   if (min != target_subtree_root_index) {
106     std::swap(scored_terms.at(min),
107               scored_terms.at(target_subtree_root_index));
108     HeapifyTermDown(scored_terms, min);
109   }
110 }
111 
112 // Heapify the given term vector from bottom to top. Call it after add an
113 // element at the end of the vector.
HeapifyTermUp(std::vector<TermMetadata> & scored_terms,int target_subtree_child_index)114 void HeapifyTermUp(std::vector<TermMetadata>& scored_terms,
115                    int target_subtree_child_index) {
116   // If we represent a heap in an array/vector, indices of root can be
117   // calculated as such.
118   const int root = (target_subtree_child_index + 1) / 2 - 1;
119 
120   // If the current child is smaller than the root, swap and continue heapifying
121   // the upper level subtree
122   if (root >= 0 && scored_terms.at(target_subtree_child_index).hit_count <
123                        scored_terms.at(root).hit_count) {
124     std::swap(scored_terms.at(root),
125               scored_terms.at(target_subtree_child_index));
126     HeapifyTermUp(scored_terms, root);
127   }
128 }
129 
PopRootTerm(std::vector<TermMetadata> & scored_terms)130 TermMetadata PopRootTerm(std::vector<TermMetadata>& scored_terms) {
131   if (scored_terms.empty()) {
132     // Return an invalid TermMetadata as a sentinel value.
133     return TermMetadata(/*content_in=*/"", /*hit_count_in=*/-1);
134   }
135 
136   // Steps to extract root from heap:
137   // 1. copy out root
138   TermMetadata root = scored_terms.at(0);
139   const size_t last_node_index = scored_terms.size() - 1;
140   // 2. swap root and the last node
141   std::swap(scored_terms.at(0), scored_terms.at(last_node_index));
142   // 3. remove last node
143   scored_terms.pop_back();
144   // 4. heapify root
145   HeapifyTermDown(scored_terms, /*target_subtree_root_index=*/0);
146   return root;
147 }
148 
149 // Helper function to extract the root from the heap. The heap structure will be
150 // maintained.
151 //
152 // Returns:
153 //   The current root element on success
154 //   RESOURCE_EXHAUSTED_ERROR if heap is empty
PopRoot(std::vector<ScoredDocumentHit> * scored_document_hits_heap,const ScoredDocumentHitComparator & scored_document_hit_comparator)155 libtextclassifier3::StatusOr<ScoredDocumentHit> PopRoot(
156     std::vector<ScoredDocumentHit>* scored_document_hits_heap,
157     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
158   if (scored_document_hits_heap->empty()) {
159     // An invalid ScoredDocumentHit
160     return absl_ports::ResourceExhaustedError("Heap is empty");
161   }
162 
163   // Steps to extract root from heap:
164   // 1. copy out root
165   ScoredDocumentHit root = scored_document_hits_heap->at(0);
166   const size_t last_node_index = scored_document_hits_heap->size() - 1;
167   // 2. swap root and the last node
168   std::swap(scored_document_hits_heap->at(0),
169             scored_document_hits_heap->at(last_node_index));
170   // 3. remove last node
171   scored_document_hits_heap->pop_back();
172   // 4. heapify root
173   Heapify(scored_document_hits_heap, /*target_subtree_root_index=*/0,
174           scored_document_hit_comparator);
175   return root;
176 }
177 
178 }  // namespace
179 
BuildHeapInPlace(std::vector<ScoredDocumentHit> * scored_document_hits,const ScoredDocumentHitComparator & scored_document_hit_comparator)180 void BuildHeapInPlace(
181     std::vector<ScoredDocumentHit>* scored_document_hits,
182     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
183   const int heap_size = scored_document_hits->size();
184   // Since we use a vector to represent the heap, [size / 2 - 1] is the index
185   // of the parent node of the last node.
186   for (int subtree_root_index = heap_size / 2 - 1; subtree_root_index >= 0;
187        subtree_root_index--) {
188     Heapify(scored_document_hits, subtree_root_index,
189             scored_document_hit_comparator);
190   }
191 }
192 
PushToTermHeap(TermMetadata term,int number_to_return,std::vector<TermMetadata> & scored_terms_heap)193 void PushToTermHeap(TermMetadata term, int number_to_return,
194                     std::vector<TermMetadata>& scored_terms_heap) {
195   if (scored_terms_heap.size() < number_to_return) {
196     scored_terms_heap.push_back(std::move(term));
197     // We insert at end, so we should heapify bottom up.
198     HeapifyTermUp(scored_terms_heap, scored_terms_heap.size() - 1);
199   } else if (scored_terms_heap.at(0).hit_count < term.hit_count) {
200     scored_terms_heap.at(0) = std::move(term);
201     // We insert at root, so we should heapify top down.
202     HeapifyTermDown(scored_terms_heap, /*target_subtree_root_index=*/0);
203   }
204 }
205 
PopTopResultsFromHeap(std::vector<ScoredDocumentHit> * scored_document_hits_heap,int num_results,const ScoredDocumentHitComparator & scored_document_hit_comparator)206 std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
207     std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
208     const ScoredDocumentHitComparator& scored_document_hit_comparator) {
209   std::vector<ScoredDocumentHit> scored_document_hit_result;
210   int result_size = std::min(
211       num_results, static_cast<int>(scored_document_hits_heap->size()));
212   while (result_size-- > 0) {
213     libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
214         PopRoot(scored_document_hits_heap, scored_document_hit_comparator);
215     if (next_best_document_hit_or.ok()) {
216       scored_document_hit_result.push_back(
217           std::move(next_best_document_hit_or).ValueOrDie());
218     } else {
219       ICING_VLOG(1) << next_best_document_hit_or.status().error_message();
220     }
221   }
222   return scored_document_hit_result;
223 }
224 
PopAllTermsFromHeap(std::vector<TermMetadata> & scored_terms_heap)225 std::vector<TermMetadata> PopAllTermsFromHeap(
226     std::vector<TermMetadata>& scored_terms_heap) {
227   std::vector<TermMetadata> top_term_result;
228   top_term_result.reserve(scored_terms_heap.size());
229   while (!scored_terms_heap.empty()) {
230     top_term_result.push_back(PopRootTerm(scored_terms_heap));
231   }
232   return top_term_result;
233 }
234 
235 }  // namespace lib
236 }  // namespace icing
237