1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/scoring/ranker.h"
16
17 #include <algorithm>
18 #include <vector>
19
20 #include "icing/text_classifier/lib3/utils/base/statusor.h"
21 #include "icing/absl_ports/canonical_errors.h"
22 #include "icing/scoring/scored-document-hit.h"
23 #include "icing/util/logging.h"
24
25 namespace icing {
26 namespace lib {
27
28 namespace {
29 // For all the heap manipulations in this file, we use a vector to represent the
30 // heap. The element at index 0 is the root node. For any node at index i, its
31 // left child node is at 2 * i + 1, its right child node is at 2 * i + 2.
32
33 // Helper function to wrap the heapify algorithm, it heapifies the target
34 // subtree node in place.
35 // TODO(b/152934343) refactor the heapify function and making it into a class.
Heapify(std::vector<ScoredDocumentHit> * scored_document_hits,int target_subtree_root_index,const ScoredDocumentHitComparator & scored_document_hit_comparator)36 void Heapify(
37 std::vector<ScoredDocumentHit>* scored_document_hits,
38 int target_subtree_root_index,
39 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
40 const int heap_size = scored_document_hits->size();
41 if (target_subtree_root_index >= heap_size) {
42 return;
43 }
44
45 // Initializes subtree root as the current best node.
46 int best = target_subtree_root_index;
47 // If we represent a heap in an array/vector, indices of left and right
48 // children can be calculated.
49 const int left = target_subtree_root_index * 2 + 1;
50 const int right = target_subtree_root_index * 2 + 2;
51
52 // If left child is better than current best
53 if (left < heap_size &&
54 scored_document_hit_comparator(scored_document_hits->at(left),
55 scored_document_hits->at(best))) {
56 best = left;
57 }
58
59 // If right child is better than current best
60 if (right < heap_size &&
61 scored_document_hit_comparator(scored_document_hits->at(right),
62 scored_document_hits->at(best))) {
63 best = right;
64 }
65
66 // If the best is not the subtree root, swap and continue heapifying the lower
67 // level subtree
68 if (best != target_subtree_root_index) {
69 std::swap(scored_document_hits->at(best),
70 scored_document_hits->at(target_subtree_root_index));
71 Heapify(scored_document_hits, best, scored_document_hit_comparator);
72 }
73 }
74
75 // Heapify the given term vector from top to bottom. Call it after add or
76 // replace an element at the front of the vector.
HeapifyTermDown(std::vector<TermMetadata> & scored_terms,int target_subtree_root_index)77 void HeapifyTermDown(std::vector<TermMetadata>& scored_terms,
78 int target_subtree_root_index) {
79 int heap_size = scored_terms.size();
80 if (target_subtree_root_index >= heap_size) {
81 return;
82 }
83
84 // Initializes subtree root as the current minimum node.
85 int min = target_subtree_root_index;
86 // If we represent a heap in an array/vector, indices of left and right
87 // children can be calculated as such.
88 const int left = target_subtree_root_index * 2 + 1;
89 const int right = target_subtree_root_index * 2 + 2;
90
91 // If left child is smaller than current minimum.
92 if (left < heap_size &&
93 scored_terms.at(left).hit_count < scored_terms.at(min).hit_count) {
94 min = left;
95 }
96
97 // If right child is smaller than current minimum.
98 if (right < heap_size &&
99 scored_terms.at(right).hit_count < scored_terms.at(min).hit_count) {
100 min = right;
101 }
102
103 // If the minimum is not the subtree root, swap and continue heapifying the
104 // lower level subtree.
105 if (min != target_subtree_root_index) {
106 std::swap(scored_terms.at(min),
107 scored_terms.at(target_subtree_root_index));
108 HeapifyTermDown(scored_terms, min);
109 }
110 }
111
112 // Heapify the given term vector from bottom to top. Call it after add an
113 // element at the end of the vector.
HeapifyTermUp(std::vector<TermMetadata> & scored_terms,int target_subtree_child_index)114 void HeapifyTermUp(std::vector<TermMetadata>& scored_terms,
115 int target_subtree_child_index) {
116 // If we represent a heap in an array/vector, indices of root can be
117 // calculated as such.
118 const int root = (target_subtree_child_index + 1) / 2 - 1;
119
120 // If the current child is smaller than the root, swap and continue heapifying
121 // the upper level subtree
122 if (root >= 0 && scored_terms.at(target_subtree_child_index).hit_count <
123 scored_terms.at(root).hit_count) {
124 std::swap(scored_terms.at(root),
125 scored_terms.at(target_subtree_child_index));
126 HeapifyTermUp(scored_terms, root);
127 }
128 }
129
PopRootTerm(std::vector<TermMetadata> & scored_terms)130 TermMetadata PopRootTerm(std::vector<TermMetadata>& scored_terms) {
131 if (scored_terms.empty()) {
132 // Return an invalid TermMetadata as a sentinel value.
133 return TermMetadata(/*content_in=*/"", /*hit_count_in=*/-1);
134 }
135
136 // Steps to extract root from heap:
137 // 1. copy out root
138 TermMetadata root = scored_terms.at(0);
139 const size_t last_node_index = scored_terms.size() - 1;
140 // 2. swap root and the last node
141 std::swap(scored_terms.at(0), scored_terms.at(last_node_index));
142 // 3. remove last node
143 scored_terms.pop_back();
144 // 4. heapify root
145 HeapifyTermDown(scored_terms, /*target_subtree_root_index=*/0);
146 return root;
147 }
148
149 // Helper function to extract the root from the heap. The heap structure will be
150 // maintained.
151 //
152 // Returns:
153 // The current root element on success
154 // RESOURCE_EXHAUSTED_ERROR if heap is empty
PopRoot(std::vector<ScoredDocumentHit> * scored_document_hits_heap,const ScoredDocumentHitComparator & scored_document_hit_comparator)155 libtextclassifier3::StatusOr<ScoredDocumentHit> PopRoot(
156 std::vector<ScoredDocumentHit>* scored_document_hits_heap,
157 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
158 if (scored_document_hits_heap->empty()) {
159 // An invalid ScoredDocumentHit
160 return absl_ports::ResourceExhaustedError("Heap is empty");
161 }
162
163 // Steps to extract root from heap:
164 // 1. copy out root
165 ScoredDocumentHit root = scored_document_hits_heap->at(0);
166 const size_t last_node_index = scored_document_hits_heap->size() - 1;
167 // 2. swap root and the last node
168 std::swap(scored_document_hits_heap->at(0),
169 scored_document_hits_heap->at(last_node_index));
170 // 3. remove last node
171 scored_document_hits_heap->pop_back();
172 // 4. heapify root
173 Heapify(scored_document_hits_heap, /*target_subtree_root_index=*/0,
174 scored_document_hit_comparator);
175 return root;
176 }
177
178 } // namespace
179
BuildHeapInPlace(std::vector<ScoredDocumentHit> * scored_document_hits,const ScoredDocumentHitComparator & scored_document_hit_comparator)180 void BuildHeapInPlace(
181 std::vector<ScoredDocumentHit>* scored_document_hits,
182 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
183 const int heap_size = scored_document_hits->size();
184 // Since we use a vector to represent the heap, [size / 2 - 1] is the index
185 // of the parent node of the last node.
186 for (int subtree_root_index = heap_size / 2 - 1; subtree_root_index >= 0;
187 subtree_root_index--) {
188 Heapify(scored_document_hits, subtree_root_index,
189 scored_document_hit_comparator);
190 }
191 }
192
PushToTermHeap(TermMetadata term,int number_to_return,std::vector<TermMetadata> & scored_terms_heap)193 void PushToTermHeap(TermMetadata term, int number_to_return,
194 std::vector<TermMetadata>& scored_terms_heap) {
195 if (scored_terms_heap.size() < number_to_return) {
196 scored_terms_heap.push_back(std::move(term));
197 // We insert at end, so we should heapify bottom up.
198 HeapifyTermUp(scored_terms_heap, scored_terms_heap.size() - 1);
199 } else if (scored_terms_heap.at(0).hit_count < term.hit_count) {
200 scored_terms_heap.at(0) = std::move(term);
201 // We insert at root, so we should heapify top down.
202 HeapifyTermDown(scored_terms_heap, /*target_subtree_root_index=*/0);
203 }
204 }
205
PopTopResultsFromHeap(std::vector<ScoredDocumentHit> * scored_document_hits_heap,int num_results,const ScoredDocumentHitComparator & scored_document_hit_comparator)206 std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
207 std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
208 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
209 std::vector<ScoredDocumentHit> scored_document_hit_result;
210 int result_size = std::min(
211 num_results, static_cast<int>(scored_document_hits_heap->size()));
212 while (result_size-- > 0) {
213 libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
214 PopRoot(scored_document_hits_heap, scored_document_hit_comparator);
215 if (next_best_document_hit_or.ok()) {
216 scored_document_hit_result.push_back(
217 std::move(next_best_document_hit_or).ValueOrDie());
218 } else {
219 ICING_VLOG(1) << next_best_document_hit_or.status().error_message();
220 }
221 }
222 return scored_document_hit_result;
223 }
224
PopAllTermsFromHeap(std::vector<TermMetadata> & scored_terms_heap)225 std::vector<TermMetadata> PopAllTermsFromHeap(
226 std::vector<TermMetadata>& scored_terms_heap) {
227 std::vector<TermMetadata> top_term_result;
228 top_term_result.reserve(scored_terms_heap.size());
229 while (!scored_terms_heap.empty()) {
230 top_term_result.push_back(PopRootTerm(scored_terms_heap));
231 }
232 return top_term_result;
233 }
234
235 } // namespace lib
236 } // namespace icing
237