1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/scoring/ranker.h"
16
17 #include <algorithm>
18 #include <vector>
19
20 #include "icing/text_classifier/lib3/utils/base/statusor.h"
21 #include "icing/absl_ports/canonical_errors.h"
22 #include "icing/scoring/scored-document-hit.h"
23 #include "icing/util/logging.h"
24
25 namespace icing {
26 namespace lib {
27
28 namespace {
29 // For all the heap manipulations in this file, we use a vector to represent the
30 // heap. The element at index 0 is the root node. For any node at index i, its
31 // left child node is at 2 * i + 1, its right child node is at 2 * i + 2.
32
33 // Helper function to wrap the heapify algorithm, it heapifies the target
34 // subtree node in place.
35 // TODO(b/152934343) refactor the heapify function and making it into a class.
Heapify(std::vector<ScoredDocumentHit> * scored_document_hits,int target_subtree_root_index,const ScoredDocumentHitComparator & scored_document_hit_comparator)36 void Heapify(
37 std::vector<ScoredDocumentHit>* scored_document_hits,
38 int target_subtree_root_index,
39 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
40 const int heap_size = scored_document_hits->size();
41 if (target_subtree_root_index >= heap_size) {
42 return;
43 }
44
45 // Initializes subtree root as the current best node.
46 int best = target_subtree_root_index;
47 // If we represent a heap in an array/vector, indices of left and right
48 // children can be calculated.
49 const int left = target_subtree_root_index * 2 + 1;
50 const int right = target_subtree_root_index * 2 + 2;
51
52 // If left child is better than current best
53 if (left < heap_size &&
54 scored_document_hit_comparator(scored_document_hits->at(left),
55 scored_document_hits->at(best))) {
56 best = left;
57 }
58
59 // If right child is better than current best
60 if (right < heap_size &&
61 scored_document_hit_comparator(scored_document_hits->at(right),
62 scored_document_hits->at(best))) {
63 best = right;
64 }
65
66 // If the best is not the subtree root, swap and continue heapifying the lower
67 // level subtree
68 if (best != target_subtree_root_index) {
69 std::swap(scored_document_hits->at(best),
70 scored_document_hits->at(target_subtree_root_index));
71 Heapify(scored_document_hits, best, scored_document_hit_comparator);
72 }
73 }
74
75 // Heapify the given term vector from top to bottom. Call it after add or
76 // replace an element at the front of the vector.
HeapifyTermDown(std::vector<TermMetadata> & scored_terms,int target_subtree_root_index)77 void HeapifyTermDown(std::vector<TermMetadata>& scored_terms,
78 int target_subtree_root_index) {
79 int heap_size = scored_terms.size();
80 if (target_subtree_root_index >= heap_size) {
81 return;
82 }
83
84 // Initializes subtree root as the current minimum node.
85 int min = target_subtree_root_index;
86 // If we represent a heap in an array/vector, indices of left and right
87 // children can be calculated as such.
88 const int left = target_subtree_root_index * 2 + 1;
89 const int right = target_subtree_root_index * 2 + 2;
90
91 // If left child is smaller than current minimum.
92 if (left < heap_size &&
93 scored_terms.at(left).score < scored_terms.at(min).score) {
94 min = left;
95 }
96
97 // If right child is smaller than current minimum.
98 if (right < heap_size &&
99 scored_terms.at(right).score < scored_terms.at(min).score) {
100 min = right;
101 }
102
103 // If the minimum is not the subtree root, swap and continue heapifying the
104 // lower level subtree.
105 if (min != target_subtree_root_index) {
106 std::swap(scored_terms.at(min), scored_terms.at(target_subtree_root_index));
107 HeapifyTermDown(scored_terms, min);
108 }
109 }
110
111 // Heapify the given term vector from bottom to top. Call it after add an
112 // element at the end of the vector.
HeapifyTermUp(std::vector<TermMetadata> & scored_terms,int target_subtree_child_index)113 void HeapifyTermUp(std::vector<TermMetadata>& scored_terms,
114 int target_subtree_child_index) {
115 // If we represent a heap in an array/vector, indices of root can be
116 // calculated as such.
117 const int root = (target_subtree_child_index + 1) / 2 - 1;
118
119 // If the current child is smaller than the root, swap and continue heapifying
120 // the upper level subtree
121 if (root >= 0 && scored_terms.at(target_subtree_child_index).score <
122 scored_terms.at(root).score) {
123 std::swap(scored_terms.at(root),
124 scored_terms.at(target_subtree_child_index));
125 HeapifyTermUp(scored_terms, root);
126 }
127 }
128
PopRootTerm(std::vector<TermMetadata> & scored_terms)129 TermMetadata PopRootTerm(std::vector<TermMetadata>& scored_terms) {
130 if (scored_terms.empty()) {
131 // Return an invalid TermMetadata as a sentinel value.
132 return TermMetadata(/*content_in=*/"", /*hit_count_in=*/-1);
133 }
134
135 // Steps to extract root from heap:
136 // 1. copy out root
137 TermMetadata root = scored_terms.at(0);
138 const size_t last_node_index = scored_terms.size() - 1;
139 // 2. swap root and the last node
140 std::swap(scored_terms.at(0), scored_terms.at(last_node_index));
141 // 3. remove last node
142 scored_terms.pop_back();
143 // 4. heapify root
144 HeapifyTermDown(scored_terms, /*target_subtree_root_index=*/0);
145 return root;
146 }
147
148 } // namespace
149
BuildHeapInPlace(std::vector<ScoredDocumentHit> * scored_document_hits,const ScoredDocumentHitComparator & scored_document_hit_comparator)150 void BuildHeapInPlace(
151 std::vector<ScoredDocumentHit>* scored_document_hits,
152 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
153 const int heap_size = scored_document_hits->size();
154 // Since we use a vector to represent the heap, [size / 2 - 1] is the index
155 // of the parent node of the last node.
156 for (int subtree_root_index = heap_size / 2 - 1; subtree_root_index >= 0;
157 subtree_root_index--) {
158 Heapify(scored_document_hits, subtree_root_index,
159 scored_document_hit_comparator);
160 }
161 }
162
PushToTermHeap(TermMetadata term,int number_to_return,std::vector<TermMetadata> & scored_terms_heap)163 void PushToTermHeap(TermMetadata term, int number_to_return,
164 std::vector<TermMetadata>& scored_terms_heap) {
165 if (scored_terms_heap.size() < number_to_return) {
166 scored_terms_heap.push_back(std::move(term));
167 // We insert at end, so we should heapify bottom up.
168 HeapifyTermUp(scored_terms_heap, scored_terms_heap.size() - 1);
169 } else if (scored_terms_heap.at(0).score < term.score) {
170 scored_terms_heap.at(0) = std::move(term);
171 // We insert at root, so we should heapify top down.
172 HeapifyTermDown(scored_terms_heap, /*target_subtree_root_index=*/0);
173 }
174 }
175
PopNextTopResultFromHeap(std::vector<ScoredDocumentHit> * scored_document_hits_heap,const ScoredDocumentHitComparator & scored_document_hit_comparator)176 libtextclassifier3::StatusOr<ScoredDocumentHit> PopNextTopResultFromHeap(
177 std::vector<ScoredDocumentHit>* scored_document_hits_heap,
178 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
179 if (scored_document_hits_heap->empty()) {
180 // An invalid ScoredDocumentHit
181 return absl_ports::ResourceExhaustedError("Heap is empty");
182 }
183
184 // Steps to extract root from heap:
185 // 1. copy out root
186 ScoredDocumentHit root = scored_document_hits_heap->at(0);
187 const size_t last_node_index = scored_document_hits_heap->size() - 1;
188 // 2. swap root and the last node
189 std::swap(scored_document_hits_heap->at(0),
190 scored_document_hits_heap->at(last_node_index));
191 // 3. remove last node
192 scored_document_hits_heap->pop_back();
193 // 4. heapify root
194 Heapify(scored_document_hits_heap, /*target_subtree_root_index=*/0,
195 scored_document_hit_comparator);
196 return root;
197 }
198
PopTopResultsFromHeap(std::vector<ScoredDocumentHit> * scored_document_hits_heap,int num_results,const ScoredDocumentHitComparator & scored_document_hit_comparator)199 std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
200 std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
201 const ScoredDocumentHitComparator& scored_document_hit_comparator) {
202 std::vector<ScoredDocumentHit> scored_document_hit_result;
203 int result_size = std::min(
204 num_results, static_cast<int>(scored_document_hits_heap->size()));
205 while (result_size-- > 0) {
206 libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
207 PopNextTopResultFromHeap(scored_document_hits_heap,
208 scored_document_hit_comparator);
209 if (next_best_document_hit_or.ok()) {
210 scored_document_hit_result.push_back(
211 std::move(next_best_document_hit_or).ValueOrDie());
212 } else {
213 ICING_VLOG(1) << next_best_document_hit_or.status().error_message();
214 }
215 }
216 return scored_document_hit_result;
217 }
218
PopAllTermsFromHeap(std::vector<TermMetadata> & scored_terms_heap)219 std::vector<TermMetadata> PopAllTermsFromHeap(
220 std::vector<TermMetadata>& scored_terms_heap) {
221 std::vector<TermMetadata> top_term_result;
222 top_term_result.reserve(scored_terms_heap.size());
223 while (!scored_terms_heap.empty()) {
224 top_term_result.push_back(PopRootTerm(scored_terms_heap));
225 }
226 return top_term_result;
227 }
228
229 } // namespace lib
230 } // namespace icing
231