1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <numeric>
20 #include <string>
21 #include <vector>
22
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/absl_ports/canonical_errors.h"
26 #include "icing/absl_ports/str_cat.h"
27 #include "icing/index/hit/doc-hit-info.h"
28 #include "icing/index/hit/hit.h"
29 #include "icing/index/iterator/doc-hit-info-iterator.h"
30 #include "icing/index/lite/lite-index.h"
31 #include "icing/index/term-id-codec.h"
32 #include "icing/schema/section.h"
33 #include "icing/util/logging.h"
34 #include "icing/util/math-util.h"
35 #include "icing/util/status-macros.h"
36
37 namespace icing {
38 namespace lib {
39
40 namespace {
41
SectionIdMaskToString(SectionIdMask section_id_mask)42 std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
43 std::string mask(kTotalNumSections, '0');
44 for (SectionId i = kMaxSectionId; i >= 0; --i) {
45 if (section_id_mask & (UINT64_C(1) << i)) {
46 mask[kMaxSectionId - i] = '1';
47 }
48 }
49 return mask;
50 }
51
52 } // namespace
53
Advance()54 libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
55 if (cached_hits_idx_ == -1) {
56 libtextclassifier3::Status status = RetrieveMoreHits();
57 if (!status.ok()) {
58 if (!absl_ports::IsNotFound(status)) {
59 // NOT_FOUND is expected to happen (not every term will be in the main
60 // index!). Other errors are worth logging.
61 ICING_LOG(ERROR)
62 << "Encountered unexpected failure while retrieving hits "
63 << status.error_message();
64 }
65 return absl_ports::ResourceExhaustedError(
66 "No more DocHitInfos in iterator");
67 }
68 } else {
69 ++cached_hits_idx_;
70 }
71 if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
72 // Nothing more for the iterator to return. Set these members to invalid
73 // values.
74 doc_hit_info_ = DocHitInfo();
75 return absl_ports::ResourceExhaustedError(
76 "No more DocHitInfos in iterator");
77 }
78 ++num_advance_calls_;
79 doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
80 return libtextclassifier3::Status::OK;
81 }
82
83 libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
TrimRightMostNode()84 DocHitInfoIteratorTermLite::TrimRightMostNode() && {
85 // Leaf iterator should trim itself.
86 DocHitInfoIterator::TrimmedNode node = {nullptr, term_, term_start_index_,
87 unnormalized_term_length_};
88 return node;
89 }
90
RetrieveMoreHits()91 libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
92 // Exact match only. All hits in lite lexicon are exact.
93 ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
94 ICING_ASSIGN_OR_RETURN(uint32_t term_id,
95 term_id_codec_->EncodeTvi(tvi, TviType::LITE));
96 lite_index_->FetchHits(
97 term_id, section_restrict_mask_,
98 /*only_from_prefix_sections=*/false,
99 /*score_by=*/
100 SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
101 /*namespace_checker=*/nullptr, &cached_hits_,
102 need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
103 cached_hits_idx_ = 0;
104 return libtextclassifier3::Status::OK;
105 }
106
ToString() const107 std::string DocHitInfoIteratorTermLiteExact::ToString() const {
108 return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
109 term_);
110 }
111
112 libtextclassifier3::Status
RetrieveMoreHits()113 DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
114 // Take union of lite terms.
115 int term_len = term_.length();
116 int terms_matched = 0;
117 for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
118 it.IsValid(); it.Advance()) {
119 bool exact_match = it.GetKey().size() == term_len;
120 ICING_ASSIGN_OR_RETURN(
121 uint32_t term_id,
122 term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
123 lite_index_->FetchHits(
124 term_id, section_restrict_mask_,
125 /*only_from_prefix_sections=*/!exact_match,
126 /*score_by=*/
127 SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
128 /*namespace_checker=*/nullptr, &cached_hits_,
129 need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
130 ++terms_matched;
131 }
132 if (terms_matched > 1) {
133 SortAndDedupeDocumentIds();
134 }
135 cached_hits_idx_ = 0;
136 return libtextclassifier3::Status::OK;
137 }
138
SortDocumentIds()139 void DocHitInfoIteratorTermLitePrefix::SortDocumentIds() {
140 // Re-sort cached document_ids and merge sections.
141 if (!need_hit_term_frequency_) {
142 // If we don't need to also sort cached_hit_term_frequency_ along with
143 // cached_hits_, then just simply sort cached_hits_.
144 sort(cached_hits_.begin(), cached_hits_.end());
145 } else {
146 // Sort cached_hit_term_frequency_ along with cached_hits_.
147 std::vector<int> indices(cached_hits_.size());
148 std::iota(indices.begin(), indices.end(), 0);
149 std::sort(indices.begin(), indices.end(), [this](int i, int j) {
150 return cached_hits_[i] < cached_hits_[j];
151 });
152 // Now indices is a map from sorted index to current index. In other words,
153 // the sorted cached_hits_[i] should be the current cached_hits_[indices[i]]
154 // for every valid i.
155
156 math_util::ApplyPermutation(indices, cached_hits_,
157 cached_hit_term_frequency_);
158 }
159 }
160
SortAndDedupeDocumentIds()161 void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
162 SortDocumentIds();
163 int idx = 0;
164 for (int i = 1; i < cached_hits_.size(); ++i) {
165 const DocHitInfo& hit_info = cached_hits_[i];
166 DocHitInfo& collapsed_hit_info = cached_hits_[idx];
167 if (collapsed_hit_info.document_id() == hit_info.document_id()) {
168 SectionIdMask curr_mask = hit_info.hit_section_ids_mask();
169 collapsed_hit_info.MergeSectionsFrom(curr_mask);
170 if (need_hit_term_frequency_) {
171 Hit::TermFrequencyArray& collapsed_term_frequency =
172 cached_hit_term_frequency_[idx];
173 while (curr_mask) {
174 SectionId section_id = __builtin_ctzll(curr_mask);
175 // We add the new term-frequency to the existing term-frequency we've
176 // recorded for the current section-id in case there are multiple hits
177 // matching the query for this section.
178 // - This is the case for a prefix query where there are multiple
179 // terms matching the prefix from the same sectionId + docId.
180 collapsed_term_frequency[section_id] +=
181 cached_hit_term_frequency_[i][section_id];
182 curr_mask &= ~(UINT64_C(1) << section_id);
183 }
184 }
185 } else {
186 // New document_id.
187 ++idx;
188 cached_hits_[idx] = hit_info;
189 if (need_hit_term_frequency_) {
190 cached_hit_term_frequency_[idx] = cached_hit_term_frequency_[i];
191 }
192 }
193 }
194 // idx points to last doc hit info.
195 cached_hits_.resize(idx + 1);
196 if (need_hit_term_frequency_) {
197 cached_hit_term_frequency_.resize(idx + 1);
198 }
199 }
200
ToString() const201 std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
202 return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
203 term_, "*");
204 }
205
206 } // namespace lib
207 } // namespace icing
208