• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <numeric>
20 #include <string>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/absl_ports/canonical_errors.h"
26 #include "icing/absl_ports/str_cat.h"
27 #include "icing/index/hit/doc-hit-info.h"
28 #include "icing/index/hit/hit.h"
29 #include "icing/index/iterator/doc-hit-info-iterator.h"
30 #include "icing/index/lite/lite-index.h"
31 #include "icing/index/term-id-codec.h"
32 #include "icing/schema/section.h"
33 #include "icing/util/logging.h"
34 #include "icing/util/math-util.h"
35 #include "icing/util/status-macros.h"
36 
37 namespace icing {
38 namespace lib {
39 
40 namespace {
41 
SectionIdMaskToString(SectionIdMask section_id_mask)42 std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
43   std::string mask(kTotalNumSections, '0');
44   for (SectionId i = kMaxSectionId; i >= 0; --i) {
45     if (section_id_mask & (UINT64_C(1) << i)) {
46       mask[kMaxSectionId - i] = '1';
47     }
48   }
49   return mask;
50 }
51 
52 }  // namespace
53 
Advance()54 libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
55   if (cached_hits_idx_ == -1) {
56     libtextclassifier3::Status status = RetrieveMoreHits();
57     if (!status.ok()) {
58       if (!absl_ports::IsNotFound(status)) {
59         // NOT_FOUND is expected to happen (not every term will be in the main
60         // index!). Other errors are worth logging.
61         ICING_LOG(ERROR)
62             << "Encountered unexpected failure while retrieving  hits "
63             << status.error_message();
64       }
65       return absl_ports::ResourceExhaustedError(
66           "No more DocHitInfos in iterator");
67     }
68   } else {
69     ++cached_hits_idx_;
70   }
71   if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
72     // Nothing more for the iterator to return. Set these members to invalid
73     // values.
74     doc_hit_info_ = DocHitInfo();
75     return absl_ports::ResourceExhaustedError(
76         "No more DocHitInfos in iterator");
77   }
78   ++num_advance_calls_;
79   doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
80   return libtextclassifier3::Status::OK;
81 }
82 
83 libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
TrimRightMostNode()84 DocHitInfoIteratorTermLite::TrimRightMostNode() && {
85   // Leaf iterator should trim itself.
86   DocHitInfoIterator::TrimmedNode node = {nullptr, term_, term_start_index_,
87                                           unnormalized_term_length_};
88   return node;
89 }
90 
RetrieveMoreHits()91 libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
92   // Exact match only. All hits in lite lexicon are exact.
93   ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
94   ICING_ASSIGN_OR_RETURN(uint32_t term_id,
95                          term_id_codec_->EncodeTvi(tvi, TviType::LITE));
96   lite_index_->FetchHits(
97       term_id, section_restrict_mask_,
98       /*only_from_prefix_sections=*/false,
99       /*score_by=*/
100       SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
101       /*namespace_checker=*/nullptr, &cached_hits_,
102       need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
103   cached_hits_idx_ = 0;
104   return libtextclassifier3::Status::OK;
105 }
106 
ToString() const107 std::string DocHitInfoIteratorTermLiteExact::ToString() const {
108   return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
109                             term_);
110 }
111 
112 libtextclassifier3::Status
RetrieveMoreHits()113 DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
114   // Take union of lite terms.
115   int term_len = term_.length();
116   int terms_matched = 0;
117   for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
118        it.IsValid(); it.Advance()) {
119     bool exact_match = it.GetKey().size() == term_len;
120     ICING_ASSIGN_OR_RETURN(
121         uint32_t term_id,
122         term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
123     lite_index_->FetchHits(
124         term_id, section_restrict_mask_,
125         /*only_from_prefix_sections=*/!exact_match,
126         /*score_by=*/
127         SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
128         /*namespace_checker=*/nullptr, &cached_hits_,
129         need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
130     ++terms_matched;
131   }
132   if (terms_matched > 1) {
133     SortAndDedupeDocumentIds();
134   }
135   cached_hits_idx_ = 0;
136   return libtextclassifier3::Status::OK;
137 }
138 
SortDocumentIds()139 void DocHitInfoIteratorTermLitePrefix::SortDocumentIds() {
140   // Re-sort cached document_ids and merge sections.
141   if (!need_hit_term_frequency_) {
142     // If we don't need to also sort cached_hit_term_frequency_ along with
143     // cached_hits_, then just simply sort cached_hits_.
144     sort(cached_hits_.begin(), cached_hits_.end());
145   } else {
146     // Sort cached_hit_term_frequency_ along with cached_hits_.
147     std::vector<int> indices(cached_hits_.size());
148     std::iota(indices.begin(), indices.end(), 0);
149     std::sort(indices.begin(), indices.end(), [this](int i, int j) {
150       return cached_hits_[i] < cached_hits_[j];
151     });
152     // Now indices is a map from sorted index to current index. In other words,
153     // the sorted cached_hits_[i] should be the current cached_hits_[indices[i]]
154     // for every valid i.
155 
156     math_util::ApplyPermutation(indices, cached_hits_,
157                                 cached_hit_term_frequency_);
158   }
159 }
160 
SortAndDedupeDocumentIds()161 void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
162   SortDocumentIds();
163   int idx = 0;
164   for (int i = 1; i < cached_hits_.size(); ++i) {
165     const DocHitInfo& hit_info = cached_hits_[i];
166     DocHitInfo& collapsed_hit_info = cached_hits_[idx];
167     if (collapsed_hit_info.document_id() == hit_info.document_id()) {
168       SectionIdMask curr_mask = hit_info.hit_section_ids_mask();
169       collapsed_hit_info.MergeSectionsFrom(curr_mask);
170       if (need_hit_term_frequency_) {
171         Hit::TermFrequencyArray& collapsed_term_frequency =
172             cached_hit_term_frequency_[idx];
173         while (curr_mask) {
174           SectionId section_id = __builtin_ctzll(curr_mask);
175           // We add the new term-frequency to the existing term-frequency we've
176           // recorded for the current section-id in case there are multiple hits
177           // matching the query for this section.
178           // - This is the case for a prefix query where there are multiple
179           //   terms matching the prefix from the same sectionId + docId.
180           collapsed_term_frequency[section_id] +=
181               cached_hit_term_frequency_[i][section_id];
182           curr_mask &= ~(UINT64_C(1) << section_id);
183         }
184       }
185     } else {
186       // New document_id.
187       ++idx;
188       cached_hits_[idx] = hit_info;
189       if (need_hit_term_frequency_) {
190         cached_hit_term_frequency_[idx] = cached_hit_term_frequency_[i];
191       }
192     }
193   }
194   // idx points to last doc hit info.
195   cached_hits_.resize(idx + 1);
196   if (need_hit_term_frequency_) {
197     cached_hit_term_frequency_.resize(idx + 1);
198   }
199 }
200 
ToString() const201 std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
202   return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
203                             term_, "*");
204 }
205 
206 }  // namespace lib
207 }  // namespace icing
208