1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 17 18 #include <array> 19 #include <cstdint> 20 #include <string> 21 #include <string_view> 22 23 #include "icing/text_classifier/lib3/utils/base/status.h" 24 #include "icing/text_classifier/lib3/utils/base/statusor.h" 25 #include "icing/absl_ports/canonical_errors.h" 26 #include "icing/index/hit/doc-hit-info.h" 27 #include "icing/schema/section.h" 28 #include "icing/store/document-id.h" 29 30 namespace icing { 31 namespace lib { 32 33 // Data structure that maps a single matched query term to its section mask 34 // and the list of term frequencies. 35 // TODO(b/158603837): add stat on whether the matched terms are prefix matched 36 // or not. This information will be used to boost exact match. 37 struct TermMatchInfo { 38 std::string_view term; 39 // SectionIdMask associated to the term. 40 SectionIdMask section_ids_mask; 41 // Array with fixed size kMaxSectionId. For every section id, i.e. 42 // vector index, it stores the term frequency of the term. 43 std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies; 44 TermMatchInfoTermMatchInfo45 explicit TermMatchInfo( 46 std::string_view term, SectionIdMask section_ids_mask, 47 std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies) 48 : term(term), 49 section_ids_mask(section_ids_mask), 50 term_frequencies(std::move(term_frequencies)) {} 51 }; 52 53 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order. 54 // 55 // NOTE: You must call Advance() before calling hit_info() or 56 // hit_intersect_section_ids_mask(). 57 // 58 // Example: 59 // DocHitInfoIterator itr = GetIterator(...); 60 // while (itr.Advance()) { 61 // HandleDocHitInfo(itr.hit_info()); 62 // } 63 class DocHitInfoIterator { 64 public: 65 struct TrimmedNode { 66 // the query results which we should only search for suggestion in these 67 // documents. 68 std::unique_ptr<DocHitInfoIterator> iterator_; 69 // term of the trimmed node which we need to generate suggested strings. 70 std::string term_; 71 // the string in the query which indicates the target section we should 72 // search for suggestions. 73 std::string target_section_; 74 // the start index of the current term in the given search query. 75 int term_start_index_; 76 // The length of the given unnormalized term in the search query 77 int unnormalized_term_length_; 78 TrimmedNodeTrimmedNode79 TrimmedNode(std::unique_ptr<DocHitInfoIterator> iterator, std::string term, 80 int term_start_index, int unnormalized_term_length) 81 : iterator_(std::move(iterator)), 82 term_(term), 83 target_section_(""), 84 term_start_index_(term_start_index), 85 unnormalized_term_length_(unnormalized_term_length) {} 86 }; 87 88 // Trim the rightmost iterator of the iterator tree. 89 // This is to support search suggestions for the last term which is the 90 // right-most node of the root iterator tree. Only support trim the right-most 91 // node on the AND, AND_NARY, OR, OR_NARY, OR_LEAF, Filter, and the 92 // property-in-schema-check iterator. 93 // 94 // After calling this method, this iterator is no longer usable. Please use 95 // the returned iterator. 96 // Returns: 97 // the new iterator without the right-most child, if was able to trim the 98 // right-most node. 99 // nullptr if the current iterator should be trimmed. 100 // INVALID_ARGUMENT if the right-most node is not suppose to be trimmed. 101 virtual libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && = 0; 102 103 virtual ~DocHitInfoIterator() = default; 104 105 // Returns: 106 // OK if was able to advance to a new document_id. 107 // INVALID_ARGUMENT if there are less than 2 iterators for an AND/OR 108 // iterator 109 // RESOUCE_EXHAUSTED if we've run out of document_ids to iterate over 110 virtual libtextclassifier3::Status Advance() = 0; 111 112 // Returns the DocHitInfo that the iterator is currently at. The DocHitInfo 113 // will have a kInvalidDocumentId if Advance() was not called after 114 // construction or if Advance returned an error. doc_hit_info()115 const DocHitInfo& doc_hit_info() const { return doc_hit_info_; } 116 117 // SectionIdMask representing which sections (if any) have matched *ALL* query 118 // terms for the current document_id. hit_intersect_section_ids_mask()119 SectionIdMask hit_intersect_section_ids_mask() const { 120 return hit_intersect_section_ids_mask_; 121 } 122 123 // Gets the number of flash index blocks that have been read as a 124 // result of operations on this object. 125 virtual int32_t GetNumBlocksInspected() const = 0; 126 127 // HitIterators may be constructed into trees. Internal nodes will return the 128 // sum of the number of Advance() calls to all leaf nodes. Leaf nodes will 129 // return the number of times Advance() was called on it. 130 virtual int32_t GetNumLeafAdvanceCalls() const = 0; 131 132 // A string representing the iterator. 133 virtual std::string ToString() const = 0; 134 135 // For the last hit docid, retrieves all the matched query terms and other 136 // stats, see TermMatchInfo. 137 // filtering_section_mask filters the matching sections and should be set only 138 // by DocHitInfoIteratorSectionRestrict. 139 // If Advance() wasn't called after construction, Advance() returned false or 140 // the concrete HitIterator didn't override this method, the vectors aren't 141 // populated. 142 virtual void PopulateMatchedTermsStats( 143 std::vector<TermMatchInfo>* matched_terms_stats, 144 SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {} 145 146 protected: 147 DocHitInfo doc_hit_info_; 148 SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone; 149 150 // Helper function to advance the given iterator to at most the given 151 // document_id. AdvanceTo(DocHitInfoIterator * it,DocumentId document_id)152 libtextclassifier3::StatusOr<DocumentId> AdvanceTo(DocHitInfoIterator* it, 153 DocumentId document_id) { 154 while (it->Advance().ok()) { 155 if (it->doc_hit_info().document_id() <= document_id) { 156 return it->doc_hit_info().document_id(); 157 } 158 } 159 160 // Didn't find anything for the other iterator, reset to invalid values and 161 // return. 162 doc_hit_info_ = DocHitInfo(kInvalidDocumentId); 163 hit_intersect_section_ids_mask_ = kSectionIdMaskNone; 164 return absl_ports::ResourceExhaustedError( 165 "No more DocHitInfos in iterator"); 166 } 167 }; // namespace DocHitInfoIterator 168 169 } // namespace lib 170 } // namespace icing 171 172 #endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 173