1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 17 18 #include <array> 19 #include <cstdint> 20 #include <string> 21 #include <string_view> 22 23 #include "icing/text_classifier/lib3/utils/base/status.h" 24 #include "icing/text_classifier/lib3/utils/base/statusor.h" 25 #include "icing/absl_ports/canonical_errors.h" 26 #include "icing/index/hit/doc-hit-info.h" 27 #include "icing/schema/section.h" 28 #include "icing/store/document-id.h" 29 30 namespace icing { 31 namespace lib { 32 33 // Data structure that maps a single matched query term to its section mask 34 // and the list of term frequencies. 35 // TODO(b/158603837): add stat on whether the matched terms are prefix matched 36 // or not. This information will be used to boost exact match. 37 struct TermMatchInfo { 38 std::string_view term; 39 // SectionIdMask associated to the term. 40 SectionIdMask section_ids_mask; 41 // Array with fixed size kMaxSectionId. For every section id, i.e. 42 // vector index, it stores the term frequency of the term. 43 std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies; 44 TermMatchInfoTermMatchInfo45 explicit TermMatchInfo( 46 std::string_view term, SectionIdMask section_ids_mask, 47 std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies) 48 : term(term), 49 section_ids_mask(section_ids_mask), 50 term_frequencies(std::move(term_frequencies)) {} 51 }; 52 53 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order. 54 // 55 // NOTE: You must call Advance() before calling hit_info() or 56 // hit_intersect_section_ids_mask(). 57 // 58 // Example: 59 // DocHitInfoIterator itr = GetIterator(...); 60 // while (itr.Advance()) { 61 // HandleDocHitInfo(itr.hit_info()); 62 // } 63 class DocHitInfoIterator { 64 public: 65 virtual ~DocHitInfoIterator() = default; 66 67 // Returns: 68 // OK if was able to advance to a new document_id. 69 // INVALID_ARGUMENT if there are less than 2 iterators for an AND/OR 70 // iterator 71 // RESOUCE_EXHAUSTED if we've run out of document_ids to iterate over 72 virtual libtextclassifier3::Status Advance() = 0; 73 74 // Returns the DocHitInfo that the iterator is currently at. The DocHitInfo 75 // will have a kInvalidDocumentId if Advance() was not called after 76 // construction or if Advance returned an error. doc_hit_info()77 const DocHitInfo& doc_hit_info() const { return doc_hit_info_; } 78 79 // SectionIdMask representing which sections (if any) have matched *ALL* query 80 // terms for the current document_id. hit_intersect_section_ids_mask()81 SectionIdMask hit_intersect_section_ids_mask() const { 82 return hit_intersect_section_ids_mask_; 83 } 84 85 // Gets the number of flash index blocks that have been read as a 86 // result of operations on this object. 87 virtual int32_t GetNumBlocksInspected() const = 0; 88 89 // HitIterators may be constructed into trees. Internal nodes will return the 90 // sum of the number of Advance() calls to all leaf nodes. Leaf nodes will 91 // return the number of times Advance() was called on it. 92 virtual int32_t GetNumLeafAdvanceCalls() const = 0; 93 94 // A string representing the iterator. 95 virtual std::string ToString() const = 0; 96 97 // For the last hit docid, retrieves all the matched query terms and other 98 // stats, see TermMatchInfo. 99 // filtering_section_mask filters the matching sections and should be set only 100 // by DocHitInfoIteratorSectionRestrict. 101 // If Advance() wasn't called after construction, Advance() returned false or 102 // the concrete HitIterator didn't override this method, the vectors aren't 103 // populated. 104 virtual void PopulateMatchedTermsStats( 105 std::vector<TermMatchInfo>* matched_terms_stats, 106 SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {} 107 108 protected: 109 DocHitInfo doc_hit_info_; 110 SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone; 111 112 // Helper function to advance the given iterator to at most the given 113 // document_id. AdvanceTo(DocHitInfoIterator * it,DocumentId document_id)114 libtextclassifier3::StatusOr<DocumentId> AdvanceTo(DocHitInfoIterator* it, 115 DocumentId document_id) { 116 while (it->Advance().ok()) { 117 if (it->doc_hit_info().document_id() <= document_id) { 118 return it->doc_hit_info().document_id(); 119 } 120 } 121 122 // Didn't find anything for the other iterator, reset to invalid values and 123 // return. 124 doc_hit_info_ = DocHitInfo(kInvalidDocumentId); 125 hit_intersect_section_ids_mask_ = kSectionIdMaskNone; 126 return absl_ports::ResourceExhaustedError( 127 "No more DocHitInfos in iterator"); 128 } 129 }; // namespace DocHitInfoIterator 130 131 } // namespace lib 132 } // namespace icing 133 134 #endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 135