// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
17 
#include <array>
#include <cstdint>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 // Data structure that maps a single matched query term to its section mask
34 // and the list of term frequencies.
35 // TODO(b/158603837): add stat on whether the matched terms are prefix matched
36 // or not. This information will be used to boost exact match.
37 struct TermMatchInfo {
38   std::string_view term;
39   // SectionIdMask associated to the term.
40   SectionIdMask section_ids_mask;
41   // Array with fixed size kMaxSectionId. For every section id, i.e.
42   // vector index, it stores the term frequency of the term.
43   std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies;
44 
TermMatchInfoTermMatchInfo45   explicit TermMatchInfo(
46       std::string_view term, SectionIdMask section_ids_mask,
47       std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies)
48       : term(term),
49         section_ids_mask(section_ids_mask),
50         term_frequencies(std::move(term_frequencies)) {}
51 };
52 
53 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
54 //
55 // NOTE: You must call Advance() before calling hit_info() or
56 // hit_intersect_section_ids_mask().
57 //
58 // Example:
59 // DocHitInfoIterator itr = GetIterator(...);
60 // while (itr.Advance()) {
61 //   HandleDocHitInfo(itr.hit_info());
62 // }
63 class DocHitInfoIterator {
64  public:
65   virtual ~DocHitInfoIterator() = default;
66 
67   // Returns:
68   //   OK if was able to advance to a new document_id.
69   //   INVALID_ARGUMENT if there are less than 2 iterators for an AND/OR
70   //       iterator
71   //   RESOUCE_EXHAUSTED if we've run out of document_ids to iterate over
72   virtual libtextclassifier3::Status Advance() = 0;
73 
74   // Returns the DocHitInfo that the iterator is currently at. The DocHitInfo
75   // will have a kInvalidDocumentId if Advance() was not called after
76   // construction or if Advance returned an error.
doc_hit_info()77   const DocHitInfo& doc_hit_info() const { return doc_hit_info_; }
78 
79   // SectionIdMask representing which sections (if any) have matched *ALL* query
80   // terms for the current document_id.
hit_intersect_section_ids_mask()81   SectionIdMask hit_intersect_section_ids_mask() const {
82     return hit_intersect_section_ids_mask_;
83   }
84 
85   // Gets the number of flash index blocks that have been read as a
86   // result of operations on this object.
87   virtual int32_t GetNumBlocksInspected() const = 0;
88 
89   // HitIterators may be constructed into trees. Internal nodes will return the
90   // sum of the number of Advance() calls to all leaf nodes. Leaf nodes will
91   // return the number of times Advance() was called on it.
92   virtual int32_t GetNumLeafAdvanceCalls() const = 0;
93 
94   // A string representing the iterator.
95   virtual std::string ToString() const = 0;
96 
97   // For the last hit docid, retrieves all the matched query terms and other
98   // stats, see TermMatchInfo.
99   // filtering_section_mask filters the matching sections and should be set only
100   // by DocHitInfoIteratorSectionRestrict.
101   // If Advance() wasn't called after construction, Advance() returned false or
102   // the concrete HitIterator didn't override this method, the vectors aren't
103   // populated.
104   virtual void PopulateMatchedTermsStats(
105       std::vector<TermMatchInfo>* matched_terms_stats,
106       SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {}
107 
108  protected:
109   DocHitInfo doc_hit_info_;
110   SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
111 
112   // Helper function to advance the given iterator to at most the given
113   // document_id.
AdvanceTo(DocHitInfoIterator * it,DocumentId document_id)114   libtextclassifier3::StatusOr<DocumentId> AdvanceTo(DocHitInfoIterator* it,
115                                                      DocumentId document_id) {
116     while (it->Advance().ok()) {
117       if (it->doc_hit_info().document_id() <= document_id) {
118         return it->doc_hit_info().document_id();
119       }
120     }
121 
122     // Didn't find anything for the other iterator, reset to invalid values and
123     // return.
124     doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
125     hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
126     return absl_ports::ResourceExhaustedError(
127         "No more DocHitInfos in iterator");
128   }
129 };  // namespace DocHitInfoIterator
130 
131 }  // namespace lib
132 }  // namespace icing
133 
134 #endif  // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
135