• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
17 
18 #include <array>
19 #include <cstdint>
20 #include <string>
21 #include <string_view>
22 
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/absl_ports/canonical_errors.h"
26 #include "icing/index/hit/doc-hit-info.h"
27 #include "icing/schema/section.h"
28 #include "icing/store/document-id.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 // Data structure that maps a single matched query term to its section mask
34 // and the list of term frequencies.
35 // TODO(b/158603837): add stat on whether the matched terms are prefix matched
36 // or not. This information will be used to boost exact match.
37 struct TermMatchInfo {
38   std::string_view term;
39   // SectionIdMask associated to the term.
40   SectionIdMask section_ids_mask;
41   // Array with fixed size kMaxSectionId. For every section id, i.e.
42   // vector index, it stores the term frequency of the term.
43   std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies;
44 
TermMatchInfoTermMatchInfo45   explicit TermMatchInfo(
46       std::string_view term, SectionIdMask section_ids_mask,
47       std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies)
48       : term(term),
49         section_ids_mask(section_ids_mask),
50         term_frequencies(std::move(term_frequencies)) {}
51 };
52 
53 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
54 //
55 // NOTE: You must call Advance() before calling hit_info() or
56 // hit_intersect_section_ids_mask().
57 //
58 // Example:
59 // DocHitInfoIterator itr = GetIterator(...);
60 // while (itr.Advance()) {
61 //   HandleDocHitInfo(itr.hit_info());
62 // }
63 class DocHitInfoIterator {
64  public:
65   struct TrimmedNode {
66     // the query results which we should only search for suggestion in these
67     // documents.
68     std::unique_ptr<DocHitInfoIterator> iterator_;
69     // term of the trimmed node which we need to generate suggested strings.
70     std::string term_;
71     // the string in the query which indicates the target section we should
72     // search for suggestions.
73     std::string target_section_;
74     // the start index of the current term in the given search query.
75     int term_start_index_;
76     // The length of the given unnormalized term in the search query
77     int unnormalized_term_length_;
78 
TrimmedNodeTrimmedNode79     TrimmedNode(std::unique_ptr<DocHitInfoIterator> iterator, std::string term,
80                 int term_start_index, int unnormalized_term_length)
81         : iterator_(std::move(iterator)),
82           term_(term),
83           target_section_(""),
84           term_start_index_(term_start_index),
85           unnormalized_term_length_(unnormalized_term_length) {}
86   };
87 
88   // Trim the rightmost iterator of the iterator tree.
89   // This is to support search suggestions for the last term which is the
90   // right-most node of the root iterator tree. Only support trim the right-most
91   // node on the AND, AND_NARY, OR, OR_NARY, OR_LEAF, Filter, and the
92   // property-in-schema-check iterator.
93   //
94   // After calling this method, this iterator is no longer usable. Please use
95   // the returned iterator.
96   // Returns:
97   //   the new iterator without the right-most child, if was able to trim the
98   //   right-most node.
99   //   nullptr if the current iterator should be trimmed.
100   //   INVALID_ARGUMENT if the right-most node is not suppose to be trimmed.
101   virtual libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && = 0;
102 
103   virtual ~DocHitInfoIterator() = default;
104 
105   // Returns:
106   //   OK if was able to advance to a new document_id.
107   //   INVALID_ARGUMENT if there are less than 2 iterators for an AND/OR
108   //       iterator
109   //   RESOUCE_EXHAUSTED if we've run out of document_ids to iterate over
110   virtual libtextclassifier3::Status Advance() = 0;
111 
112   // Returns the DocHitInfo that the iterator is currently at. The DocHitInfo
113   // will have a kInvalidDocumentId if Advance() was not called after
114   // construction or if Advance returned an error.
doc_hit_info()115   const DocHitInfo& doc_hit_info() const { return doc_hit_info_; }
116 
117   // SectionIdMask representing which sections (if any) have matched *ALL* query
118   // terms for the current document_id.
hit_intersect_section_ids_mask()119   SectionIdMask hit_intersect_section_ids_mask() const {
120     return hit_intersect_section_ids_mask_;
121   }
122 
123   // Gets the number of flash index blocks that have been read as a
124   // result of operations on this object.
125   virtual int32_t GetNumBlocksInspected() const = 0;
126 
127   // HitIterators may be constructed into trees. Internal nodes will return the
128   // sum of the number of Advance() calls to all leaf nodes. Leaf nodes will
129   // return the number of times Advance() was called on it.
130   virtual int32_t GetNumLeafAdvanceCalls() const = 0;
131 
132   // A string representing the iterator.
133   virtual std::string ToString() const = 0;
134 
135   // For the last hit docid, retrieves all the matched query terms and other
136   // stats, see TermMatchInfo.
137   // filtering_section_mask filters the matching sections and should be set only
138   // by DocHitInfoIteratorSectionRestrict.
139   // If Advance() wasn't called after construction, Advance() returned false or
140   // the concrete HitIterator didn't override this method, the vectors aren't
141   // populated.
142   virtual void PopulateMatchedTermsStats(
143       std::vector<TermMatchInfo>* matched_terms_stats,
144       SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {}
145 
146  protected:
147   DocHitInfo doc_hit_info_;
148   SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
149 
150   // Helper function to advance the given iterator to at most the given
151   // document_id.
AdvanceTo(DocHitInfoIterator * it,DocumentId document_id)152   libtextclassifier3::StatusOr<DocumentId> AdvanceTo(DocHitInfoIterator* it,
153                                                      DocumentId document_id) {
154     while (it->Advance().ok()) {
155       if (it->doc_hit_info().document_id() <= document_id) {
156         return it->doc_hit_info().document_id();
157       }
158     }
159 
160     // Didn't find anything for the other iterator, reset to invalid values and
161     // return.
162     doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
163     hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
164     return absl_ports::ResourceExhaustedError(
165         "No more DocHitInfos in iterator");
166   }
167 };  // namespace DocHitInfoIterator
168 
169 }  // namespace lib
170 }  // namespace icing
171 
172 #endif  // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
173