• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_HIT_HIT_H_
16 #define ICING_INDEX_HIT_HIT_H_
17 
18 #include <array>
19 #include <cstdint>
20 #include <limits>
21 
22 #include "icing/legacy/core/icing-packed-pod.h"
23 #include "icing/schema/section.h"
24 #include "icing/store/document-id.h"
25 
26 namespace icing {
27 namespace lib {
28 
29 // BasicHit is a specific encoding that refers to content within a document. A
30 // basic hit consists of:
31 // - a DocumentId
32 // - a SectionId
33 // referring to the document and section that the hit corresponds to.
34 //
35 // The hit is the most basic unit of the index and, when grouped together by
36 // term, can be used to encode what terms appear in what documents.
37 //
38 // BasicHit is for indices (e.g. numeric index) that don't require term
39 // frequency.
40 class BasicHit {
41  public:
42   // The datatype used to encode BasicHit information: the document_id and
43   // section_id.
44   using Value = uint32_t;
45 
46   // WARNING: Changing this value will invalidate any pre-existing posting lists
47   // on user devices.
48   //
49   // kInvalidValue contains:
50   // - 0 for unused bits. Note that unused bits are always 0 for both valid and
51   //   invalid BasicHit values.
52   // - Inverted kInvalidDocumentId
53   // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
54   //   already invalidated the value. In fact, we currently use all 2^6 section
55   //   ids and there is no "invalid section id", so it doesn't matter what
56   //   SectionId we set for kInvalidValue.
57   static constexpr Value kInvalidValue = 0;
58 
59   explicit BasicHit(SectionId section_id, DocumentId document_id);
60 
BasicHit()61   explicit BasicHit() : value_(kInvalidValue) {}
62 
is_valid()63   bool is_valid() const { return value_ != kInvalidValue; }
value()64   Value value() const { return value_; }
65   DocumentId document_id() const;
66   SectionId section_id() const;
67 
68   bool operator<(const BasicHit& h2) const { return value_ < h2.value_; }
69   bool operator==(const BasicHit& h2) const { return value_ == h2.value_; }
70 
71  private:
72   // Value bits layout: 4 unused + 22 document_id + 6 section id.
73   Value value_;
74 } __attribute__((packed));
75 static_assert(sizeof(BasicHit) == 4, "");
76 
77 // Hit is a specific encoding that refers to content within a document. A hit
78 // consists of:
79 // - a DocumentId
80 // - a SectionId
81 // referring to the document and section that the hit corresponds to, as well as
82 // metadata about the hit:
83 // - whether the Hit has a TermFrequency other than the default value
84 // - whether the Hit does not appear exactly in the document, but instead
85 //   represents a term that is a prefix of a term in the document
86 // - whether the Hit came from a section that has prefix expansion enabled
87 //   and a term frequency for the hit.
88 //
89 // The hit is the most basic unit of the index and, when grouped together by
90 // term, can be used to encode what terms appear in what documents.
91 class Hit {
92  public:
93   // The datatype used to encode Hit information: the document_id, section_id
94   // and the has_term_frequency, prefix hit and in prefix section flags.
95   using Value = uint32_t;
96 
97   // WARNING: Changing this value will invalidate any pre-existing posting lists
98   // on user devices.
99   //
100   // WARNING:
101   // - Hit::kInvalidValue should contain inverted kInvalidDocumentId, which is
102   //   b'00...0. However, currently we set it as UINT32_MAX and actually it
103   //   contains b'11...1, which is the inverted document_id 0.
104   // - It means Hit::kInvalidValue contains valid (document_id, section_id,
105   //   flags), so we potentially cannot distinguish if a Hit is invalid or not.
106   //   The invalidity is an essential feature for posting list since we use it
107   //   to determine the state of the posting list.
108   // - The reason why it won't break the current posting list is because the
109   //   unused bit(s) are set as 1 for Hit::kInvalidValue and 0 for all valid
110   //   Hits. In other words, the unused bit(s) are actually serving as "invalid
111   //   flag".
112   // - If we want to exhaust all unused bits in the future, then we have to
113   //   change Hit::kInvalidValue to set the inverted document_id section
114   //   correctly (b'00...0, refer to BasicHit::kInvalidValue as an example).
115   // - Also this problem is guarded by static_assert in hit.cc. If exhausting
116   //   all unused bits, then the static_assert will detect and fail. We can
117   //   safely remove the static_assert check after following the above process
118   //   to resolve the incorrect Hit::kInvalidValue issue.
119   static constexpr Value kInvalidValue = std::numeric_limits<Value>::max();
120   // Docs are sorted in reverse, and 0 is never used as the inverted
121   // DocumentId (because it is the inverse of kInvalidValue), so it is always
122   // the max in a descending sort.
123   static constexpr Value kMaxDocumentIdSortValue = 0;
124 
125   // The Term Frequency of a Hit.
126   using TermFrequency = uint8_t;
127   using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>;
128   // Max TermFrequency is 255.
129   static constexpr TermFrequency kMaxTermFrequency =
130       std::numeric_limits<TermFrequency>::max();
131   static constexpr TermFrequency kDefaultTermFrequency = 1;
132   static constexpr TermFrequency kNoTermFrequency = 0;
133 
134   explicit Hit(Value value = kInvalidValue,
135                TermFrequency term_frequency = kDefaultTermFrequency)
value_(value)136       : value_(value), term_frequency_(term_frequency) {}
137   Hit(SectionId section_id, DocumentId document_id,
138       TermFrequency term_frequency, bool is_in_prefix_section = false,
139       bool is_prefix_hit = false);
140 
is_valid()141   bool is_valid() const { return value() != kInvalidValue; }
value()142   Value value() const { return value_; }
143   DocumentId document_id() const;
144   SectionId section_id() const;
145   // Whether or not the hit contains a valid term frequency.
146   bool has_term_frequency() const;
term_frequency()147   TermFrequency term_frequency() const { return term_frequency_; }
148   bool is_prefix_hit() const;
149   bool is_in_prefix_section() const;
150 
151   // Creates a new hit based on old_hit but with new_document_id set.
152   static Hit TranslateHit(Hit old_hit, DocumentId new_document_id);
153 
154   bool operator<(const Hit& h2) const { return value() < h2.value(); }
155   bool operator==(const Hit& h2) const { return value() == h2.value(); }
156 
157   struct EqualsDocumentIdAndSectionId {
158     bool operator()(const Hit& hit1, const Hit& hit2) const;
159   };
160 
161  private:
162   // Value and TermFrequency must be in this order.
163   // Value bits layout: 1 unused + 22 document_id + 6 section id + 3 flags.
164   Value value_;
165   TermFrequency term_frequency_;
166 } __attribute__((packed));
167 static_assert(sizeof(Hit) == 5, "");
168 // TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
169 static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan");
170 
171 }  // namespace lib
172 }  // namespace icing
173 
174 #endif  // ICING_INDEX_HIT_HIT_H_
175