• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_HIT_HIT_H_
16 #define ICING_INDEX_HIT_HIT_H_
17 
18 #include <array>
19 #include <cstdint>
20 #include <cstring>
21 #include <limits>
22 
23 #include "icing/legacy/core/icing-packed-pod.h"
24 #include "icing/schema/section.h"
25 #include "icing/store/document-id.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // BasicHit is a specific encoding that refers to content within a document. A
31 // basic hit consists of:
32 // - a DocumentId
33 // - a SectionId
34 // referring to the document and section that the hit corresponds to.
35 //
36 // The hit is the most basic unit of the index and, when grouped together by
37 // term, can be used to encode what terms appear in what documents.
38 //
39 // BasicHit is for indices (e.g. numeric index) that don't require term
40 // frequency.
41 class BasicHit {
42  public:
43   // The datatype used to encode BasicHit information: the document_id and
44   // section_id.
45   using Value = uint32_t;
46 
47   // WARNING: Changing this value will invalidate any pre-existing posting lists
48   // on user devices.
49   //
50   // kInvalidValue contains:
51   // - 0 for unused bits. Note that unused bits are always 0 for both valid and
52   //   invalid BasicHit values.
53   // - Inverted kInvalidDocumentId
54   // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
55   //   already invalidated the value. In fact, we currently use all 2^6 section
56   //   ids and there is no "invalid section id", so it doesn't matter what
57   //   SectionId we set for kInvalidValue.
58   static constexpr Value kInvalidValue = 0;
59 
60   explicit BasicHit(SectionId section_id, DocumentId document_id);
61 
BasicHit()62   explicit BasicHit() : value_(kInvalidValue) {}
BasicHit(Value value)63   explicit BasicHit(Value value) : value_(value) {}
64 
is_valid()65   bool is_valid() const { return value_ != kInvalidValue; }
value()66   Value value() const { return value_; }
67   DocumentId document_id() const;
68   SectionId section_id() const;
69 
70   bool operator<(const BasicHit& h2) const { return value_ < h2.value_; }
71   bool operator==(const BasicHit& h2) const { return value_ == h2.value_; }
72 
73  private:
74   // Value bits layout: 4 unused + 22 document_id + 6 section id.
75   //
76   // The Value is guaranteed to be an unsigned integer, but its size and the
77   // information it stores may change if the Hit's encoding format is changed.
78   Value value_;
79 } __attribute__((packed));
80 static_assert(sizeof(BasicHit) == 4, "");
81 
82 // Hit is a specific encoding that refers to content within a document. A hit
83 // consists of:
84 // - a DocumentId
85 // - a SectionId
86 //   - referring to the document and section that the hit corresponds to
87 // - Metadata about the hit:
88 //   - whether the Hit does not appear exactly in the document, but instead
89 //     represents a term that is a prefix of a term in the document
90 //     (is_prefix_hit)
91 //   - whether the Hit came from a section that has prefix expansion enabled
92 //     (is_in_prefix_section)
93 //   - whether the Hit has set any bitmask flags (has_flags)
94 //   - bitmasks in flags fields:
95 //     - whether the Hit has a TermFrequency other than the default value
96 //       (has_term_frequency)
97 //   - a term frequency for the hit
98 //
99 // The hit is the most basic unit of the index and, when grouped together by
100 // term, can be used to encode what terms appear in what documents.
101 class Hit {
102  public:
103   // The datatype used to encode Hit information: the document_id, section_id,
104   // and 3 flags: is_prefix_hit, is_hit_in_prefix_section and has_flags flag.
105   //
106   // The Value is guaranteed to be an unsigned integer, but its size and the
107   // information it stores may change if the Hit's encoding format is changed.
108   using Value = uint32_t;
109 
110   // WARNING: Changing this value will invalidate any pre-existing posting lists
111   // on user devices.
112   //
113   // kInvalidValue contains:
114   // - 0 for unused bits. Note that unused bits are always 0 for both valid and
115   //   invalid Hit values.
116   // - Inverted kInvalidDocumentId
117   // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
118   //   already invalidated the value. In fact, we currently use all 2^6 section
119   //   ids and there is no "invalid section id", so it doesn't matter what
120   //   SectionId we set for kInvalidValue.
121   static constexpr Value kInvalidValue = 0;
122   // Docs are sorted in reverse, and 0 is never used as the inverted
123   // DocumentId (because it is the inverse of kInvalidValue), so it is always
124   // the max in a descending sort.
125   static constexpr Value kMaxDocumentIdSortValue = 0;
126 
127   enum FlagOffsetsInFlagsField {
128     // Whether or not the hit has a term_frequency other than
129     // kDefaultTermFrequency.
130     kHasTermFrequency = 0,
131     kNumFlagsInFlagsField = 1,
132   };
133 
134   enum FlagOffsetsInValueField {
135     // Whether or not the hit has a flags value other than kNoEnabledFlags (i.e.
136     // it has flags enabled in the flags field)
137     kHasFlags = 0,
138     // This hit, whether exact or not, came from a prefixed section and will
139     // need to be backfilled into branching posting lists if/when those are
140     // created.
141     kInPrefixSection = 1,
142     // This hit represents a prefix of a longer term. If exact matches are
143     // required, then this hit should be ignored.
144     kPrefixHit = 2,
145     kNumFlagsInValueField = 3,
146   };
147   static_assert(kDocumentIdBits + kSectionIdBits + kNumFlagsInValueField <=
148                     sizeof(Hit::Value) * 8,
149                 "HitOverflow");
150   static_assert(kDocumentIdBits == 22, "");
151   static_assert(kSectionIdBits == 6, "");
152   static_assert(kNumFlagsInValueField == 3, "");
153 
154   // The datatype used to encode additional bit-flags in the Hit.
155   // This is guaranteed to be an unsigned integer, but its size may change if
156   // more flags are introduced in the future and require more bits to encode.
157   using Flags = uint8_t;
158   static constexpr Flags kNoEnabledFlags = 0;
159 
160   // The Term Frequency of a Hit.
161   // This is guaranteed to be an unsigned integer, but its size may change if we
162   // need to expand the max term-frequency.
163   using TermFrequency = uint8_t;
164   using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>;
165   // Max TermFrequency is 255.
166   static constexpr TermFrequency kMaxTermFrequency =
167       std::numeric_limits<TermFrequency>::max();
168   static constexpr TermFrequency kDefaultTermFrequency = 1;
169   static constexpr TermFrequency kNoTermFrequency = 0;
170 
Hit(Value value)171   explicit Hit(Value value)
172       : Hit(value, kNoEnabledFlags, kDefaultTermFrequency) {}
173   explicit Hit(Value value, Flags flags, TermFrequency term_frequency);
174   explicit Hit(SectionId section_id, DocumentId document_id,
175                TermFrequency term_frequency, bool is_in_prefix_section,
176                bool is_prefix_hit);
177 
is_valid()178   bool is_valid() const { return value() != kInvalidValue; }
179 
value()180   Value value() const {
181     Value value;
182     memcpy(&value, value_.data(), sizeof(value));
183     return value;
184   }
185 
186   DocumentId document_id() const;
187   SectionId section_id() const;
188   bool is_prefix_hit() const;
189   bool is_in_prefix_section() const;
190   // Whether or not the hit has any flags set to true.
191   bool has_flags() const;
192 
flags()193   Flags flags() const { return flags_; }
194   // Whether or not the hit contains a valid term frequency.
195   bool has_term_frequency() const;
196 
term_frequency()197   TermFrequency term_frequency() const { return term_frequency_; }
198 
199   // Returns true if the flags values across the Hit's value_, term_frequency_
200   // and flags_ fields are consistent.
201   bool CheckFlagsAreConsistent() const;
202 
203   // Creates a new hit based on old_hit but with new_document_id set.
204   static Hit TranslateHit(Hit old_hit, DocumentId new_document_id);
205 
206   bool operator<(const Hit& h2) const {
207     if (value() != h2.value()) {
208       return value() < h2.value();
209     }
210     return flags() < h2.flags();
211   }
212   bool operator==(const Hit& h2) const {
213     return value() == h2.value() && flags() == h2.flags();
214   }
215 
216   struct EqualsDocumentIdAndSectionId {
217     bool operator()(const Hit& hit1, const Hit& hit2) const;
218   };
219 
220  private:
221   // Value, Flags and TermFrequency must be in this order.
222   // Value bits layout: 1 unused + 22 document_id + 6 section_id + 1
223   // is_prefix_hit + 1 is_in_prefix_section + 1 has_flags.
224   std::array<char, sizeof(Value)> value_;
225   // Flags bits layout: 1 reserved + 6 unused + 1 has_term_frequency.
226   // The left-most bit is reserved for chaining additional fields in case of
227   // future hit expansions.
228   Flags flags_;
229   TermFrequency term_frequency_;
230 };
231 static_assert(sizeof(Hit) == 6, "");
232 // TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
233 static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan");
234 
235 }  // namespace lib
236 }  // namespace icing
237 
238 #endif  // ICING_INDEX_HIT_HIT_H_
239