1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_HIT_HIT_H_ 16 #define ICING_INDEX_HIT_HIT_H_ 17 18 #include <array> 19 #include <cstdint> 20 #include <cstring> 21 #include <limits> 22 23 #include "icing/legacy/core/icing-packed-pod.h" 24 #include "icing/schema/section.h" 25 #include "icing/store/document-id.h" 26 27 namespace icing { 28 namespace lib { 29 30 // BasicHit is a specific encoding that refers to content within a document. A 31 // basic hit consists of: 32 // - a DocumentId 33 // - a SectionId 34 // referring to the document and section that the hit corresponds to. 35 // 36 // The hit is the most basic unit of the index and, when grouped together by 37 // term, can be used to encode what terms appear in what documents. 38 // 39 // BasicHit is for indices (e.g. numeric index) that don't require term 40 // frequency. 41 class BasicHit { 42 public: 43 // The datatype used to encode BasicHit information: the document_id and 44 // section_id. 45 using Value = uint32_t; 46 47 // WARNING: Changing this value will invalidate any pre-existing posting lists 48 // on user devices. 49 // 50 // kInvalidValue contains: 51 // - 0 for unused bits. Note that unused bits are always 0 for both valid and 52 // invalid BasicHit values. 53 // - Inverted kInvalidDocumentId 54 // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has 55 // already invalidated the value. In fact, we currently use all 2^6 section 56 // ids and there is no "invalid section id", so it doesn't matter what 57 // SectionId we set for kInvalidValue. 58 static constexpr Value kInvalidValue = 0; 59 60 explicit BasicHit(SectionId section_id, DocumentId document_id); 61 BasicHit()62 explicit BasicHit() : value_(kInvalidValue) {} BasicHit(Value value)63 explicit BasicHit(Value value) : value_(value) {} 64 is_valid()65 bool is_valid() const { return value_ != kInvalidValue; } value()66 Value value() const { return value_; } 67 DocumentId document_id() const; 68 SectionId section_id() const; 69 70 bool operator<(const BasicHit& h2) const { return value_ < h2.value_; } 71 bool operator==(const BasicHit& h2) const { return value_ == h2.value_; } 72 73 private: 74 // Value bits layout: 4 unused + 22 document_id + 6 section id. 75 // 76 // The Value is guaranteed to be an unsigned integer, but its size and the 77 // information it stores may change if the Hit's encoding format is changed. 78 Value value_; 79 } __attribute__((packed)); 80 static_assert(sizeof(BasicHit) == 4, ""); 81 82 // Hit is a specific encoding that refers to content within a document. A hit 83 // consists of: 84 // - a DocumentId 85 // - a SectionId 86 // - referring to the document and section that the hit corresponds to 87 // - Metadata about the hit: 88 // - whether the Hit does not appear exactly in the document, but instead 89 // represents a term that is a prefix of a term in the document 90 // (is_prefix_hit) 91 // - whether the Hit came from a section that has prefix expansion enabled 92 // (is_in_prefix_section) 93 // - whether the Hit has set any bitmask flags (has_flags) 94 // - bitmasks in flags fields: 95 // - whether the Hit has a TermFrequency other than the default value 96 // (has_term_frequency) 97 // - a term frequency for the hit 98 // 99 // The hit is the most basic unit of the index and, when grouped together by 100 // term, can be used to encode what terms appear in what documents. 101 class Hit { 102 public: 103 // The datatype used to encode Hit information: the document_id, section_id, 104 // and 3 flags: is_prefix_hit, is_hit_in_prefix_section and has_flags flag. 105 // 106 // The Value is guaranteed to be an unsigned integer, but its size and the 107 // information it stores may change if the Hit's encoding format is changed. 108 using Value = uint32_t; 109 110 // WARNING: Changing this value will invalidate any pre-existing posting lists 111 // on user devices. 112 // 113 // kInvalidValue contains: 114 // - 0 for unused bits. Note that unused bits are always 0 for both valid and 115 // invalid Hit values. 116 // - Inverted kInvalidDocumentId 117 // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has 118 // already invalidated the value. In fact, we currently use all 2^6 section 119 // ids and there is no "invalid section id", so it doesn't matter what 120 // SectionId we set for kInvalidValue. 121 static constexpr Value kInvalidValue = 0; 122 // Docs are sorted in reverse, and 0 is never used as the inverted 123 // DocumentId (because it is the inverse of kInvalidValue), so it is always 124 // the max in a descending sort. 125 static constexpr Value kMaxDocumentIdSortValue = 0; 126 127 enum FlagOffsetsInFlagsField { 128 // Whether or not the hit has a term_frequency other than 129 // kDefaultTermFrequency. 130 kHasTermFrequency = 0, 131 kNumFlagsInFlagsField = 1, 132 }; 133 134 enum FlagOffsetsInValueField { 135 // Whether or not the hit has a flags value other than kNoEnabledFlags (i.e. 136 // it has flags enabled in the flags field) 137 kHasFlags = 0, 138 // This hit, whether exact or not, came from a prefixed section and will 139 // need to be backfilled into branching posting lists if/when those are 140 // created. 141 kInPrefixSection = 1, 142 // This hit represents a prefix of a longer term. If exact matches are 143 // required, then this hit should be ignored. 144 kPrefixHit = 2, 145 kNumFlagsInValueField = 3, 146 }; 147 static_assert(kDocumentIdBits + kSectionIdBits + kNumFlagsInValueField <= 148 sizeof(Hit::Value) * 8, 149 "HitOverflow"); 150 static_assert(kDocumentIdBits == 22, ""); 151 static_assert(kSectionIdBits == 6, ""); 152 static_assert(kNumFlagsInValueField == 3, ""); 153 154 // The datatype used to encode additional bit-flags in the Hit. 155 // This is guaranteed to be an unsigned integer, but its size may change if 156 // more flags are introduced in the future and require more bits to encode. 157 using Flags = uint8_t; 158 static constexpr Flags kNoEnabledFlags = 0; 159 160 // The Term Frequency of a Hit. 161 // This is guaranteed to be an unsigned integer, but its size may change if we 162 // need to expand the max term-frequency. 163 using TermFrequency = uint8_t; 164 using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>; 165 // Max TermFrequency is 255. 166 static constexpr TermFrequency kMaxTermFrequency = 167 std::numeric_limits<TermFrequency>::max(); 168 static constexpr TermFrequency kDefaultTermFrequency = 1; 169 static constexpr TermFrequency kNoTermFrequency = 0; 170 Hit(Value value)171 explicit Hit(Value value) 172 : Hit(value, kNoEnabledFlags, kDefaultTermFrequency) {} 173 explicit Hit(Value value, Flags flags, TermFrequency term_frequency); 174 explicit Hit(SectionId section_id, DocumentId document_id, 175 TermFrequency term_frequency, bool is_in_prefix_section, 176 bool is_prefix_hit); 177 is_valid()178 bool is_valid() const { return value() != kInvalidValue; } 179 value()180 Value value() const { 181 Value value; 182 memcpy(&value, value_.data(), sizeof(value)); 183 return value; 184 } 185 186 DocumentId document_id() const; 187 SectionId section_id() const; 188 bool is_prefix_hit() const; 189 bool is_in_prefix_section() const; 190 // Whether or not the hit has any flags set to true. 191 bool has_flags() const; 192 flags()193 Flags flags() const { return flags_; } 194 // Whether or not the hit contains a valid term frequency. 195 bool has_term_frequency() const; 196 term_frequency()197 TermFrequency term_frequency() const { return term_frequency_; } 198 199 // Returns true if the flags values across the Hit's value_, term_frequency_ 200 // and flags_ fields are consistent. 201 bool CheckFlagsAreConsistent() const; 202 203 // Creates a new hit based on old_hit but with new_document_id set. 204 static Hit TranslateHit(Hit old_hit, DocumentId new_document_id); 205 206 bool operator<(const Hit& h2) const { 207 if (value() != h2.value()) { 208 return value() < h2.value(); 209 } 210 return flags() < h2.flags(); 211 } 212 bool operator==(const Hit& h2) const { 213 return value() == h2.value() && flags() == h2.flags(); 214 } 215 216 struct EqualsDocumentIdAndSectionId { 217 bool operator()(const Hit& hit1, const Hit& hit2) const; 218 }; 219 220 private: 221 // Value, Flags and TermFrequency must be in this order. 222 // Value bits layout: 1 unused + 22 document_id + 6 section_id + 1 223 // is_prefix_hit + 1 is_in_prefix_section + 1 has_flags. 224 std::array<char, sizeof(Value)> value_; 225 // Flags bits layout: 1 reserved + 6 unused + 1 has_term_frequency. 226 // The left-most bit is reserved for chaining additional fields in case of 227 // future hit expansions. 228 Flags flags_; 229 TermFrequency term_frequency_; 230 }; 231 static_assert(sizeof(Hit) == 6, ""); 232 // TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions. 233 static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan"); 234 235 } // namespace lib 236 } // namespace icing 237 238 #endif // ICING_INDEX_HIT_HIT_H_ 239