1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_HIT_HIT_H_ 16 #define ICING_INDEX_HIT_HIT_H_ 17 18 #include <array> 19 #include <cstdint> 20 #include <limits> 21 22 #include "icing/legacy/core/icing-packed-pod.h" 23 #include "icing/schema/section.h" 24 #include "icing/store/document-id.h" 25 26 namespace icing { 27 namespace lib { 28 29 // BasicHit is a specific encoding that refers to content within a document. A 30 // basic hit consists of: 31 // - a DocumentId 32 // - a SectionId 33 // referring to the document and section that the hit corresponds to. 34 // 35 // The hit is the most basic unit of the index and, when grouped together by 36 // term, can be used to encode what terms appear in what documents. 37 // 38 // BasicHit is for indices (e.g. numeric index) that don't require term 39 // frequency. 40 class BasicHit { 41 public: 42 // The datatype used to encode BasicHit information: the document_id and 43 // section_id. 44 using Value = uint32_t; 45 46 // WARNING: Changing this value will invalidate any pre-existing posting lists 47 // on user devices. 48 // 49 // kInvalidValue contains: 50 // - 0 for unused bits. Note that unused bits are always 0 for both valid and 51 // invalid BasicHit values. 52 // - Inverted kInvalidDocumentId 53 // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has 54 // already invalidated the value. In fact, we currently use all 2^6 section 55 // ids and there is no "invalid section id", so it doesn't matter what 56 // SectionId we set for kInvalidValue. 57 static constexpr Value kInvalidValue = 0; 58 59 explicit BasicHit(SectionId section_id, DocumentId document_id); 60 BasicHit()61 explicit BasicHit() : value_(kInvalidValue) {} 62 is_valid()63 bool is_valid() const { return value_ != kInvalidValue; } value()64 Value value() const { return value_; } 65 DocumentId document_id() const; 66 SectionId section_id() const; 67 68 bool operator<(const BasicHit& h2) const { return value_ < h2.value_; } 69 bool operator==(const BasicHit& h2) const { return value_ == h2.value_; } 70 71 private: 72 // Value bits layout: 4 unused + 22 document_id + 6 section id. 73 Value value_; 74 } __attribute__((packed)); 75 static_assert(sizeof(BasicHit) == 4, ""); 76 77 // Hit is a specific encoding that refers to content within a document. A hit 78 // consists of: 79 // - a DocumentId 80 // - a SectionId 81 // referring to the document and section that the hit corresponds to, as well as 82 // metadata about the hit: 83 // - whether the Hit has a TermFrequency other than the default value 84 // - whether the Hit does not appear exactly in the document, but instead 85 // represents a term that is a prefix of a term in the document 86 // - whether the Hit came from a section that has prefix expansion enabled 87 // and a term frequency for the hit. 88 // 89 // The hit is the most basic unit of the index and, when grouped together by 90 // term, can be used to encode what terms appear in what documents. 91 class Hit { 92 public: 93 // The datatype used to encode Hit information: the document_id, section_id 94 // and the has_term_frequency, prefix hit and in prefix section flags. 95 using Value = uint32_t; 96 97 // WARNING: Changing this value will invalidate any pre-existing posting lists 98 // on user devices. 99 // 100 // WARNING: 101 // - Hit::kInvalidValue should contain inverted kInvalidDocumentId, which is 102 // b'00...0. However, currently we set it as UINT32_MAX and actually it 103 // contains b'11...1, which is the inverted document_id 0. 104 // - It means Hit::kInvalidValue contains valid (document_id, section_id, 105 // flags), so we potentially cannot distinguish if a Hit is invalid or not. 106 // The invalidity is an essential feature for posting list since we use it 107 // to determine the state of the posting list. 108 // - The reason why it won't break the current posting list is because the 109 // unused bit(s) are set as 1 for Hit::kInvalidValue and 0 for all valid 110 // Hits. In other words, the unused bit(s) are actually serving as "invalid 111 // flag". 112 // - If we want to exhaust all unused bits in the future, then we have to 113 // change Hit::kInvalidValue to set the inverted document_id section 114 // correctly (b'00...0, refer to BasicHit::kInvalidValue as an example). 115 // - Also this problem is guarded by static_assert in hit.cc. If exhausting 116 // all unused bits, then the static_assert will detect and fail. We can 117 // safely remove the static_assert check after following the above process 118 // to resolve the incorrect Hit::kInvalidValue issue. 119 static constexpr Value kInvalidValue = std::numeric_limits<Value>::max(); 120 // Docs are sorted in reverse, and 0 is never used as the inverted 121 // DocumentId (because it is the inverse of kInvalidValue), so it is always 122 // the max in a descending sort. 123 static constexpr Value kMaxDocumentIdSortValue = 0; 124 125 // The Term Frequency of a Hit. 126 using TermFrequency = uint8_t; 127 using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>; 128 // Max TermFrequency is 255. 129 static constexpr TermFrequency kMaxTermFrequency = 130 std::numeric_limits<TermFrequency>::max(); 131 static constexpr TermFrequency kDefaultTermFrequency = 1; 132 static constexpr TermFrequency kNoTermFrequency = 0; 133 134 explicit Hit(Value value = kInvalidValue, 135 TermFrequency term_frequency = kDefaultTermFrequency) value_(value)136 : value_(value), term_frequency_(term_frequency) {} 137 Hit(SectionId section_id, DocumentId document_id, 138 TermFrequency term_frequency, bool is_in_prefix_section = false, 139 bool is_prefix_hit = false); 140 is_valid()141 bool is_valid() const { return value() != kInvalidValue; } value()142 Value value() const { return value_; } 143 DocumentId document_id() const; 144 SectionId section_id() const; 145 // Whether or not the hit contains a valid term frequency. 146 bool has_term_frequency() const; term_frequency()147 TermFrequency term_frequency() const { return term_frequency_; } 148 bool is_prefix_hit() const; 149 bool is_in_prefix_section() const; 150 151 // Creates a new hit based on old_hit but with new_document_id set. 152 static Hit TranslateHit(Hit old_hit, DocumentId new_document_id); 153 154 bool operator<(const Hit& h2) const { return value() < h2.value(); } 155 bool operator==(const Hit& h2) const { return value() == h2.value(); } 156 157 struct EqualsDocumentIdAndSectionId { 158 bool operator()(const Hit& hit1, const Hit& hit2) const; 159 }; 160 161 private: 162 // Value and TermFrequency must be in this order. 163 // Value bits layout: 1 unused + 22 document_id + 6 section id + 3 flags. 164 Value value_; 165 TermFrequency term_frequency_; 166 } __attribute__((packed)); 167 static_assert(sizeof(Hit) == 5, ""); 168 // TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions. 169 static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan"); 170 171 } // namespace lib 172 } // namespace icing 173 174 #endif // ICING_INDEX_HIT_HIT_H_ 175