1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
16 #define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
17
18 #include <cstdint>
19 #include <vector>
20
21 #include "icing/text_classifier/lib3/utils/base/status.h"
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/file/posting_list/posting-list-common.h"
24 #include "icing/file/posting_list/posting-list-used.h"
25 #include "icing/index/hit/hit.h"
26
27 namespace icing {
28 namespace lib {
29
30 // A serializer class to serialize hits to PostingListUsed. Layout described in
31 // comments in posting-list-hit-serializer.cc.
32 class PostingListHitSerializer : public PostingListSerializer {
33 public:
34 static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit);
35
GetDataTypeBytes()36 uint32_t GetDataTypeBytes() const override { return sizeof(Hit); }
37
GetMinPostingListSize()38 uint32_t GetMinPostingListSize() const override {
39 static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
40 static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
41 "PostingListIndex must be small enough to fit in a "
42 "minimum-sized Posting List.");
43
44 return kMinPostingListSize;
45 }
46
47 uint32_t GetMinPostingListSizeToFit(
48 const PostingListUsed* posting_list_used) const override;
49
50 uint32_t GetBytesUsed(
51 const PostingListUsed* posting_list_used) const override;
52
53 void Clear(PostingListUsed* posting_list_used) const override;
54
55 libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
56 PostingListUsed* src) const override;
57
58 // Prepend a hit to the posting list.
59 //
60 // RETURNS:
61 // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
62 // previously added hit.
63 // - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
64 // list.
65 libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used,
66 const Hit& hit) const;
67
68 // Prepend hits to the posting list. Hits should be sorted in descending order
69 // (as defined by the less than operator for Hit)
70 //
71 // Returns the number of hits that could be prepended to the posting list. If
72 // keep_prepended is true, whatever could be prepended is kept, otherwise the
73 // posting list is left in its original state.
74 template <class T, Hit (*GetHit)(const T&)>
75 uint32_t PrependHitArray(PostingListUsed* posting_list_used, const T* array,
76 uint32_t num_hits, bool keep_prepended) const;
77
78 // Retrieves the hits stored in the posting list.
79 //
80 // RETURNS:
81 // - On success, a vector of hits sorted by the reverse order of prepending.
82 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
83 libtextclassifier3::StatusOr<std::vector<Hit>> GetHits(
84 const PostingListUsed* posting_list_used) const;
85
86 // Same as GetHits but appends hits to hits_out.
87 //
88 // RETURNS:
89 // - On success, a vector of hits sorted by the reverse order of prepending.
90 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
91 libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used,
92 std::vector<Hit>* hits_out) const;
93
94 // Undo the last num_hits hits prepended. If num_hits > number of
95 // hits we clear all hits.
96 //
97 // RETURNS:
98 // - OK on success
99 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
100 libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used,
101 uint32_t num_hits) const;
102
103 private:
104 // Posting list layout formats:
105 //
106 // not_full
107 //
108 // +-----------------+----------------+-------+-----------------+
109 // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
110 // +-----------------+----------------+-------+-----------------+
111 //
112 // almost_full
113 //
114 // +-----------------+----------------+-------+-----------------+
115 // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
116 // +-----------------+----------------+-------+-----------------+
117 //
118 // full()
119 //
120 // +-----------------+----------------+-------+-----------------+
121 // |1st hit |2nd hit |(pad) |(compressed) hits|
122 // +-----------------+----------------+-------+-----------------+
123 //
124 // The first two uncompressed hits also implicitly encode information about
125 // the size of the compressed hits region.
126 //
127 // 1. If the posting list is NOT_FULL, then
128 // posting_list_buffer_[0] contains the byte offset of the start of the
129 // compressed hits - and, thus, the size of the compressed hits region is
130 // size_in_bytes - posting_list_buffer_[0].
131 //
132 // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
133 // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
134 // - 1] and ends at size_in_bytes - 1.
135 //
136 // Hit term frequencies are stored after the hit value, compressed or
137 // uncompressed. For the first two special hits, we always have a
138 // space for the term frequency. For hits in the compressed area, we only have
139 // the term frequency following the hit value of hit.has_term_frequency() is
140 // true. This allows good compression in the common case where hits don't have
141 // a valid term frequency.
142 //
143 // EXAMPLE
144 // Posting list storage. Posting list size: 20 bytes
145 // EMPTY!
146 // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
147 // | 20 |Hit::kInvalidVal| 0x000 |
148 // +-------------+----------------+----------------+-----------------------+
149 //
150 // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
151 // NOT FULL!
152 // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
153 // | 16 |Hit::kInvalidVal| 0x000 | 0x07FFF998 |
154 // +-------------+----------------+-----------------+----------------------+
155 //
156 // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4,
157 // TermFrequency=125)
158 // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
159 // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
160 // | 13 |Hit::kInvalidVal| 0x000 | 0x07FFF684| 125 | 788 |
161 // +-------------+----------------+-----------+-----------+------+-----------+
162 //
163 // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
164 // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
165 // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
166 // | 9 |Hit::kInvVal| 0x00 |0x07FFF4D2| 434 | 125 | 788 |
167 // +-------------+------------+--------+----------+---------+------+---------+
168 //
169 // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6,
170 // TermFrequency = 87)
171 // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 196) ALMOST FULL!
172 // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
173 // |Hit::kInvVal|0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 |
174 // +-------------+------------+---------+---------+---------+------+---------+
175 //
176 // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
177 // FULL!
178 // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20
179 // -+ | 0x07FFF320 |0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788
180 // |
181 // +-------------+-------------+---------+----------+---------+------+---------+
182
183 // Helpers to determine what state the posting list is in.
IsFull(const PostingListUsed * posting_list_used)184 bool IsFull(const PostingListUsed* posting_list_used) const {
185 return GetSpecialHit(posting_list_used, /*index=*/0)
186 .ValueOrDie()
187 .is_valid() &&
188 GetSpecialHit(posting_list_used, /*index=*/1)
189 .ValueOrDie()
190 .is_valid();
191 }
192
IsAlmostFull(const PostingListUsed * posting_list_used)193 bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
194 return !GetSpecialHit(posting_list_used, /*index=*/0)
195 .ValueOrDie()
196 .is_valid();
197 }
198
IsEmpty(const PostingListUsed * posting_list_used)199 bool IsEmpty(const PostingListUsed* posting_list_used) const {
200 return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() ==
201 posting_list_used->size_in_bytes() &&
202 !GetSpecialHit(posting_list_used, /*index=*/1)
203 .ValueOrDie()
204 .is_valid();
205 }
206
207 // Returns false if both special hits are invalid or if the offset value
208 // stored in the special hit is less than kSpecialHitsSize or greater than
209 // posting_list_used->size_in_bytes(). Returns true, otherwise.
210 bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
211
212 // Prepend hit to a posting list that is in the ALMOST_FULL state.
213 // RETURNS:
214 // - OK, if successful
215 // - INVALID_ARGUMENT if hit is not less than the previously added hit.
216 libtextclassifier3::Status PrependHitToAlmostFull(
217 PostingListUsed* posting_list_used, const Hit& hit) const;
218
219 // Prepend hit to a posting list that is in the EMPTY state. This will always
220 // succeed because there are no pre-existing hits and no validly constructed
221 // posting list could fail to fit one hit.
222 void PrependHitToEmpty(PostingListUsed* posting_list_used,
223 const Hit& hit) const;
224
225 // Prepend hit to a posting list that is in the NOT_FULL state.
226 // RETURNS:
227 // - OK, if successful
228 // - INVALID_ARGUMENT if hit is not less than the previously added hit.
229 libtextclassifier3::Status PrependHitToNotFull(
230 PostingListUsed* posting_list_used, const Hit& hit,
231 uint32_t offset) const;
232
233 // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
234 // a byte offset between kSpecialHitsSize and
235 // posting_list_used->size_in_bytes() (inclusive) (not_full state).
236 uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
237
238 // Sets the special hits to properly reflect what offset is (see layout
239 // comment for further details).
240 //
241 // Returns false if offset > posting_list_used->size_in_bytes() or offset is
242 // (kSpecialHitsSize, sizeof(Hit)) or offset is (sizeof(Hit), 0). True,
243 // otherwise.
244 bool SetStartByteOffset(PostingListUsed* posting_list_used,
245 uint32_t offset) const;
246
247 // Manipulate padded areas. We never store the same hit value twice
248 // so a delta of 0 is a pad byte.
249
250 // Returns offset of first non-pad byte.
251 uint32_t GetPadEnd(const PostingListUsed* posting_list_used,
252 uint32_t offset) const;
253
254 // Fill padding between offset start and offset end with 0s.
255 // Returns false if end > posting_list_used->size_in_bytes(). True,
256 // otherwise.
257 bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start,
258 uint32_t end) const;
259
260 // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
261 // hits in the posting list if the posting list contains less than limit
262 // number of hits. out can be NULL.
263 //
264 // NOTE: If called with limit=1, pop=true on a posting list that transitioned
265 // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
266 // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
267 // ALMOST_FULL.
268 //
269 // RETURNS:
270 // - OK on success
271 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
272 libtextclassifier3::Status GetHitsInternal(
273 const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
274 std::vector<Hit>* out) const;
275
276 // Retrieves the value stored in the index-th special hit.
277 //
278 // RETURNS:
279 // - A valid Hit, on success
280 // - INVALID_ARGUMENT if index is not less than kNumSpecialData
281 libtextclassifier3::StatusOr<Hit> GetSpecialHit(
282 const PostingListUsed* posting_list_used, uint32_t index) const;
283
284 // Sets the value stored in the index-th special hit to val. If index is not
285 // less than kSpecialHitSize / sizeof(Hit), this has no effect.
286 bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index,
287 const Hit& val) const;
288
289 // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
290 // returns the new beginning of the padded region.
291 //
292 // RETURNS:
293 // - The new beginning of the padded region, if successful.
294 // - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
295 // kSpecialHitsSize
296 libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
297 PostingListUsed* posting_list_used, const Hit& hit,
298 uint32_t offset) const;
299
300 // If hit has a term frequency, consumes the term frequency at offset, updates
301 // hit to include the term frequency and updates offset to reflect that the
302 // term frequency has been consumed.
303 //
304 // RETURNS:
305 // - OK, if successful
306 // - INVALID_ARGUMENT if hit has a term frequency and offset +
307 // sizeof(Hit::TermFrequency) >= posting_list_used->size_in_bytes()
308 libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
309 const PostingListUsed* posting_list_used, Hit* hit,
310 uint32_t* offset) const;
311 };
312
313 // Inlined functions. Implementation details below. Avert eyes!
314 template <class T, Hit (*GetHit)(const T&)>
PrependHitArray(PostingListUsed * posting_list_used,const T * array,uint32_t num_hits,bool keep_prepended)315 uint32_t PostingListHitSerializer::PrependHitArray(
316 PostingListUsed* posting_list_used, const T* array, uint32_t num_hits,
317 bool keep_prepended) const {
318 if (!IsPostingListValid(posting_list_used)) {
319 return 0;
320 }
321
322 // Prepend hits working backwards from array[num_hits - 1].
323 uint32_t i;
324 for (i = 0; i < num_hits; ++i) {
325 if (!PrependHit(posting_list_used, GetHit(array[num_hits - i - 1])).ok()) {
326 break;
327 }
328 }
329 if (i != num_hits && !keep_prepended) {
330 // Didn't fit. Undo everything and check that we have the same offset as
331 // before. PopFrontHits guarantees that it will remove all 'i' hits so long
332 // as there are at least 'i' hits in the posting list, which we know there
333 // are.
334 PopFrontHits(posting_list_used, /*num_hits=*/i);
335 }
336 return i;
337 }
338
339 } // namespace lib
340 } // namespace icing
341
342 #endif // ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
343