1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/main/posting-list-hit-serializer.h"
16
17 #include <cstdint>
18 #include <cstring>
19 #include <limits>
20 #include <vector>
21
22 #include "icing/absl_ports/canonical_errors.h"
23 #include "icing/file/posting_list/posting-list-used.h"
24 #include "icing/legacy/core/icing-string-util.h"
25 #include "icing/legacy/index/icing-bit-util.h"
26 #include "icing/util/logging.h"
27 #include "icing/util/status-macros.h"
28
29 namespace icing {
30 namespace lib {
31
32 namespace {
33
GetTermFrequencyByteSize(const Hit & hit)34 uint32_t GetTermFrequencyByteSize(const Hit& hit) {
35 return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
36 }
37
38 } // namespace
39
GetBytesUsed(const PostingListUsed * posting_list_used) const40 uint32_t PostingListHitSerializer::GetBytesUsed(
41 const PostingListUsed* posting_list_used) const {
42 // The special hits will be included if they represent actual hits. If they
43 // represent the hit offset or the invalid hit sentinel, they are not
44 // included.
45 return posting_list_used->size_in_bytes() -
46 GetStartByteOffset(posting_list_used);
47 }
48
GetMinPostingListSizeToFit(const PostingListUsed * posting_list_used) const49 uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit(
50 const PostingListUsed* posting_list_used) const {
51 if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
52 // If in either the FULL state or ALMOST_FULL state, this posting list *is*
53 // the minimum size posting list that can fit these hits. So just return the
54 // size of the posting list.
55 return posting_list_used->size_in_bytes();
56 }
57
58 // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
59 // posting list that would be guaranteed to fit these hits would be
60 // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
61 // special_hit(1) and the n compressed hits in the compressed region.
62 // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
63 // fitting these hits into a posting list would require BytesUsed plus one
64 // extra hit.
65 return GetBytesUsed(posting_list_used) + sizeof(Hit);
66 }
67
Clear(PostingListUsed * posting_list_used) const68 void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const {
69 // Safe to ignore return value because posting_list_used->size_in_bytes() is
70 // a valid argument.
71 SetStartByteOffset(posting_list_used,
72 /*offset=*/posting_list_used->size_in_bytes());
73 }
74
MoveFrom(PostingListUsed * dst,PostingListUsed * src) const75 libtextclassifier3::Status PostingListHitSerializer::MoveFrom(
76 PostingListUsed* dst, PostingListUsed* src) const {
77 ICING_RETURN_ERROR_IF_NULL(dst);
78 ICING_RETURN_ERROR_IF_NULL(src);
79 if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
80 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
81 "src MinPostingListSizeToFit %d must be larger than size %d.",
82 GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
83 }
84
85 if (!IsPostingListValid(dst)) {
86 return absl_ports::FailedPreconditionError(
87 "Dst posting list is in an invalid state and can't be used!");
88 }
89 if (!IsPostingListValid(src)) {
90 return absl_ports::InvalidArgumentError(
91 "Cannot MoveFrom an invalid src posting list!");
92 }
93
94 // Pop just enough hits that all of src's compressed hits fit in
95 // dst posting_list's compressed area. Then we can memcpy that area.
96 std::vector<Hit> hits;
97 while (IsFull(src) || IsAlmostFull(src) ||
98 (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) {
99 if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) {
100 return absl_ports::AbortedError(
101 "Unable to retrieve hits from src posting list.");
102 }
103 }
104
105 // memcpy the area and set up start byte offset.
106 Clear(dst);
107 memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
108 src->posting_list_buffer() + GetStartByteOffset(src),
109 GetBytesUsed(src));
110 // Because we popped all hits from src outside of the compressed area and we
111 // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
112 // kSpecialHitSize. This is guaranteed to be a valid byte offset for the
113 // NOT_FULL state, so ignoring the value is safe.
114 SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
115
116 // Put back remaining hits.
117 for (size_t i = 0; i < hits.size(); i++) {
118 const Hit& hit = hits[hits.size() - i - 1];
119 // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
120 // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
121 // should be impossible because we've already assured that there is enough
122 // room above.
123 ICING_RETURN_IF_ERROR(PrependHit(dst, hit));
124 }
125
126 Clear(src);
127 return libtextclassifier3::Status::OK;
128 }
129
GetPadEnd(const PostingListUsed * posting_list_used,uint32_t offset) const130 uint32_t PostingListHitSerializer::GetPadEnd(
131 const PostingListUsed* posting_list_used, uint32_t offset) const {
132 Hit::Value pad;
133 uint32_t pad_end = offset;
134 while (pad_end < posting_list_used->size_in_bytes()) {
135 size_t pad_len = VarInt::Decode(
136 posting_list_used->posting_list_buffer() + pad_end, &pad);
137 if (pad != 0) {
138 // No longer a pad.
139 break;
140 }
141 pad_end += pad_len;
142 }
143 return pad_end;
144 }
145
PadToEnd(PostingListUsed * posting_list_used,uint32_t start,uint32_t end) const146 bool PostingListHitSerializer::PadToEnd(PostingListUsed* posting_list_used,
147 uint32_t start, uint32_t end) const {
148 if (end > posting_list_used->size_in_bytes()) {
149 ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
150 return false;
151 }
152 // In VarInt a value of 0 encodes to 0.
153 memset(posting_list_used->posting_list_buffer() + start, 0, end - start);
154 return true;
155 }
156
// Prepends 'hit' to a posting list in the ALMOST_FULL state.
//
// In ALMOST_FULL, special position 1 holds the uncompressed first hit.
// Prepending tries to compress that existing first hit into the padded area
// as a varint delta (plus its term frequency, if any) and then stores 'hit'
// in special position 1. If the padding cannot absorb the delta, 'hit' is
// stored in special position 0 instead and the list transitions to FULL.
//
// Returns:
//   - OK on success
//   - INVALID_ARGUMENT if hit.value() is not strictly less than the current
//     first hit's value (hits must be prepended in decreasing value order)
libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull(
    PostingListUsed* posting_list_used, const Hit& hit) const {
  // Get delta between first hit and the new hit. Try to fit delta
  // in the padded area and put new hit at the special position 1.
  // Calling ValueOrDie is safe here because 1 < kNumSpecialData.
  Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie();
  if (cur.value() <= hit.value()) {
    return absl_ports::InvalidArgumentError(
        "Hit being prepended must be strictly less than the most recent Hit");
  }
  uint64_t delta = cur.value() - hit.value();
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len = VarInt::Encode(delta, delta_buf);
  uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);

  // Find where the padding (if any) after the special hits ends.
  uint32_t pad_end = GetPadEnd(posting_list_used,
                               /*offset=*/kSpecialHitsSize);

  if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) {
    // Pad area has enough space for delta and term_frequency of existing hit
    // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes.
    uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end -
                            delta_len - cur_term_frequency_bytes;
    memcpy(delta_offset, delta_buf, delta_len);
    // Now copy term_frequency (0 bytes copied if cur has none).
    Hit::TermFrequency term_frequency = cur.term_frequency();
    uint8_t* term_frequency_offset = delta_offset + delta_len;
    memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);

    // Now first hit is the new hit, at special position 1. Safe to ignore the
    // return value because 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // No space for delta. We put the new hit at special position 0
    // and go to the full state. Safe to ignore the return value because 1 <
    // kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
199
// Prepends 'hit' to an empty posting list.
//
// For a minimum-sized posting list (kSpecialHitsSize bytes) the hit is stored
// uncompressed in special position 1 and the list becomes ALMOST_FULL. For
// larger lists the hit is written uncompressed at the very end of the
// compressed region and the list becomes NOT_FULL.
void PostingListHitSerializer::PrependHitToEmpty(
    PostingListUsed* posting_list_used, const Hit& hit) const {
  // First hit to be added. Just add verbatim, no compression.
  if (posting_list_used->size_in_bytes() == kSpecialHitsSize) {
    // Safe to ignore the return value because 1 < kNumSpecialData
    SetSpecialHit(posting_list_used, /*index=*/1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // Since this is the first hit, size != kSpecialHitsSize and
    // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
    // the compressed region, so ValueOrDie is safe.
    uint32_t offset =
        PrependHitUncompressed(posting_list_used, hit,
                               /*offset=*/posting_list_used->size_in_bytes())
            .ValueOrDie();
    // Safe to ignore the return value because PrependHitUncompressed is
    // guaranteed to return a valid offset.
    SetStartByteOffset(posting_list_used, offset);
  }
}
221
// Prepends 'hit' to a posting list in the NOT_FULL state. 'offset' is the
// current start byte offset, pointing at the uncompressed first hit in the
// compressed region.
//
// Depending on the space freed by compressing the old first hit into a
// varint delta, the list either stays NOT_FULL (delta plus the new
// uncompressed hit fit in the compressed region), becomes ALMOST_FULL (only
// the delta fits; the new hit goes to special position 1), or becomes FULL
// (not even the delta fits; the old first hit moves to special position 1
// and the new hit to special position 0).
//
// Returns:
//   - OK on success
//   - INVALID_ARGUMENT if hit.value() is not strictly less than the current
//     first hit's value
//   - FAILED_PRECONDITION if the posting list is internally inconsistent
libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
    PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
  // First hit in compressed area. It is uncompressed. See if delta
  // between the first hit and new hit will still fit in the
  // compressed area.
  if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) {
    // The first hit in the compressed region *should* be uncompressed, but
    // somehow there isn't enough room between offset and the end of the
    // compressed area to fit an uncompressed hit. This should NEVER happen.
    return absl_ports::FailedPreconditionError(
        "Posting list is in an invalid state.");
  }
  Hit::Value cur_value;
  memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset,
         sizeof(Hit::Value));
  if (cur_value <= hit.value()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "Hit %d being prepended must be strictly less than the most recent "
        "Hit %d",
        hit.value(), cur_value));
  }
  uint64_t delta = cur_value - hit.value();
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len = VarInt::Encode(delta, delta_buf);
  uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);

  // offset now points to one past the end of the first hit.
  offset += sizeof(Hit::Value);
  if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
          hit_term_frequency_bytes <=
      offset) {
    // Enough space for delta in compressed area.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend new hit with (possibly) its term_frequency. We know that there is
    // room for 'hit' because of the if statement above, so calling ValueOrDie
    // is safe.
    offset =
        PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie();
    // offset is guaranteed to be valid here. So it's safe to ignore the return
    // value. The if above will guarantee that offset >= kSpecialHitSize and <
    // posting_list_used->size_in_bytes() because the if ensures that there is
    // enough room between offset and kSpecialHitSize to fit the delta of the
    // previous hit, any term_frequency and the uncompressed hit.
    SetStartByteOffset(posting_list_used, offset);
  } else if (kSpecialHitsSize + delta_len <= offset) {
    // Only have space for delta. The new hit must be put in special
    // position 1.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend pad. Safe to ignore the return value of PadToEnd because offset
    // must be less than posting_list_used->size_in_bytes(). Otherwise, this
    // function already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);

    // Put new hit in special position 1. Safe to ignore return value because 1
    // < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);

    // State almost_full. Safe to ignore the return value because sizeof(Hit) is
    // a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // Very rare case where delta is larger than sizeof(Hit::Value)
    // (i.e. varint delta encoding expanded required storage). We
    // move first hit to special position 1 and put new hit in
    // special position 0.
    Hit cur(cur_value);
    // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
    // Therefore, offset must be less than kSpecialHitSize + 5. Since posting
    // list size must be divisible by sizeof(Hit) (5), it is guaranteed that
    // offset < size_in_bytes, so it is safe to ignore the return value here.
    ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset);
    // Safe to ignore the return value of PadToEnd because offset must be less
    // than posting_list_used->size_in_bytes(). Otherwise, this function
    // already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);
    // Safe to ignore the return value here because 0 and 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, cur);
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
315
PrependHit(PostingListUsed * posting_list_used,const Hit & hit) const316 libtextclassifier3::Status PostingListHitSerializer::PrependHit(
317 PostingListUsed* posting_list_used, const Hit& hit) const {
318 static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
319 "Hit::Value cannot be larger than 8 bytes because the delta "
320 "must be able to fit in 8 bytes.");
321 if (!hit.is_valid()) {
322 return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
323 }
324 if (!IsPostingListValid(posting_list_used)) {
325 return absl_ports::FailedPreconditionError(
326 "This PostingListUsed is in an invalid state and can't add any hits!");
327 }
328
329 if (IsFull(posting_list_used)) {
330 // State full: no space left.
331 return absl_ports::ResourceExhaustedError("No more room for hits");
332 } else if (IsAlmostFull(posting_list_used)) {
333 return PrependHitToAlmostFull(posting_list_used, hit);
334 } else if (IsEmpty(posting_list_used)) {
335 PrependHitToEmpty(posting_list_used, hit);
336 return libtextclassifier3::Status::OK;
337 } else {
338 uint32_t offset = GetStartByteOffset(posting_list_used);
339 return PrependHitToNotFull(posting_list_used, hit, offset);
340 }
341 }
342
343 libtextclassifier3::StatusOr<std::vector<Hit>>
GetHits(const PostingListUsed * posting_list_used) const344 PostingListHitSerializer::GetHits(
345 const PostingListUsed* posting_list_used) const {
346 std::vector<Hit> hits_out;
347 ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out));
348 return hits_out;
349 }
350
GetHits(const PostingListUsed * posting_list_used,std::vector<Hit> * hits_out) const351 libtextclassifier3::Status PostingListHitSerializer::GetHits(
352 const PostingListUsed* posting_list_used,
353 std::vector<Hit>* hits_out) const {
354 return GetHitsInternal(posting_list_used,
355 /*limit=*/std::numeric_limits<uint32_t>::max(),
356 /*pop=*/false, hits_out);
357 }
358
// Removes the 'num_hits' most-recently-prepended hits from the posting list.
//
// Popping exactly one hit from a FULL posting list is handled specially: two
// hits are popped and the second is prepended again, so the list returns to
// the exact state (ALMOST_FULL or NOT_FULL) it was in before the last
// prepend. This keeps the prepend operation reversible — see the detailed
// diagrams below.
libtextclassifier3::Status PostingListHitSerializer::PopFrontHits(
    PostingListUsed* posting_list_used, uint32_t num_hits) const {
  if (num_hits == 1 && IsFull(posting_list_used)) {
    // The PL is in full status which means that we save 2 uncompressed hits in
    // the 2 special postions. But full status may be reached by 2 different
    // statuses.
    // (1) In "almost full" status
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // When we prepend another hit, we can only put it at the special
    // position 0. And we get a full PL
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2) In "not full" status
    // +-----------------+----------------+------+-------+------------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
    // +-----------------+----------------+------+-------+------------------+
    // When we prepend another hit, we can reach any of the 3 following
    // scenarios:
    // (2.1) not full
    // if the space of pad and original 1st hit can accommodate the new 1st hit
    // and the encoded delta value.
    // +-----------------+----------------+------+-----------+-----------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
    // +-----------------+----------------+------+-----------+-----------------+
    // (2.2) almost full
    // If the space of pad and original 1st hit cannot accommodate the new 1st
    // hit and the encoded delta value but can accommodate the encoded delta
    // value only. We can put the new 1st hit at special position 1.
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |new 1st hit     |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2.3) full
    // In very rare case, it cannot even accommodate only the encoded delta
    // value. we can move the original 1st hit into special position 1 and the
    // new 1st hit into special position 0. This may happen because we use
    // VarInt encoding method which may make the encoded value longer (about
    // 4/3 times of original)
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // Suppose now the PL is full. But we don't know whether it arrived to
    // this status from "not full" like (2.3) or from "almost full" like (1).
    // We'll return to "almost full" status like (1) if we simply pop the new
    // 1st hit but we want to make the prepending operation "reversible". So
    // there should be some way to return to "not full" if possible. A simple
    // way to do it is to pop 2 hits out of the PL to status "almost full" or
    // "not full". And add the original 1st hit back. We can return to the
    // correct original statuses of (2.1) or (1). This makes our prepending
    // operation reversible.
    std::vector<Hit> out;

    // Popping 2 hits should never fail because we've just ensured that the
    // posting list is in the FULL state.
    ICING_RETURN_IF_ERROR(
        GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));

    // PrependHit should never fail because out[1] is a valid hit less than
    // previous hits in the posting list and because there's no way that the
    // posting list could run out of room because it previously stored this hit
    // AND another hit.
    PrependHit(posting_list_used, out[1]);
  } else if (num_hits > 0) {
    return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true,
                           nullptr);
  }
  return libtextclassifier3::Status::OK;
}
429
// Traverses up to 'limit' hits from the posting list, newest (smallest value)
// first.
//
// Args:
//   limit: maximum number of hits to traverse.
//   pop: if true, the traversed hits are also removed and the posting list is
//     reconstructed so the remaining hits form a consistent state.
//   out: if non-null, traversed hits are appended; cleared on corruption.
//
// Returns INTERNAL if the posting list data is corrupted — i.e. a hit claims
// a term frequency that would lie past the end of the buffer.
libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
    const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
    std::vector<Hit>* out) const {
  // Put current uncompressed val here.
  Hit::Value val = Hit::kInvalidValue;
  uint32_t offset = GetStartByteOffset(posting_list_used);
  uint32_t count = 0;

  // First traverse the first two special positions.
  while (count < limit && offset < kSpecialHitsSize) {
    // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
    // kNumSpecialData because of the check above.
    Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit))
                  .ValueOrDie();
    val = hit.value();
    if (out != nullptr) {
      out->push_back(hit);
    }
    offset += sizeof(Hit);
    count++;
  }

  // If special position 1 was set then we need to skip padding.
  if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) {
    offset = GetPadEnd(posting_list_used, offset);
  }

  // Walk the compressed region: the first hit is stored uncompressed, each
  // subsequent hit as a varint delta added onto the previous value.
  while (count < limit && offset < posting_list_used->size_in_bytes()) {
    if (val == Hit::kInvalidValue) {
      // First hit is in compressed area. Put that in val.
      memcpy(&val, posting_list_used->posting_list_buffer() + offset,
             sizeof(Hit::Value));
      offset += sizeof(Hit::Value);
    } else {
      // Now we have delta encoded subsequent hits. Decode and push.
      uint64_t delta;
      offset += VarInt::Decode(
          posting_list_used->posting_list_buffer() + offset, &delta);
      val += delta;
    }
    Hit hit(val);
    libtextclassifier3::Status status =
        ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
    if (!status.ok()) {
      // This posting list has been corrupted somehow. The first hit of the
      // posting list claims to have a term frequency, but there's no more room
      // in the posting list for that term frequency to exist. Return an empty
      // vector and zero to indicate no hits retrieved.
      if (out != nullptr) {
        out->clear();
      }
      return absl_ports::InternalError("Posting list has been corrupted!");
    }
    if (out != nullptr) {
      out->push_back(hit);
    }
    count++;
  }

  if (pop) {
    PostingListUsed* mutable_posting_list_used =
        const_cast<PostingListUsed*>(posting_list_used);
    // Modify the posting list so that we pop all hits actually
    // traversed.
    if (offset >= kSpecialHitsSize &&
        offset < posting_list_used->size_in_bytes()) {
      // In the compressed area. Pop and reconstruct. offset/val is
      // the last traversed hit, which we must discard. So move one
      // more forward.
      uint64_t delta;
      offset += VarInt::Decode(
          posting_list_used->posting_list_buffer() + offset, &delta);
      val += delta;

      // Now val is the first hit of the new posting list.
      if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
        // val fits in compressed area. Simply copy.
        offset -= sizeof(Hit::Value);
        memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val,
               sizeof(Hit::Value));
      } else {
        // val won't fit in compressed area. Also see if there is a
        // term_frequency.
        Hit hit(val);
        libtextclassifier3::Status status =
            ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
        if (!status.ok()) {
          // This posting list has been corrupted somehow. The first hit of
          // the posting list claims to have a term frequency, but there's no
          // more room in the posting list for that term frequency to exist.
          // Return an empty vector and zero to indicate no hits retrieved. Do
          // not pop anything.
          if (out != nullptr) {
            out->clear();
          }
          return absl_ports::InternalError("Posting list has been corrupted!");
        }
        // Okay to ignore the return value here because 1 < kNumSpecialData.
        SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit);

        // Prepend pad. Safe to ignore the return value of PadToEnd because
        // offset must be less than posting_list_used->size_in_bytes() thanks to
        // the if above.
        PadToEnd(mutable_posting_list_used,
                 /*start=*/kSpecialHitsSize,
                 /*end=*/offset);
        offset = sizeof(Hit);
      }
    }
    // offset is guaranteed to be valid so ignoring the return value of
    // set_start_byte_offset is safe. It falls into one of four scenarios:
    // Scenario 1: the above if was false because offset is not <
    //             posting_list_used->size_in_bytes()
    //   In this case, offset must be == posting_list_used->size_in_bytes()
    //   because we reached offset by unwinding hits on the posting list.
    // Scenario 2: offset is < kSpecialHitSize
    //   In this case, offset is guaranteed to be either 0 or sizeof(Hit)
    //   because offset is incremented by sizeof(Hit) within the first while
    //   loop.
    // Scenario 3: offset is within the compressed region and the new first hit
    //   in the posting list (the value that 'val' holds) will fit as an
    //   uncompressed hit in the compressed region. The resulting offset from
    //   decompressing val must be >= kSpecialHitSize because otherwise we'd be
    //   in Scenario 4
    // Scenario 4: offset is within the compressed region, but the new first hit
    //   in the posting list is too large to fit as an uncompressed hit in the
    //   in the compressed region. Therefore, it must be stored in a special hit
    //   and offset will be sizeof(Hit).
    SetStartByteOffset(mutable_posting_list_used, offset);
  }

  return libtextclassifier3::Status::OK;
}
563
GetSpecialHit(const PostingListUsed * posting_list_used,uint32_t index) const564 libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit(
565 const PostingListUsed* posting_list_used, uint32_t index) const {
566 static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
567 if (index >= kNumSpecialData || index < 0) {
568 return absl_ports::InvalidArgumentError(
569 "Special hits only exist at indices 0 and 1");
570 }
571 Hit val;
572 memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val),
573 sizeof(val));
574 return val;
575 }
576
SetSpecialHit(PostingListUsed * posting_list_used,uint32_t index,const Hit & val) const577 bool PostingListHitSerializer::SetSpecialHit(PostingListUsed* posting_list_used,
578 uint32_t index,
579 const Hit& val) const {
580 if (index >= kNumSpecialData || index < 0) {
581 ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
582 return false;
583 }
584 memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val,
585 sizeof(val));
586 return true;
587 }
588
IsPostingListValid(const PostingListUsed * posting_list_used) const589 bool PostingListHitSerializer::IsPostingListValid(
590 const PostingListUsed* posting_list_used) const {
591 if (IsAlmostFull(posting_list_used)) {
592 // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
593 // know that 1 < kNumSpecialData.
594 if (!GetSpecialHit(posting_list_used, /*index=*/1)
595 .ValueOrDie()
596 .is_valid()) {
597 ICING_LOG(ERROR)
598 << "Both special hits cannot be invalid at the same time.";
599 return false;
600 }
601 } else if (!IsFull(posting_list_used)) {
602 // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
603 // safe because we know that 0 < kNumSpecialData.
604 if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() >
605 posting_list_used->size_in_bytes() ||
606 GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() <
607 kSpecialHitsSize) {
608 ICING_LOG(ERROR)
609 << "Hit: "
610 << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value()
611 << " size: " << posting_list_used->size_in_bytes()
612 << " sp size: " << kSpecialHitsSize;
613 return false;
614 }
615 }
616 return true;
617 }
618
GetStartByteOffset(const PostingListUsed * posting_list_used) const619 uint32_t PostingListHitSerializer::GetStartByteOffset(
620 const PostingListUsed* posting_list_used) const {
621 if (IsFull(posting_list_used)) {
622 return 0;
623 } else if (IsAlmostFull(posting_list_used)) {
624 return sizeof(Hit);
625 } else {
626 // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
627 // kNumSpecialData.
628 return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value();
629 }
630 }
631
// Records 'offset' as the start of the hit data and updates the special hits
// to encode the posting list's state.
//
// Valid offsets are 0 (FULL), sizeof(Hit) (ALMOST_FULL), or any value in
// [kSpecialHitsSize, size_in_bytes()] (NOT_FULL). Any other value is
// rejected: returns false and logs an error.
bool PostingListHitSerializer::SetStartByteOffset(
    PostingListUsed* posting_list_used, uint32_t offset) const {
  if (offset > posting_list_used->size_in_bytes()) {
    ICING_LOG(ERROR) << "offset cannot be a value greater than size "
                     << posting_list_used->size_in_bytes() << ". offset is "
                     << offset << ".";
    return false;
  }
  // Offsets strictly between sizeof(Hit) and kSpecialHitsSize would land in
  // the middle of special hit 1 — never valid.
  if (offset < kSpecialHitsSize && offset > sizeof(Hit)) {
    ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
                     << ", " << kSpecialHitsSize << "). offset is " << offset
                     << ".";
    return false;
  }
  // Offsets strictly between 0 and sizeof(Hit) would land in the middle of
  // special hit 0 — never valid.
  if (offset < sizeof(Hit) && offset != 0) {
    ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
                     << "). offset is " << offset << ".";
    return false;
  }
  if (offset >= kSpecialHitsSize) {
    // not_full state. Safe to ignore the return value because 0 and 1 are both
    // < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset));
    SetSpecialHit(posting_list_used, /*index=*/1, Hit());
  } else if (offset == sizeof(Hit)) {
    // almost_full state. Safe to ignore the return value because 0 <
    // kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/0, Hit());
  }
  // Nothing to do for the FULL state - the offset isn't actually stored
  // anywhere and both special hits hold valid hits.
  return true;
}
665
// Writes 'hit' uncompressed immediately before 'offset' in the posting list
// buffer and returns the new (smaller) offset of the written data.
//
// A hit carrying a term frequency is written as a full Hit (value plus term
// frequency, sizeof(Hit) bytes); one without is written as just its
// Hit::Value. Returns INVALID_ARGUMENT if there is not enough room between
// kSpecialHitsSize and 'offset' to hold the data.
libtextclassifier3::StatusOr<uint32_t>
PostingListHitSerializer::PrependHitUncompressed(
    PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
  if (hit.has_term_frequency()) {
    if (offset < kSpecialHitsSize + sizeof(Hit)) {
      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
          "Not enough room to prepend Hit at offset %d.", offset));
    }
    offset -= sizeof(Hit);
    memcpy(posting_list_used->posting_list_buffer() + offset, &hit,
           sizeof(Hit));
  } else {
    if (offset < kSpecialHitsSize + sizeof(Hit::Value)) {
      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
          "Not enough room to prepend Hit::Value at offset %d.", offset));
    }
    offset -= sizeof(Hit::Value);
    Hit::Value val = hit.value();
    memcpy(posting_list_used->posting_list_buffer() + offset, &val,
           sizeof(Hit::Value));
  }
  return offset;
}
689
690 libtextclassifier3::Status
ConsumeTermFrequencyIfPresent(const PostingListUsed * posting_list_used,Hit * hit,uint32_t * offset) const691 PostingListHitSerializer::ConsumeTermFrequencyIfPresent(
692 const PostingListUsed* posting_list_used, Hit* hit,
693 uint32_t* offset) const {
694 if (!hit->has_term_frequency()) {
695 // No term frequency to consume. Everything is fine.
696 return libtextclassifier3::Status::OK;
697 }
698 if (*offset + sizeof(Hit::TermFrequency) >
699 posting_list_used->size_in_bytes()) {
700 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
701 "offset %d must not point past the end of the posting list of size %d.",
702 *offset, posting_list_used->size_in_bytes()));
703 }
704 Hit::TermFrequency term_frequency;
705 memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
706 sizeof(Hit::TermFrequency));
707 *hit = Hit(hit->value(), term_frequency);
708 *offset += sizeof(Hit::TermFrequency);
709 return libtextclassifier3::Status::OK;
710 }
711
712 } // namespace lib
713 } // namespace icing
714