• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/main/posting-list-hit-serializer.h"
16 
17 #include <cstdint>
18 #include <cstring>
19 #include <limits>
20 #include <vector>
21 
22 #include "icing/absl_ports/canonical_errors.h"
23 #include "icing/file/posting_list/posting-list-used.h"
24 #include "icing/legacy/core/icing-string-util.h"
25 #include "icing/legacy/index/icing-bit-util.h"
26 #include "icing/util/logging.h"
27 #include "icing/util/status-macros.h"
28 
29 namespace icing {
30 namespace lib {
31 
32 namespace {
33 
GetTermFrequencyByteSize(const Hit & hit)34 uint32_t GetTermFrequencyByteSize(const Hit& hit) {
35   return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
36 }
37 
38 }  // namespace
39 
GetBytesUsed(const PostingListUsed * posting_list_used) const40 uint32_t PostingListHitSerializer::GetBytesUsed(
41     const PostingListUsed* posting_list_used) const {
42   // The special hits will be included if they represent actual hits. If they
43   // represent the hit offset or the invalid hit sentinel, they are not
44   // included.
45   return posting_list_used->size_in_bytes() -
46          GetStartByteOffset(posting_list_used);
47 }
48 
GetMinPostingListSizeToFit(const PostingListUsed * posting_list_used) const49 uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit(
50     const PostingListUsed* posting_list_used) const {
51   if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
52     // If in either the FULL state or ALMOST_FULL state, this posting list *is*
53     // the minimum size posting list that can fit these hits. So just return the
54     // size of the posting list.
55     return posting_list_used->size_in_bytes();
56   }
57 
58   // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
59   // posting list that would be guaranteed to fit these hits would be
60   // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
61   // special_hit(1) and the n compressed hits in the compressed region.
62   // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
63   // fitting these hits into a posting list would require BytesUsed plus one
64   // extra hit.
65   return GetBytesUsed(posting_list_used) + sizeof(Hit);
66 }
67 
Clear(PostingListUsed * posting_list_used) const68 void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const {
69   // Safe to ignore return value because posting_list_used->size_in_bytes() is
70   // a valid argument.
71   SetStartByteOffset(posting_list_used,
72                      /*offset=*/posting_list_used->size_in_bytes());
73 }
74 
// Moves all hits from 'src' into 'dst', then clears 'src'. On success, 'dst'
// holds exactly the hits that 'src' held before the call.
//
// Returns:
//   - INVALID_ARGUMENT if 'dst'/'src' is null, 'dst' is too small to fit all
//     of src's hits, or 'src' is in an invalid state.
//   - FAILED_PRECONDITION if 'dst' is in an invalid state.
//   - ABORTED if hits cannot be retrieved from 'src'.
libtextclassifier3::Status PostingListHitSerializer::MoveFrom(
    PostingListUsed* dst, PostingListUsed* src) const {
  ICING_RETURN_ERROR_IF_NULL(dst);
  ICING_RETURN_ERROR_IF_NULL(src);
  if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "src MinPostingListSizeToFit %d must be larger than size %d.",
        GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
  }

  if (!IsPostingListValid(dst)) {
    return absl_ports::FailedPreconditionError(
        "Dst posting list is in an invalid state and can't be used!");
  }
  if (!IsPostingListValid(src)) {
    return absl_ports::InvalidArgumentError(
        "Cannot MoveFrom an invalid src posting list!");
  }

  // Pop just enough hits that all of src's compressed hits fit in
  // dst posting_list's compressed area. Then we can memcpy that area. The
  // popped hits are re-prepended individually further below.
  std::vector<Hit> hits;
  while (IsFull(src) || IsAlmostFull(src) ||
         (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) {
    if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) {
      return absl_ports::AbortedError(
          "Unable to retrieve hits from src posting list.");
    }
  }

  // memcpy the area and set up start byte offset.
  Clear(dst);
  memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
         src->posting_list_buffer() + GetStartByteOffset(src),
         GetBytesUsed(src));
  // Because we popped all hits from src outside of the compressed area and we
  // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
  // kSpecialHitSize. This is guaranteed to be a valid byte offset for the
  // NOT_FULL state, so ignoring the value is safe.
  SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));

  // Put back remaining hits, iterating from the back of 'hits' (the oldest
  // popped hit) so each prepended hit stays smaller than the previous one.
  for (size_t i = 0; i < hits.size(); i++) {
    const Hit& hit = hits[hits.size() - i - 1];
    // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
    // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
    // should be impossible because we've already assured that there is enough
    // room above.
    ICING_RETURN_IF_ERROR(PrependHit(dst, hit));
  }

  Clear(src);
  return libtextclassifier3::Status::OK;
}
129 
GetPadEnd(const PostingListUsed * posting_list_used,uint32_t offset) const130 uint32_t PostingListHitSerializer::GetPadEnd(
131     const PostingListUsed* posting_list_used, uint32_t offset) const {
132   Hit::Value pad;
133   uint32_t pad_end = offset;
134   while (pad_end < posting_list_used->size_in_bytes()) {
135     size_t pad_len = VarInt::Decode(
136         posting_list_used->posting_list_buffer() + pad_end, &pad);
137     if (pad != 0) {
138       // No longer a pad.
139       break;
140     }
141     pad_end += pad_len;
142   }
143   return pad_end;
144 }
145 
PadToEnd(PostingListUsed * posting_list_used,uint32_t start,uint32_t end) const146 bool PostingListHitSerializer::PadToEnd(PostingListUsed* posting_list_used,
147                                         uint32_t start, uint32_t end) const {
148   if (end > posting_list_used->size_in_bytes()) {
149     ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
150     return false;
151   }
152   // In VarInt a value of 0 encodes to 0.
153   memset(posting_list_used->posting_list_buffer() + start, 0, end - start);
154   return true;
155 }
156 
// Prepends 'hit' to a posting list in the ALMOST_FULL state. If the delta
// between 'hit' and the current first hit (plus the first hit's term
// frequency) fits in the pad region, the list stays ALMOST_FULL with 'hit'
// in special position 1; otherwise 'hit' is stored uncompressed in special
// position 0 and the list transitions to FULL.
//
// Returns INVALID_ARGUMENT if 'hit' is not strictly less than the current
// first hit.
libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull(
    PostingListUsed* posting_list_used, const Hit& hit) const {
  // Get delta between first hit and the new hit. Try to fit delta
  // in the padded area and put new hit at the special position 1.
  // Calling ValueOrDie is safe here because 1 < kNumSpecialData.
  Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie();
  if (cur.value() <= hit.value()) {
    return absl_ports::InvalidArgumentError(
        "Hit being prepended must be strictly less than the most recent Hit");
  }
  uint64_t delta = cur.value() - hit.value();
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len = VarInt::Encode(delta, delta_buf);
  uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);

  uint32_t pad_end = GetPadEnd(posting_list_used,
                               /*offset=*/kSpecialHitsSize);

  if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) {
    // Pad area has enough space for delta and term_frequency of existing hit
    // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes.
    uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end -
                            delta_len - cur_term_frequency_bytes;
    memcpy(delta_offset, delta_buf, delta_len);
    // Now copy term_frequency, directly after the delta.
    Hit::TermFrequency term_frequency = cur.term_frequency();
    uint8_t* term_frequency_offset = delta_offset + delta_len;
    memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);

    // Now first hit is the new hit, at special position 1. Safe to ignore the
    // return value because 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // No space for delta. We put the new hit at special position 0
    // and go to the full state. Safe to ignore the return value because 1 <
    // kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
199 
PrependHitToEmpty(PostingListUsed * posting_list_used,const Hit & hit) const200 void PostingListHitSerializer::PrependHitToEmpty(
201     PostingListUsed* posting_list_used, const Hit& hit) const {
202   // First hit to be added. Just add verbatim, no compression.
203   if (posting_list_used->size_in_bytes() == kSpecialHitsSize) {
204     // Safe to ignore the return value because 1 < kNumSpecialData
205     SetSpecialHit(posting_list_used, /*index=*/1, hit);
206     // Safe to ignore the return value because sizeof(Hit) is a valid argument.
207     SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
208   } else {
209     // Since this is the first hit, size != kSpecialHitsSize and
210     // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
211     // the compressed region, so ValueOrDie is safe.
212     uint32_t offset =
213         PrependHitUncompressed(posting_list_used, hit,
214                                /*offset=*/posting_list_used->size_in_bytes())
215             .ValueOrDie();
216     // Safe to ignore the return value because PrependHitUncompressed is
217     // guaranteed to return a valid offset.
218     SetStartByteOffset(posting_list_used, offset);
219   }
220 }
221 
// Prepends 'hit' to a posting list in the NOT_FULL state, whose compressed
// region starts at 'offset'. Depending on available space the list either
// stays NOT_FULL (delta + uncompressed hit fit in the compressed region),
// becomes ALMOST_FULL (only the delta fits; 'hit' moves to special position
// 1), or becomes FULL (not even the delta fits; both first hits move to the
// special positions).
//
// Returns:
//   - FAILED_PRECONDITION if the region at 'offset' cannot hold an
//     uncompressed hit (which indicates a corrupt posting list).
//   - INVALID_ARGUMENT if 'hit' is not strictly less than the current first
//     hit.
libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
    PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
  // First hit in compressed area. It is uncompressed. See if delta
  // between the first hit and new hit will still fit in the
  // compressed area.
  if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) {
    // The first hit in the compressed region *should* be uncompressed, but
    // somehow there isn't enough room between offset and the end of the
    // compressed area to fit an uncompressed hit. This should NEVER happen.
    return absl_ports::FailedPreconditionError(
        "Posting list is in an invalid state.");
  }
  Hit::Value cur_value;
  memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset,
         sizeof(Hit::Value));
  if (cur_value <= hit.value()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "Hit %d being prepended must be strictly less than the most recent "
        "Hit %d",
        hit.value(), cur_value));
  }
  uint64_t delta = cur_value - hit.value();
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len = VarInt::Encode(delta, delta_buf);
  uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);

  // offset now points to one past the end of the first hit.
  offset += sizeof(Hit::Value);
  if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
          hit_term_frequency_bytes <=
      offset) {
    // Enough space for delta in compressed area.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend new hit with (possibly) its term_frequency. We know that there is
    // room for 'hit' because of the if statement above, so calling ValueOrDie
    // is safe.
    offset =
        PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie();
    // offset is guaranteed to be valid here. So it's safe to ignore the return
    // value. The if above will guarantee that offset >= kSpecialHitSize and <
    // posting_list_used->size_in_bytes() because the if ensures that there is
    // enough room between offset and kSpecialHitSize to fit the delta of the
    // previous hit, any term_frequency and the uncompressed hit.
    SetStartByteOffset(posting_list_used, offset);
  } else if (kSpecialHitsSize + delta_len <= offset) {
    // Only have space for delta. The new hit must be put in special
    // position 1.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend pad. Safe to ignore the return value of PadToEnd because offset
    // must be less than posting_list_used->size_in_bytes(). Otherwise, this
    // function already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);

    // Put new hit in special position 1. Safe to ignore return value because 1
    // < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);

    // State almost_full. Safe to ignore the return value because sizeof(Hit) is
    // a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // Very rare case where delta is larger than sizeof(Hit::Value)
    // (i.e. varint delta encoding expanded required storage). We
    // move first hit to special position 1 and put new hit in
    // special position 0.
    Hit cur(cur_value);
    // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
    // Therefore, offset must be less than kSpecialHitSize + 5. Since posting
    // list size must be divisible by sizeof(Hit) (5), it is guaranteed that
    // offset < size_in_bytes, so it is safe to ignore the return value here.
    ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset);
    // Safe to ignore the return value of PadToEnd because offset must be less
    // than posting_list_used->size_in_bytes(). Otherwise, this function
    // already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);
    // Safe to ignore the return value here because 0 and 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, cur);
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
315 
PrependHit(PostingListUsed * posting_list_used,const Hit & hit) const316 libtextclassifier3::Status PostingListHitSerializer::PrependHit(
317     PostingListUsed* posting_list_used, const Hit& hit) const {
318   static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
319                 "Hit::Value cannot be larger than 8 bytes because the delta "
320                 "must be able to fit in 8 bytes.");
321   if (!hit.is_valid()) {
322     return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
323   }
324   if (!IsPostingListValid(posting_list_used)) {
325     return absl_ports::FailedPreconditionError(
326         "This PostingListUsed is in an invalid state and can't add any hits!");
327   }
328 
329   if (IsFull(posting_list_used)) {
330     // State full: no space left.
331     return absl_ports::ResourceExhaustedError("No more room for hits");
332   } else if (IsAlmostFull(posting_list_used)) {
333     return PrependHitToAlmostFull(posting_list_used, hit);
334   } else if (IsEmpty(posting_list_used)) {
335     PrependHitToEmpty(posting_list_used, hit);
336     return libtextclassifier3::Status::OK;
337   } else {
338     uint32_t offset = GetStartByteOffset(posting_list_used);
339     return PrependHitToNotFull(posting_list_used, hit, offset);
340   }
341 }
342 
343 libtextclassifier3::StatusOr<std::vector<Hit>>
GetHits(const PostingListUsed * posting_list_used) const344 PostingListHitSerializer::GetHits(
345     const PostingListUsed* posting_list_used) const {
346   std::vector<Hit> hits_out;
347   ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out));
348   return hits_out;
349 }
350 
GetHits(const PostingListUsed * posting_list_used,std::vector<Hit> * hits_out) const351 libtextclassifier3::Status PostingListHitSerializer::GetHits(
352     const PostingListUsed* posting_list_used,
353     std::vector<Hit>* hits_out) const {
354   return GetHitsInternal(posting_list_used,
355                          /*limit=*/std::numeric_limits<uint32_t>::max(),
356                          /*pop=*/false, hits_out);
357 }
358 
PopFrontHits(PostingListUsed * posting_list_used,uint32_t num_hits) const359 libtextclassifier3::Status PostingListHitSerializer::PopFrontHits(
360     PostingListUsed* posting_list_used, uint32_t num_hits) const {
361   if (num_hits == 1 && IsFull(posting_list_used)) {
362     // The PL is in full status which means that we save 2 uncompressed hits in
363     // the 2 special postions. But full status may be reached by 2 different
364     // statuses.
365     // (1) In "almost full" status
366     // +-----------------+----------------+-------+-----------------+
367     // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
368     // +-----------------+----------------+-------+-----------------+
369     // When we prepend another hit, we can only put it at the special
370     // position 0. And we get a full PL
371     // +-----------------+----------------+-------+-----------------+
372     // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
373     // +-----------------+----------------+-------+-----------------+
374     // (2) In "not full" status
375     // +-----------------+----------------+------+-------+------------------+
376     // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
377     // +-----------------+----------------+------+-------+------------------+
378     // When we prepend another hit, we can reach any of the 3 following
379     // scenarios:
380     // (2.1) not full
381     // if the space of pad and original 1st hit can accommodate the new 1st hit
382     // and the encoded delta value.
383     // +-----------------+----------------+------+-----------+-----------------+
384     // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
385     // +-----------------+----------------+------+-----------+-----------------+
386     // (2.2) almost full
387     // If the space of pad and original 1st hit cannot accommodate the new 1st
388     // hit and the encoded delta value but can accommodate the encoded delta
389     // value only. We can put the new 1st hit at special position 1.
390     // +-----------------+----------------+-------+-----------------+
391     // |Hit::kInvalidVal |new 1st hit     |(pad)  |(compressed) hits|
392     // +-----------------+----------------+-------+-----------------+
393     // (2.3) full
394     // In very rare case, it cannot even accommodate only the encoded delta
395     // value. we can move the original 1st hit into special position 1 and the
396     // new 1st hit into special position 0. This may happen because we use
397     // VarInt encoding method which may make the encoded value longer (about
398     // 4/3 times of original)
399     // +-----------------+----------------+-------+-----------------+
400     // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
401     // +-----------------+----------------+-------+-----------------+
402     // Suppose now the PL is full. But we don't know whether it arrived to
403     // this status from "not full" like (2.3) or from "almost full" like (1).
404     // We'll return to "almost full" status like (1) if we simply pop the new
405     // 1st hit but we want to make the prepending operation "reversible". So
406     // there should be some way to return to "not full" if possible. A simple
407     // way to do it is to pop 2 hits out of the PL to status "almost full" or
408     // "not full".  And add the original 1st hit back. We can return to the
409     // correct original statuses of (2.1) or (1). This makes our prepending
410     // operation reversible.
411     std::vector<Hit> out;
412 
413     // Popping 2 hits should never fail because we've just ensured that the
414     // posting list is in the FULL state.
415     ICING_RETURN_IF_ERROR(
416         GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
417 
418     // PrependHit should never fail because out[1] is a valid hit less than
419     // previous hits in the posting list and because there's no way that the
420     // posting list could run out of room because it previously stored this hit
421     // AND another hit.
422     PrependHit(posting_list_used, out[1]);
423   } else if (num_hits > 0) {
424     return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true,
425                            nullptr);
426   }
427   return libtextclassifier3::Status::OK;
428 }
429 
// Traverses hits from most recent to oldest, appending up to 'limit' of them
// to 'out' (when 'out' is non-null). When 'pop' is true, every traversed hit
// is also removed from the posting list and the remaining layout is
// reconstructed so that the list stays valid.
//
// Returns INTERNAL if the posting list is corrupted (a hit claims a term
// frequency but there is no room left for it); in that case 'out' is cleared
// and, in the pop path, nothing is popped.
libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
    const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
    std::vector<Hit>* out) const {
  // Put current uncompressed val here.
  Hit::Value val = Hit::kInvalidValue;
  uint32_t offset = GetStartByteOffset(posting_list_used);
  uint32_t count = 0;

  // First traverse the first two special positions.
  while (count < limit && offset < kSpecialHitsSize) {
    // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
    // kNumSpecialData because of the check above.
    Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit))
                  .ValueOrDie();
    val = hit.value();
    if (out != nullptr) {
      out->push_back(hit);
    }
    offset += sizeof(Hit);
    count++;
  }

  // If special position 1 was set then we need to skip padding.
  if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) {
    offset = GetPadEnd(posting_list_used, offset);
  }

  // Walk the compressed region: first hit is uncompressed, the rest are
  // varint deltas added onto the running 'val'.
  while (count < limit && offset < posting_list_used->size_in_bytes()) {
    if (val == Hit::kInvalidValue) {
      // First hit is in compressed area. Put that in val.
      memcpy(&val, posting_list_used->posting_list_buffer() + offset,
             sizeof(Hit::Value));
      offset += sizeof(Hit::Value);
    } else {
      // Now we have delta encoded subsequent hits. Decode and push.
      uint64_t delta;
      offset += VarInt::Decode(
          posting_list_used->posting_list_buffer() + offset, &delta);
      val += delta;
    }
    Hit hit(val);
    libtextclassifier3::Status status =
        ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
    if (!status.ok()) {
      // This posting list has been corrupted somehow. The first hit of the
      // posting list claims to have a term frequency, but there's no more room
      // in the posting list for that term frequency to exist. Return an empty
      // vector and zero to indicate no hits retrieved.
      if (out != nullptr) {
        out->clear();
      }
      return absl_ports::InternalError("Posting list has been corrupted!");
    }
    if (out != nullptr) {
      out->push_back(hit);
    }
    count++;
  }

  if (pop) {
    PostingListUsed* mutable_posting_list_used =
        const_cast<PostingListUsed*>(posting_list_used);
    // Modify the posting list so that we pop all hits actually
    // traversed.
    if (offset >= kSpecialHitsSize &&
        offset < posting_list_used->size_in_bytes()) {
      // In the compressed area. Pop and reconstruct. offset/val is
      // the last traversed hit, which we must discard. So move one
      // more forward.
      uint64_t delta;
      offset += VarInt::Decode(
          posting_list_used->posting_list_buffer() + offset, &delta);
      val += delta;

      // Now val is the first hit of the new posting list.
      if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
        // val fits in compressed area. Simply copy.
        offset -= sizeof(Hit::Value);
        memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val,
               sizeof(Hit::Value));
      } else {
        // val won't fit in compressed area. Also see if there is a
        // term_frequency.
        Hit hit(val);
        libtextclassifier3::Status status =
            ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
        if (!status.ok()) {
          // This posting list has been corrupted somehow. The first hit of
          // the posting list claims to have a term frequency, but there's no
          // more room in the posting list for that term frequency to exist.
          // Return an empty vector and zero to indicate no hits retrieved. Do
          // not pop anything.
          if (out != nullptr) {
            out->clear();
          }
          return absl_ports::InternalError("Posting list has been corrupted!");
        }
        // Okay to ignore the return value here because 1 < kNumSpecialData.
        SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit);

        // Prepend pad. Safe to ignore the return value of PadToEnd because
        // offset must be less than posting_list_used->size_in_bytes() thanks to
        // the if above.
        PadToEnd(mutable_posting_list_used,
                 /*start=*/kSpecialHitsSize,
                 /*end=*/offset);
        offset = sizeof(Hit);
      }
    }
    // offset is guaranteed to be valid so ignoring the return value of
    // set_start_byte_offset is safe. It falls into one of four scenarios:
    // Scenario 1: the above if was false because offset is not <
    //             posting_list_used->size_in_bytes()
    //   In this case, offset must be == posting_list_used->size_in_bytes()
    //   because we reached offset by unwinding hits on the posting list.
    // Scenario 2: offset is < kSpecialHitSize
    //   In this case, offset is guaranteed to be either 0 or sizeof(Hit)
    //   because offset is incremented by sizeof(Hit) within the first while
    //   loop.
    // Scenario 3: offset is within the compressed region and the new first hit
    //   in the posting list (the value that 'val' holds) will fit as an
    //   uncompressed hit in the compressed region. The resulting offset from
    //   decompressing val must be >= kSpecialHitSize because otherwise we'd be
    //   in Scenario 4
    // Scenario 4: offset is within the compressed region, but the new first hit
    //   in the posting list is too large to fit as an uncompressed hit in the
    //   in the compressed region. Therefore, it must be stored in a special hit
    //   and offset will be sizeof(Hit).
    SetStartByteOffset(mutable_posting_list_used, offset);
  }

  return libtextclassifier3::Status::OK;
}
563 
GetSpecialHit(const PostingListUsed * posting_list_used,uint32_t index) const564 libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit(
565     const PostingListUsed* posting_list_used, uint32_t index) const {
566   static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
567   if (index >= kNumSpecialData || index < 0) {
568     return absl_ports::InvalidArgumentError(
569         "Special hits only exist at indices 0 and 1");
570   }
571   Hit val;
572   memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val),
573          sizeof(val));
574   return val;
575 }
576 
SetSpecialHit(PostingListUsed * posting_list_used,uint32_t index,const Hit & val) const577 bool PostingListHitSerializer::SetSpecialHit(PostingListUsed* posting_list_used,
578                                              uint32_t index,
579                                              const Hit& val) const {
580   if (index >= kNumSpecialData || index < 0) {
581     ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
582     return false;
583   }
584   memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val,
585          sizeof(val));
586   return true;
587 }
588 
IsPostingListValid(const PostingListUsed * posting_list_used) const589 bool PostingListHitSerializer::IsPostingListValid(
590     const PostingListUsed* posting_list_used) const {
591   if (IsAlmostFull(posting_list_used)) {
592     // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
593     // know that 1 < kNumSpecialData.
594     if (!GetSpecialHit(posting_list_used, /*index=*/1)
595              .ValueOrDie()
596              .is_valid()) {
597       ICING_LOG(ERROR)
598           << "Both special hits cannot be invalid at the same time.";
599       return false;
600     }
601   } else if (!IsFull(posting_list_used)) {
602     // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
603     // safe because we know that 0 < kNumSpecialData.
604     if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() >
605             posting_list_used->size_in_bytes() ||
606         GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() <
607             kSpecialHitsSize) {
608       ICING_LOG(ERROR)
609           << "Hit: "
610           << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value()
611           << " size: " << posting_list_used->size_in_bytes()
612           << " sp size: " << kSpecialHitsSize;
613       return false;
614     }
615   }
616   return true;
617 }
618 
GetStartByteOffset(const PostingListUsed * posting_list_used) const619 uint32_t PostingListHitSerializer::GetStartByteOffset(
620     const PostingListUsed* posting_list_used) const {
621   if (IsFull(posting_list_used)) {
622     return 0;
623   } else if (IsAlmostFull(posting_list_used)) {
624     return sizeof(Hit);
625   } else {
626     // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
627     // kNumSpecialData.
628     return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value();
629   }
630 }
631 
SetStartByteOffset(PostingListUsed * posting_list_used,uint32_t offset) const632 bool PostingListHitSerializer::SetStartByteOffset(
633     PostingListUsed* posting_list_used, uint32_t offset) const {
634   if (offset > posting_list_used->size_in_bytes()) {
635     ICING_LOG(ERROR) << "offset cannot be a value greater than size "
636                      << posting_list_used->size_in_bytes() << ". offset is "
637                      << offset << ".";
638     return false;
639   }
640   if (offset < kSpecialHitsSize && offset > sizeof(Hit)) {
641     ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
642                      << ", " << kSpecialHitsSize << "). offset is " << offset
643                      << ".";
644     return false;
645   }
646   if (offset < sizeof(Hit) && offset != 0) {
647     ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
648                      << "). offset is " << offset << ".";
649     return false;
650   }
651   if (offset >= kSpecialHitsSize) {
652     // not_full state. Safe to ignore the return value because 0 and 1 are both
653     // < kNumSpecialData.
654     SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset));
655     SetSpecialHit(posting_list_used, /*index=*/1, Hit());
656   } else if (offset == sizeof(Hit)) {
657     // almost_full state. Safe to ignore the return value because 1 is both <
658     // kNumSpecialData.
659     SetSpecialHit(posting_list_used, /*index=*/0, Hit());
660   }
661   // Nothing to do for the FULL state - the offset isn't actually stored
662   // anywhere and both special hits hold valid hits.
663   return true;
664 }
665 
666 libtextclassifier3::StatusOr<uint32_t>
PrependHitUncompressed(PostingListUsed * posting_list_used,const Hit & hit,uint32_t offset) const667 PostingListHitSerializer::PrependHitUncompressed(
668     PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
669   if (hit.has_term_frequency()) {
670     if (offset < kSpecialHitsSize + sizeof(Hit)) {
671       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
672           "Not enough room to prepend Hit at offset %d.", offset));
673     }
674     offset -= sizeof(Hit);
675     memcpy(posting_list_used->posting_list_buffer() + offset, &hit,
676            sizeof(Hit));
677   } else {
678     if (offset < kSpecialHitsSize + sizeof(Hit::Value)) {
679       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
680           "Not enough room to prepend Hit::Value at offset %d.", offset));
681     }
682     offset -= sizeof(Hit::Value);
683     Hit::Value val = hit.value();
684     memcpy(posting_list_used->posting_list_buffer() + offset, &val,
685            sizeof(Hit::Value));
686   }
687   return offset;
688 }
689 
690 libtextclassifier3::Status
ConsumeTermFrequencyIfPresent(const PostingListUsed * posting_list_used,Hit * hit,uint32_t * offset) const691 PostingListHitSerializer::ConsumeTermFrequencyIfPresent(
692     const PostingListUsed* posting_list_used, Hit* hit,
693     uint32_t* offset) const {
694   if (!hit->has_term_frequency()) {
695     // No term frequency to consume. Everything is fine.
696     return libtextclassifier3::Status::OK;
697   }
698   if (*offset + sizeof(Hit::TermFrequency) >
699       posting_list_used->size_in_bytes()) {
700     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
701         "offset %d must not point past the end of the posting list of size %d.",
702         *offset, posting_list_used->size_in_bytes()));
703   }
704   Hit::TermFrequency term_frequency;
705   memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
706          sizeof(Hit::TermFrequency));
707   *hit = Hit(hit->value(), term_frequency);
708   *offset += sizeof(Hit::TermFrequency);
709   return libtextclassifier3::Status::OK;
710 }
711 
712 }  // namespace lib
713 }  // namespace icing
714