// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/numeric/posting-list-integer-index-serializer.h"

#include <cstdint>
#include <cstring>
#include <limits>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/posting_list/posting-list-used.h"
#include "icing/index/numeric/integer-index-data.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

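// Overview (derived from the state handling in this file): the posting list
// buffer reserves two "special data" slots (indices 0 and 1) at the front;
// the remaining bytes hold prepended IntegerIndexData. The posting list is in
// one of three states, distinguished by the data start byte offset:
// - NOT_FULL:    offset >= kSpecialDataSize. Special data 0 stores the
//                offset, special data 1 is invalid, and all data live in the
//                regular region.
// - ALMOST_FULL: offset == sizeof(IntegerIndexData). Special data 0 is
//                invalid and special data 1 holds the most recently
//                prepended data.
// - FULL:        offset == 0. Both special data slots hold actual data.
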
uint32_t PostingListIntegerIndexSerializer::GetBytesUsed(
    const PostingListUsed* posting_list_used) const {
  // The special data will be included if they represent actual data. If they
  // represent the data start offset or the invalid data sentinel, they are not
  // included.
  return posting_list_used->size_in_bytes() -
         GetStartByteOffset(posting_list_used);
}

uint32_t PostingListIntegerIndexSerializer::GetMinPostingListSizeToFit(
    const PostingListUsed* posting_list_used) const {
  if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
    // If in either the FULL state or ALMOST_FULL state, this posting list *is*
    // the minimum size posting list that can fit these data. So just return
    // the size of the posting list.
    return posting_list_used->size_in_bytes();
  }

  // In NOT_FULL state, BytesUsed contains no special data. The minimum sized
  // posting list that would be guaranteed to fit these data would be
  // ALMOST_FULL, with kInvalidData in special data 0, the uncompressed data in
  // special data 1 and the n compressed data in the compressed region.
  // BytesUsed contains one uncompressed data and n compressed data. Therefore,
  // fitting these data into a posting list would require BytesUsed plus one
  // extra data.
  return GetBytesUsed(posting_list_used) + GetDataTypeBytes();
}

void PostingListIntegerIndexSerializer::Clear(
    PostingListUsed* posting_list_used) const {
  // Safe to ignore return value because posting_list_used->size_in_bytes() is
  // a valid argument.
  SetStartByteOffset(posting_list_used,
                     /*offset=*/posting_list_used->size_in_bytes());
}

libtextclassifier3::Status PostingListIntegerIndexSerializer::MoveFrom(
    PostingListUsed* dst, PostingListUsed* src) const {
  ICING_RETURN_ERROR_IF_NULL(dst);
  ICING_RETURN_ERROR_IF_NULL(src);
  if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "src MinPostingListSizeToFit %d must not be larger than dst size %d.",
        GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
  }

  if (!IsPostingListValid(dst)) {
    return absl_ports::FailedPreconditionError(
        "Dst posting list is in an invalid state and can't be used!");
  }
  if (!IsPostingListValid(src)) {
    return absl_ports::InvalidArgumentError(
        "Cannot MoveFrom an invalid src posting list!");
  }

  // Pop just enough data that all of src's compressed data fit in
  // dst posting_list's compressed area. Then we can memcpy that area.
  std::vector<IntegerIndexData> data_arr;
  while (IsFull(src) || IsAlmostFull(src) ||
         (dst->size_in_bytes() - kSpecialDataSize < GetBytesUsed(src))) {
    if (!GetDataInternal(src, /*limit=*/1, /*pop=*/true, &data_arr).ok()) {
      return absl_ports::AbortedError(
          "Unable to retrieve data from src posting list.");
    }
  }

  // memcpy the area and set up start byte offset.
  Clear(dst);
  memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
         src->posting_list_buffer() + GetStartByteOffset(src),
         GetBytesUsed(src));
  // Because we popped all data from src outside of the compressed area and we
  // guaranteed that GetBytesUsed(src) is no greater than dst->size_in_bytes()
  // - kSpecialDataSize, this is guaranteed to be a valid byte offset for the
  // NOT_FULL state, so ignoring the return value is safe.
  SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));

  // Put back remaining data.
  for (auto riter = data_arr.rbegin(); riter != data_arr.rend(); ++riter) {
    // PrependData may return:
    // - INVALID_ARGUMENT: if data is invalid or not less than the previous
    //   data
    // - RESOURCE_EXHAUSTED
    // RESOURCE_EXHAUSTED should be impossible because we've already assured
    // that there is enough room above.
    ICING_RETURN_IF_ERROR(PrependData(dst, *riter));
  }

  Clear(src);
  return libtextclassifier3::Status::OK;
}

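// Prepends data to a posting list that is in the ALMOST_FULL state: the new
// data goes into special data 0, which transitions the posting list to the
// FULL state. Returns INVALID_ARGUMENT if data is greater than the most
// recently prepended data.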
libtextclassifier3::Status
PostingListIntegerIndexSerializer::PrependDataToAlmostFull(
    PostingListUsed* posting_list_used, const IntegerIndexData& data) const {
  SpecialDataType special_data =
      GetSpecialData(posting_list_used, /*index=*/1);
  if (special_data.data().basic_hit() < data.basic_hit()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "BasicHit %d being prepended must not be greater than the most "
        "recent BasicHit %d",
        data.basic_hit().value(), special_data.data().basic_hit().value()));
  }

  // TODO(b/259743562): [Optimization 2] compression
  // Without compression, prepending a new data into an ALMOST_FULL posting
  // list changes the posting list to the FULL state. Therefore, set special
  // data 0 directly.
  SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(data));
  return libtextclassifier3::Status::OK;
}

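// Prepends the first data to an empty posting list. If the posting list is
// only large enough to hold the special data, the data is stored in special
// data 1 (ALMOST_FULL); otherwise it is written uncompressed at the end of
// the regular region (NOT_FULL).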
void PostingListIntegerIndexSerializer::PrependDataToEmpty(
    PostingListUsed* posting_list_used, const IntegerIndexData& data) const {
  // First data to be added. Just add verbatim, no compression.
  if (posting_list_used->size_in_bytes() == kSpecialDataSize) {
    // First data will be stored at special data 1.
    // Safe to ignore the return value because 1 < kNumSpecialData.
    SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
    // Safe to ignore the return value because sizeof(IntegerIndexData) is a
    // valid argument.
    SetStartByteOffset(posting_list_used,
                       /*offset=*/sizeof(IntegerIndexData));
  } else {
    // Since this is the first data, size != kSpecialDataSize and
    // size % sizeof(IntegerIndexData) == 0, we know that there is room to fit
    // 'data' into the compressed region, so ValueOrDie is safe.
    uint32_t offset =
        PrependDataUncompressed(posting_list_used, data,
                                /*offset=*/posting_list_used->size_in_bytes())
            .ValueOrDie();
    // Safe to ignore the return value because PrependDataUncompressed is
    // guaranteed to return a valid offset.
    SetStartByteOffset(posting_list_used, offset);
  }
}

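// Prepends data to a posting list in the NOT_FULL state. offset is the
// current data start byte offset. If the regular region still has room, the
// data is written there uncompressed; otherwise it goes into special data 1
// and the posting list transitions to the ALMOST_FULL state.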
libtextclassifier3::Status
PostingListIntegerIndexSerializer::PrependDataToNotFull(
    PostingListUsed* posting_list_used, const IntegerIndexData& data,
    uint32_t offset) const {
  IntegerIndexData cur;
  memcpy(&cur, posting_list_used->posting_list_buffer() + offset,
         sizeof(IntegerIndexData));
  if (cur.basic_hit() < data.basic_hit()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "BasicHit %d being prepended must not be greater than the most "
        "recent BasicHit %d",
        data.basic_hit().value(), cur.basic_hit().value()));
  }

  // TODO(b/259743562): [Optimization 2] compression
  if (offset >= kSpecialDataSize + sizeof(IntegerIndexData)) {
    offset =
        PrependDataUncompressed(posting_list_used, data, offset).ValueOrDie();
    SetStartByteOffset(posting_list_used, offset);
  } else {
    // The new data must be put in special data 1.
    SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
    // State ALMOST_FULL. Safe to ignore the return value because
    // sizeof(IntegerIndexData) is a valid argument.
    SetStartByteOffset(posting_list_used,
                       /*offset=*/sizeof(IntegerIndexData));
  }
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status PostingListIntegerIndexSerializer::PrependData(
    PostingListUsed* posting_list_used, const IntegerIndexData& data) const {
  static_assert(
      sizeof(BasicHit::Value) <= sizeof(uint64_t),
      "BasicHit::Value cannot be larger than 8 bytes because the delta "
      "must be able to fit in 8 bytes.");

  if (!data.is_valid()) {
    return absl_ports::InvalidArgumentError("Cannot prepend an invalid data!");
  }
  if (!IsPostingListValid(posting_list_used)) {
    return absl_ports::FailedPreconditionError(
        "This PostingListUsed is in an invalid state and can't add any data!");
  }

  if (IsFull(posting_list_used)) {
    // State FULL: no space left.
    return absl_ports::ResourceExhaustedError("No more room for data");
  } else if (IsAlmostFull(posting_list_used)) {
    return PrependDataToAlmostFull(posting_list_used, data);
  } else if (IsEmpty(posting_list_used)) {
    PrependDataToEmpty(posting_list_used, data);
    return libtextclassifier3::Status::OK;
  } else {
    uint32_t offset = GetStartByteOffset(posting_list_used);
    return PrependDataToNotFull(posting_list_used, data, offset);
  }
}

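// Prepends each data in the array, in order, until all are prepended or one
// fails to fit. Returns the number of data successfully prepended. If not
// all data fit and keep_prepended is false, the partially prepended data are
// popped again and 0 is returned. Returns 0 if the posting list is invalid.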
uint32_t PostingListIntegerIndexSerializer::PrependDataArray(
    PostingListUsed* posting_list_used, const IntegerIndexData* array,
    uint32_t num_data, bool keep_prepended) const {
  if (!IsPostingListValid(posting_list_used)) {
    return 0;
  }

  uint32_t i;
  for (i = 0; i < num_data; ++i) {
    if (!PrependData(posting_list_used, array[i]).ok()) {
      break;
    }
  }
  if (i != num_data && !keep_prepended) {
    // Didn't fit. Undo everything and check that we have the same offset as
    // before. PopFrontData guarantees that it will remove all 'i' data so long
    // as there are at least 'i' data in the posting list, which we know there
    // are.
    PopFrontData(posting_list_used, /*num_data=*/i);
    return 0;
  }
  return i;
}

libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
PostingListIntegerIndexSerializer::GetData(
    const PostingListUsed* posting_list_used) const {
  std::vector<IntegerIndexData> data_arr_out;
  ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out));
  return data_arr_out;
}

libtextclassifier3::Status PostingListIntegerIndexSerializer::GetData(
    const PostingListUsed* posting_list_used,
    std::vector<IntegerIndexData>* data_arr_out) const {
  return GetDataInternal(posting_list_used,
                         /*limit=*/std::numeric_limits<uint32_t>::max(),
                         /*pop=*/false, data_arr_out);
}

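// Removes the first (most recently prepended) num_data data from the posting
// list.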
libtextclassifier3::Status PostingListIntegerIndexSerializer::PopFrontData(
    PostingListUsed* posting_list_used, uint32_t num_data) const {
  if (num_data == 1 && IsFull(posting_list_used)) {
    // The PL is in FULL state, which means that we store 2 uncompressed data
    // in the 2 special positions. But FULL state may be reached from 2
    // different states.
    // (1) In ALMOST_FULL state
    // +------------------+-----------------+-----+---------------------------+
    // |Data::Invalid     |1st data         |(pad)|(compressed) data          |
    // |                  |                 |     |                           |
    // +------------------+-----------------+-----+---------------------------+
    // When we prepend another data, we can only put it at special data 0, and
    // thus get a FULL PL
    // +------------------+-----------------+-----+---------------------------+
    // |new 1st data      |original 1st data|(pad)|(compressed) data          |
    // |                  |                 |     |                           |
    // +------------------+-----------------+-----+---------------------------+
    //
    // (2) In NOT_FULL state
    // +------------------+-----------------+-------+---------+---------------+
    // |data-start-offset |Data::Invalid    |(pad)  |1st data |(compressed)   |
    // |                  |                 |       |         |data           |
    // +------------------+-----------------+-------+---------+---------------+
    // When we prepend another data, we can reach any of the 3 following
    // scenarios:
    // (2.1) NOT_FULL
    // If the space of pad and original 1st data can accommodate the new 1st
    // data and the encoded delta value.
    // +------------------+-----------------+-----+--------+------------------+
    // |data-start-offset |Data::Invalid    |(pad)|new     |(compressed) data |
    // |                  |                 |     |1st data|                  |
    // +------------------+-----------------+-----+--------+------------------+
    // (2.2) ALMOST_FULL
    // If the space of pad and original 1st data cannot accommodate the new 1st
    // data and the encoded delta value but can accommodate the encoded delta
    // value only. We can put the new 1st data at special position 1.
    // +------------------+-----------------+---------+-----------------------+
    // |Data::Invalid     |new 1st data     |(pad)    |(compressed) data      |
    // |                  |                 |         |                       |
    // +------------------+-----------------+---------+-----------------------+
    // (2.3) FULL
    // In a very rare case, it cannot even accommodate the encoded delta value
    // alone. Then we can move the original 1st data into special position 1
    // and the new 1st data into special position 0. This may happen because we
    // use the VarInt encoding method, which may make the encoded value longer
    // (about 4/3 times the original).
    // +------------------+-----------------+--------------+------------------+
    // |new 1st data      |original 1st data|(pad)         |(compressed) data |
    // |                  |                 |              |                  |
    // +------------------+-----------------+--------------+------------------+
    //
    // Suppose now the PL is in FULL state. We don't know whether it arrived at
    // this state from NOT_FULL (like (2.3)) or from ALMOST_FULL (like (1)).
    // We'd return to the ALMOST_FULL state like (1) if we simply popped the
    // new 1st data, but we want to make the prepending operation "reversible",
    // so there should be some way to return to NOT_FULL if possible. A simple
    // way to do this is:
    // - Pop 2 data out of the PL, reaching state ALMOST_FULL or NOT_FULL.
    // - Add the second data ("original 1st data") back.
    //
    // Then we can return to the correct original state of (2.1) or (1). This
    // makes our prepending operation reversible.
    std::vector<IntegerIndexData> out;

    // Popping 2 data should never fail because we've just ensured that the
    // posting list is in the FULL state.
    ICING_RETURN_IF_ERROR(
        GetDataInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));

    // PrependData should never fail because:
    // - out[1] is a valid data less than all previous data in the posting
    //   list.
    // - There's no way that the posting list could run out of room because it
    //   previously stored these 2 data.
    PrependData(posting_list_used, out[1]);
  } else if (num_data > 0) {
    return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true,
                           /*out=*/nullptr);
  }
  return libtextclassifier3::Status::OK;
}

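// Appends up to `limit` data from the posting list to `out`, starting with
// the most recently prepended one. `out` may be nullptr. If `pop` is true,
// the traversed data are also removed from the posting list.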
libtextclassifier3::Status PostingListIntegerIndexSerializer::GetDataInternal(
    const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
    std::vector<IntegerIndexData>* out) const {
  // TODO(b/259743562): [Optimization 2] handle compressed data

  uint32_t offset = GetStartByteOffset(posting_list_used);
  uint32_t count = 0;

  // First traverse the first two special positions.
  while (count < limit && offset < kSpecialDataSize) {
    // offset / sizeof(IntegerIndexData) < kNumSpecialData because of the check
    // above.
    SpecialDataType special_data =
        GetSpecialData(posting_list_used,
                       /*index=*/offset / sizeof(IntegerIndexData));
    if (out != nullptr) {
      out->push_back(special_data.data());
    }
    offset += sizeof(IntegerIndexData);
    ++count;
  }

  // - We don't compress the data now.
  // - The posting list size is a multiple of data type bytes.
  // So the offset of the first non-special data is guaranteed to be at
  // kSpecialDataSize if in ALMOST_FULL or FULL state. In fact, we must not
  // apply the padding-skipping logic here while we still store uncompressed
  // data, because in this case 0 bytes are meaningful (e.g. inverted doc id
  // byte = 0).
  // TODO(b/259743562): [Optimization 2] deal with the padding-skipping logic
  // when applying data compression.

  while (count < limit && offset < posting_list_used->size_in_bytes()) {
    IntegerIndexData data;
    memcpy(&data, posting_list_used->posting_list_buffer() + offset,
           sizeof(IntegerIndexData));
    offset += sizeof(IntegerIndexData);
    if (out != nullptr) {
      out->push_back(data);
    }
    ++count;
  }

  if (pop) {
    PostingListUsed* mutable_posting_list_used =
        const_cast<PostingListUsed*>(posting_list_used);
    // Modify the posting list so that we pop all data actually traversed.
    if (offset >= kSpecialDataSize &&
        offset < posting_list_used->size_in_bytes()) {
      memset(
          mutable_posting_list_used->posting_list_buffer() + kSpecialDataSize,
          0, offset - kSpecialDataSize);
    }
    SetStartByteOffset(mutable_posting_list_used, offset);
  }

  return libtextclassifier3::Status::OK;
}

PostingListIntegerIndexSerializer::SpecialDataType
PostingListIntegerIndexSerializer::GetSpecialData(
    const PostingListUsed* posting_list_used, uint32_t index) const {
  // It is ok to temporarily construct a SpecialData with offset = 0 since
  // we're going to overwrite it by memcpy.
  SpecialDataType special_data(0);
  memcpy(&special_data,
         posting_list_used->posting_list_buffer() +
             index * sizeof(SpecialDataType),
         sizeof(SpecialDataType));
  return special_data;
}

void PostingListIntegerIndexSerializer::SetSpecialData(
    PostingListUsed* posting_list_used, uint32_t index,
    const SpecialDataType& special_data) const {
  memcpy(posting_list_used->posting_list_buffer() +
             index * sizeof(SpecialDataType),
         &special_data, sizeof(SpecialDataType));
}

bool PostingListIntegerIndexSerializer::IsPostingListValid(
    const PostingListUsed* posting_list_used) const {
  if (IsAlmostFull(posting_list_used)) {
    // Special data 1 should hold a valid data.
    if (!GetSpecialData(posting_list_used, /*index=*/1).data().is_valid()) {
      ICING_LOG(ERROR)
          << "Both special data cannot be invalid at the same time.";
      return false;
    }
  } else if (!IsFull(posting_list_used)) {
    // NOT_FULL. Special data 0 should hold a valid offset.
    SpecialDataType special_data =
        GetSpecialData(posting_list_used, /*index=*/0);
    if (special_data.data_start_offset() > posting_list_used->size_in_bytes() ||
        special_data.data_start_offset() < kSpecialDataSize) {
      ICING_LOG(ERROR) << "Offset: " << special_data.data_start_offset()
                       << " size: " << posting_list_used->size_in_bytes()
                       << " sp size: " << kSpecialDataSize;
      return false;
    }
  }
  return true;
}

uint32_t PostingListIntegerIndexSerializer::GetStartByteOffset(
    const PostingListUsed* posting_list_used) const {
  if (IsFull(posting_list_used)) {
    return 0;
  } else if (IsAlmostFull(posting_list_used)) {
    return sizeof(IntegerIndexData);
  } else {
    return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset();
  }
}

bool PostingListIntegerIndexSerializer::SetStartByteOffset(
    PostingListUsed* posting_list_used, uint32_t offset) const {
  if (offset > posting_list_used->size_in_bytes()) {
    ICING_LOG(ERROR) << "offset cannot be a value greater than size "
                     << posting_list_used->size_in_bytes() << ". offset is "
                     << offset << ".";
    return false;
  }
  if (offset < kSpecialDataSize && offset > sizeof(IntegerIndexData)) {
    ICING_LOG(ERROR) << "offset cannot be a value between ("
                     << sizeof(IntegerIndexData) << ", " << kSpecialDataSize
                     << "). offset is " << offset << ".";
    return false;
  }
  if (offset < sizeof(IntegerIndexData) && offset != 0) {
    ICING_LOG(ERROR) << "offset cannot be a value between (0, "
                     << sizeof(IntegerIndexData) << "). offset is " << offset
                     << ".";
    return false;
  }

  if (offset >= kSpecialDataSize) {
    // NOT_FULL state.
    SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(offset));
    SetSpecialData(posting_list_used, /*index=*/1,
                   SpecialDataType(IntegerIndexData()));
  } else if (offset == sizeof(IntegerIndexData)) {
    // ALMOST_FULL state.
    SetSpecialData(posting_list_used, /*index=*/0,
                   SpecialDataType(IntegerIndexData()));
  }
  // Nothing to do for the FULL state - the offset isn't actually stored
  // anywhere and both special data hold valid data.
  return true;
}

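// Writes data uncompressed into the bytes immediately preceding offset and
// returns the new (smaller) offset, or INVALID_ARGUMENT if there is not
// enough room between the special data and offset to hold another
// IntegerIndexData.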
libtextclassifier3::StatusOr<uint32_t>
PostingListIntegerIndexSerializer::PrependDataUncompressed(
    PostingListUsed* posting_list_used, const IntegerIndexData& data,
    uint32_t offset) const {
  if (offset < kSpecialDataSize + sizeof(IntegerIndexData)) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "Not enough room to prepend IntegerIndexData at offset %d.", offset));
  }
  offset -= sizeof(IntegerIndexData);
  memcpy(posting_list_used->posting_list_buffer() + offset, &data,
         sizeof(IntegerIndexData));
  return offset;
}

}  // namespace lib
}  // namespace icing