• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
16 #define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
17 
18 #include <cstdint>
19 #include <vector>
20 
21 #include "icing/text_classifier/lib3/utils/base/status.h"
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/file/posting_list/posting-list-common.h"
24 #include "icing/file/posting_list/posting-list-used.h"
25 #include "icing/index/hit/hit.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // A serializer class to serialize hits to PostingListUsed. Layout described in
31 // comments in posting-list-hit-serializer.cc.
32 class PostingListHitSerializer : public PostingListSerializer {
33  public:
34   static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit);
35 
GetDataTypeBytes()36   uint32_t GetDataTypeBytes() const override { return sizeof(Hit); }
37 
GetMinPostingListSize()38   uint32_t GetMinPostingListSize() const override {
39     static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
40     static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
41                   "PostingListIndex must be small enough to fit in a "
42                   "minimum-sized Posting List.");
43 
44     return kMinPostingListSize;
45   }
46 
47   uint32_t GetMinPostingListSizeToFit(
48       const PostingListUsed* posting_list_used) const override;
49 
50   uint32_t GetBytesUsed(
51       const PostingListUsed* posting_list_used) const override;
52 
53   void Clear(PostingListUsed* posting_list_used) const override;
54 
55   libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
56                                       PostingListUsed* src) const override;
57 
58   // Prepend a hit to the posting list.
59   //
60   // RETURNS:
61   //   - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
62   //       previously added hit.
63   //   - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
64   //       list.
65   libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used,
66                                         const Hit& hit) const;
67 
68   // Prepend hits to the posting list. Hits should be sorted in descending order
69   // (as defined by the less than operator for Hit)
70   //
71   // Returns the number of hits that could be prepended to the posting list. If
72   // keep_prepended is true, whatever could be prepended is kept, otherwise the
73   // posting list is left in its original state.
74   template <class T, Hit (*GetHit)(const T&)>
75   uint32_t PrependHitArray(PostingListUsed* posting_list_used, const T* array,
76                            uint32_t num_hits, bool keep_prepended) const;
77 
78   // Retrieves the hits stored in the posting list.
79   //
80   // RETURNS:
81   //   - On success, a vector of hits sorted by the reverse order of prepending.
82   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
83   libtextclassifier3::StatusOr<std::vector<Hit>> GetHits(
84       const PostingListUsed* posting_list_used) const;
85 
86   // Same as GetHits but appends hits to hits_out.
87   //
88   // RETURNS:
89   //   - On success, a vector of hits sorted by the reverse order of prepending.
90   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
91   libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used,
92                                      std::vector<Hit>* hits_out) const;
93 
94   // Undo the last num_hits hits prepended. If num_hits > number of
95   // hits we clear all hits.
96   //
97   // RETURNS:
98   //   - OK on success
99   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
100   libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used,
101                                           uint32_t num_hits) const;
102 
103  private:
104   // Posting list layout formats:
105   //
106   // not_full
107   //
108   // +-----------------+----------------+-------+-----------------+
109   // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
110   // +-----------------+----------------+-------+-----------------+
111   //
112   // almost_full
113   //
114   // +-----------------+----------------+-------+-----------------+
115   // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
116   // +-----------------+----------------+-------+-----------------+
117   //
118   // full()
119   //
120   // +-----------------+----------------+-------+-----------------+
121   // |1st hit          |2nd hit         |(pad)  |(compressed) hits|
122   // +-----------------+----------------+-------+-----------------+
123   //
124   // The first two uncompressed hits also implicitly encode information about
125   // the size of the compressed hits region.
126   //
127   // 1. If the posting list is NOT_FULL, then
128   // posting_list_buffer_[0] contains the byte offset of the start of the
129   // compressed hits - and, thus, the size of the compressed hits region is
130   // size_in_bytes - posting_list_buffer_[0].
131   //
132   // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
133   // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
134   // - 1] and ends at size_in_bytes - 1.
135   //
136   // Hit term frequencies are stored after the hit value, compressed or
137   // uncompressed. For the first two special hits, we always have a
138   // space for the term frequency. For hits in the compressed area, we only have
139   // the term frequency following the hit value of hit.has_term_frequency() is
140   // true. This allows good compression in the common case where hits don't have
141   // a valid term frequency.
142   //
143   // EXAMPLE
144   // Posting list storage. Posting list size: 20 bytes
145   // EMPTY!
146   // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
147   // |     20      |Hit::kInvalidVal|                 0x000                  |
148   // +-------------+----------------+----------------+-----------------------+
149   //
150   // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
151   // NOT FULL!
152   // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
153   // |     16      |Hit::kInvalidVal|      0x000      |       0x07FFF998     |
154   // +-------------+----------------+-----------------+----------------------+
155   //
156   // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4,
157   // TermFrequency=125)
158   // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
159   // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
160   // |      13     |Hit::kInvalidVal|   0x000   | 0x07FFF684| 125  |    788    |
161   // +-------------+----------------+-----------+-----------+------+-----------+
162   //
163   // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
164   // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
165   // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
166   // |      9      |Hit::kInvVal|  0x00  |0x07FFF4D2|   434   | 125  |   788   |
167   // +-------------+------------+--------+----------+---------+------+---------+
168   //
169   // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6,
170   // TermFrequency = 87)
171   // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 196) ALMOST FULL!
172   // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
173   // |Hit::kInvVal|0x07FFF40E,87|  0x000  |    196  |   434   |  125 |   788   |
174   // +-------------+------------+---------+---------+---------+------+---------+
175   //
176   // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
177   // FULL!
178   // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20
179   // -+ | 0x07FFF320  |0x07FFF40E,87|  0x000  |    196   |   434   |  125 | 788
180   // |
181   // +-------------+-------------+---------+----------+---------+------+---------+
182 
183   // Helpers to determine what state the posting list is in.
IsFull(const PostingListUsed * posting_list_used)184   bool IsFull(const PostingListUsed* posting_list_used) const {
185     return GetSpecialHit(posting_list_used, /*index=*/0)
186                .ValueOrDie()
187                .is_valid() &&
188            GetSpecialHit(posting_list_used, /*index=*/1)
189                .ValueOrDie()
190                .is_valid();
191   }
192 
IsAlmostFull(const PostingListUsed * posting_list_used)193   bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
194     return !GetSpecialHit(posting_list_used, /*index=*/0)
195                 .ValueOrDie()
196                 .is_valid();
197   }
198 
IsEmpty(const PostingListUsed * posting_list_used)199   bool IsEmpty(const PostingListUsed* posting_list_used) const {
200     return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() ==
201                posting_list_used->size_in_bytes() &&
202            !GetSpecialHit(posting_list_used, /*index=*/1)
203                 .ValueOrDie()
204                 .is_valid();
205   }
206 
207   // Returns false if both special hits are invalid or if the offset value
208   // stored in the special hit is less than kSpecialHitsSize or greater than
209   // posting_list_used->size_in_bytes(). Returns true, otherwise.
210   bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
211 
212   // Prepend hit to a posting list that is in the ALMOST_FULL state.
213   // RETURNS:
214   //  - OK, if successful
215   //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
216   libtextclassifier3::Status PrependHitToAlmostFull(
217       PostingListUsed* posting_list_used, const Hit& hit) const;
218 
219   // Prepend hit to a posting list that is in the EMPTY state. This will always
220   // succeed because there are no pre-existing hits and no validly constructed
221   // posting list could fail to fit one hit.
222   void PrependHitToEmpty(PostingListUsed* posting_list_used,
223                          const Hit& hit) const;
224 
225   // Prepend hit to a posting list that is in the NOT_FULL state.
226   // RETURNS:
227   //  - OK, if successful
228   //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
229   libtextclassifier3::Status PrependHitToNotFull(
230       PostingListUsed* posting_list_used, const Hit& hit,
231       uint32_t offset) const;
232 
233   // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
234   // a byte offset between kSpecialHitsSize and
235   // posting_list_used->size_in_bytes() (inclusive) (not_full state).
236   uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
237 
238   // Sets the special hits to properly reflect what offset is (see layout
239   // comment for further details).
240   //
241   // Returns false if offset > posting_list_used->size_in_bytes() or offset is
242   // (kSpecialHitsSize, sizeof(Hit)) or offset is (sizeof(Hit), 0). True,
243   // otherwise.
244   bool SetStartByteOffset(PostingListUsed* posting_list_used,
245                           uint32_t offset) const;
246 
247   // Manipulate padded areas. We never store the same hit value twice
248   // so a delta of 0 is a pad byte.
249 
250   // Returns offset of first non-pad byte.
251   uint32_t GetPadEnd(const PostingListUsed* posting_list_used,
252                      uint32_t offset) const;
253 
254   // Fill padding between offset start and offset end with 0s.
255   // Returns false if end > posting_list_used->size_in_bytes(). True,
256   // otherwise.
257   bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start,
258                 uint32_t end) const;
259 
260   // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
261   // hits in the posting list if the posting list contains less than limit
262   // number of hits. out can be NULL.
263   //
264   // NOTE: If called with limit=1, pop=true on a posting list that transitioned
265   // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
266   // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
267   // ALMOST_FULL.
268   //
269   // RETURNS:
270   //   - OK on success
271   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
272   libtextclassifier3::Status GetHitsInternal(
273       const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
274       std::vector<Hit>* out) const;
275 
276   // Retrieves the value stored in the index-th special hit.
277   //
278   // RETURNS:
279   //   - A valid Hit, on success
280   //   - INVALID_ARGUMENT if index is not less than kNumSpecialData
281   libtextclassifier3::StatusOr<Hit> GetSpecialHit(
282       const PostingListUsed* posting_list_used, uint32_t index) const;
283 
284   // Sets the value stored in the index-th special hit to val. If index is not
285   // less than kSpecialHitSize / sizeof(Hit), this has no effect.
286   bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index,
287                      const Hit& val) const;
288 
289   // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
290   // returns the new beginning of the padded region.
291   //
292   // RETURNS:
293   //   - The new beginning of the padded region, if successful.
294   //   - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
295   // kSpecialHitsSize
296   libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
297       PostingListUsed* posting_list_used, const Hit& hit,
298       uint32_t offset) const;
299 
300   // If hit has a term frequency, consumes the term frequency at offset, updates
301   // hit to include the term frequency and updates offset to reflect that the
302   // term frequency has been consumed.
303   //
304   // RETURNS:
305   //   - OK, if successful
306   //   - INVALID_ARGUMENT if hit has a term frequency and offset +
307   //     sizeof(Hit::TermFrequency) >= posting_list_used->size_in_bytes()
308   libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
309       const PostingListUsed* posting_list_used, Hit* hit,
310       uint32_t* offset) const;
311 };
312 
313 // Inlined functions. Implementation details below. Avert eyes!
314 template <class T, Hit (*GetHit)(const T&)>
PrependHitArray(PostingListUsed * posting_list_used,const T * array,uint32_t num_hits,bool keep_prepended)315 uint32_t PostingListHitSerializer::PrependHitArray(
316     PostingListUsed* posting_list_used, const T* array, uint32_t num_hits,
317     bool keep_prepended) const {
318   if (!IsPostingListValid(posting_list_used)) {
319     return 0;
320   }
321 
322   // Prepend hits working backwards from array[num_hits - 1].
323   uint32_t i;
324   for (i = 0; i < num_hits; ++i) {
325     if (!PrependHit(posting_list_used, GetHit(array[num_hits - i - 1])).ok()) {
326       break;
327     }
328   }
329   if (i != num_hits && !keep_prepended) {
330     // Didn't fit. Undo everything and check that we have the same offset as
331     // before. PopFrontHits guarantees that it will remove all 'i' hits so long
332     // as there are at least 'i' hits in the posting list, which we know there
333     // are.
334     PopFrontHits(posting_list_used, /*num_hits=*/i);
335   }
336   return i;
337 }
338 
339 }  // namespace lib
340 }  // namespace icing
341 
342 #endif  // ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
343