• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/trace_processor/containers/bit_vector.h"
18 
19 #include <limits>
20 
21 #include "src/trace_processor/containers/bit_vector_iterators.h"
22 
23 #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
24 #include <immintrin.h>
25 #endif
26 
27 namespace perfetto {
28 namespace trace_processor {
29 namespace {
30 
31 // This function implements the PDEP instruction in x64 as a loop.
32 // See https://www.felixcloutier.com/x86/pdep for details on what PDEP does.
33 //
34 // Unfortunately, as we're emulating this in software, it scales with the number
35 // of set bits in |mask| rather than being a constant time instruction:
36 // therefore, this should be avoided where real instructions are available.
PdepSlow(uint64_t word,uint64_t mask)37 uint64_t PdepSlow(uint64_t word, uint64_t mask) {
38   if (word == 0 || mask == std::numeric_limits<uint64_t>::max())
39     return word;
40 
41   // This algorithm is for calculating PDEP was found to be the fastest "simple"
42   // one among those tested when writing this function.
43   uint64_t result = 0;
44   for (uint64_t bb = 1; mask; bb += bb) {
45     if (word & bb) {
46       // MSVC doesn't like -mask so work around this by doing 0 - mask.
47       result |= mask & (0ull - mask);
48     }
49     mask &= mask - 1;
50   }
51   return result;
52 }
53 
54 // See |PdepSlow| for information on PDEP.
Pdep(uint64_t word,uint64_t mask)55 uint64_t Pdep(uint64_t word, uint64_t mask) {
56 #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
57   base::ignore_result(PdepSlow);
58   return _pdep_u64(word, mask);
59 #else
60   return PdepSlow(word, mask);
61 #endif
62 }
63 
64 }  // namespace
65 
66 BitVector::BitVector() = default;
67 
BitVector(std::initializer_list<bool> init)68 BitVector::BitVector(std::initializer_list<bool> init) {
69   for (bool x : init) {
70     if (x) {
71       AppendTrue();
72     } else {
73       AppendFalse();
74     }
75   }
76 }
77 
BitVector(uint32_t count,bool value)78 BitVector::BitVector(uint32_t count, bool value) {
79   Resize(count, value);
80 }
81 
BitVector(std::vector<uint64_t> words,std::vector<uint32_t> counts,uint32_t size)82 BitVector::BitVector(std::vector<uint64_t> words,
83                      std::vector<uint32_t> counts,
84                      uint32_t size)
85     : size_(size), counts_(std::move(counts)), words_(std::move(words)) {
86   uint32_t words_size = static_cast<uint32_t>(words_.size());
87   if (words_size % Block::kWords != 0)
88     words_.resize(words_.size() + 8 - (words_.size() % 8u));
89 }
90 
Resize(uint32_t new_size,bool filler)91 void BitVector::Resize(uint32_t new_size, bool filler) {
92   uint32_t old_size = size_;
93   if (new_size == old_size)
94     return;
95 
96   // Empty bitvectors should be memory efficient so we don't keep any data
97   // around in the bitvector.
98   if (new_size == 0) {
99     words_.clear();
100     counts_.clear();
101     size_ = 0;
102     return;
103   }
104 
105   // Compute the address of the new last bit in the bitvector.
106   Address last_addr = IndexToAddress(new_size - 1);
107   uint32_t old_blocks_size = static_cast<uint32_t>(counts_.size());
108   uint32_t new_blocks_size = last_addr.block_idx + 1;
109 
110   // Resize the block and count vectors to have the correct number of entries.
111   words_.resize(Block::kWords * new_blocks_size);
112   counts_.resize(new_blocks_size);
113 
114   if (new_size > old_size) {
115     if (filler) {
116       // If the new space should be filled with ones, then set all the bits
117       // between the address of the old size and the new last address.
118       const Address& start = IndexToAddress(old_size);
119       Set(start, last_addr);
120 
121       // We then need to update the counts vector to match the changes we
122       // made to the blocks.
123 
124       // We start by adding the bits we set in the first block to the
125       // cummulative count before the range we changed.
126       Address end_of_block = {start.block_idx,
127                               {Block::kWords - 1, BitWord::kBits - 1}};
128       uint32_t count_in_block_after_end =
129           AddressToIndex(end_of_block) - AddressToIndex(start) + 1;
130       uint32_t set_count = CountSetBits() + count_in_block_after_end;
131 
132       for (uint32_t i = start.block_idx + 1; i <= last_addr.block_idx; ++i) {
133         // Set the count to the cummulative count so far.
134         counts_[i] = set_count;
135 
136         // Add a full block of set bits to the count.
137         set_count += Block::kBits;
138       }
139     } else {
140       // If the newly added bits are false, we just need to update the
141       // counts vector with the current size of the bitvector for all
142       // the newly added blocks.
143       if (new_blocks_size > old_blocks_size) {
144         uint32_t count = CountSetBits();
145         for (uint32_t i = old_blocks_size; i < new_blocks_size; ++i) {
146           counts_[i] = count;
147         }
148       }
149     }
150   } else {
151     // Throw away all the bits after the new last bit. We do this to make
152     // future lookup, append and resize operations not have to worrying about
153     // trailing garbage bits in the last block.
154     BlockFromIndex(last_addr.block_idx).ClearAfter(last_addr.block_offset);
155   }
156 
157   // Actually update the size.
158   size_ = new_size;
159 }
160 
Copy() const161 BitVector BitVector::Copy() const {
162   return BitVector(words_, counts_, size_);
163 }
164 
IterateAllBits() const165 BitVector::AllBitsIterator BitVector::IterateAllBits() const {
166   return AllBitsIterator(this);
167 }
168 
IterateSetBits() const169 BitVector::SetBitsIterator BitVector::IterateSetBits() const {
170   return SetBitsIterator(this);
171 }
172 
Not() const173 BitVector BitVector::Not() const {
174   Builder builder(size());
175 
176   // Append all words from all blocks except the last one.
177   uint32_t full_words = builder.BitsInCompleteWordsUntilFull();
178   for (uint32_t i = 0; i < full_words; ++i) {
179     builder.AppendWord(ConstBitWord(&words_[i]).Not());
180   }
181 
182   // Append bits from the last word.
183   uint32_t bits_from_last_word = builder.BitsUntilFull();
184   ConstBitWord last_word(&words_[full_words]);
185   for (uint32_t i = 0; i < bits_from_last_word; ++i) {
186     builder.Append(!last_word.IsSet(i));
187   }
188 
189   return std::move(builder).Build();
190 }
191 
UpdateSetBits(const BitVector & update)192 void BitVector::UpdateSetBits(const BitVector& update) {
193   if (update.CountSetBits() == 0 || CountSetBits() == 0) {
194     *this = BitVector();
195     return;
196   }
197   PERFETTO_DCHECK(update.size() <= CountSetBits());
198 
199   // Get the start and end ptrs for the current bitvector.
200   // Safe because of the static_assert above.
201   uint64_t* ptr = words_.data();
202   const uint64_t* ptr_end = ptr + WordCount(size());
203 
204   // Get the start and end ptrs for the update bitvector.
205   // Safe because of the static_assert above.
206   const uint64_t* update_ptr = update.words_.data();
207   const uint64_t* update_ptr_end = update_ptr + WordCount(update.size());
208 
209   // |update_unused_bits| contains |unused_bits_count| bits at the bottom
210   // which indicates how the next |unused_bits_count| set bits in |this|
211   // should be changed. This is necessary because word boundaries in |this| will
212   // almost always *not* match the word boundaries in |update|.
213   uint64_t update_unused_bits = 0;
214   uint8_t unused_bits_count = 0;
215 
216   // The basic premise of this loop is, for each word in |this| we find
217   // enough bits from |update| to cover every set bit in the word. We then use
218   // the PDEP x64 instruction (or equivalent instructions/software emulation) to
219   // update the word and store it back in |this|.
220   for (; ptr != ptr_end; ++ptr) {
221     uint64_t current = *ptr;
222 
223     // If the current value is all zeros, there's nothing to update.
224     if (PERFETTO_UNLIKELY(current == 0))
225       continue;
226 
227     uint8_t popcount = static_cast<uint8_t>(PERFETTO_POPCOUNT(current));
228     PERFETTO_DCHECK(popcount >= 1);
229 
230     // Check if we have enough unused bits from the previous iteration - if so,
231     // we don't need to read anything from |update|.
232     uint64_t update_for_current = update_unused_bits;
233     if (unused_bits_count >= popcount) {
234       // We have enough bits so just do the accounting to not reuse these bits
235       // for the future.
236       unused_bits_count -= popcount;
237       update_unused_bits = popcount == 64 ? 0 : update_unused_bits >> popcount;
238     } else {
239       // We don't have enough bits so we need to read the next word of bits from
240       // |current|.
241       uint64_t next_update = update_ptr == update_ptr_end ? 0 : *update_ptr++;
242 
243       // Bitwise or |64 - unused_bits_count| bits from the bottom of
244       // |next_update| to the top of |update_for_current|. Only |popcount| bits
245       // will actually be used by PDEP but masking off the unused bits takes
246       // *more* instructions than not doing anything.
247       update_for_current |= next_update << unused_bits_count;
248 
249       // PDEP will use |popcount| bits from update: this means it will use
250       // |unused_bits_count| from |update_for_current| and |popcount -
251       // unused_bits_count| from |next_update|
252       uint8_t used_next_bits = popcount - unused_bits_count;
253 
254       // Shift off any bits which will be used by current and store the
255       // remainder for use in the next iteration.
256       update_unused_bits =
257           used_next_bits == 64 ? 0 : next_update >> used_next_bits;
258       unused_bits_count = 64 - used_next_bits;
259     }
260 
261     // We should never end up with more than 64 bits available.
262     PERFETTO_CHECK(unused_bits_count <= 64);
263 
264     // PDEP precisely captures the notion of "updating set bits" for a single
265     // word.
266     *ptr = Pdep(update_for_current, current);
267   }
268 
269   // We shouldn't have any non-zero unused bits and we should have consumed the
270   // whole |update| bitvector. Note that we cannot really say anything about
271   // |unused_bits_count| because it's possible for the above algorithm to use
272   // some bits which are "past the end" of |update|; as long as these bits are
273   // zero, it meets the pre-condition of this function.
274   PERFETTO_DCHECK(update_unused_bits == 0);
275   PERFETTO_DCHECK(update_ptr == update_ptr_end);
276 
277   for (uint32_t i = 0; i < counts_.size() - 1; ++i) {
278     counts_[i + 1] = counts_[i] + ConstBlockFromIndex(i).CountSetBits();
279   }
280 
281   // After the loop, we should have precisely the same number of bits
282   // set as |update|.
283   PERFETTO_DCHECK(update.CountSetBits() == CountSetBits());
284 }
285 
IntersectRange(uint32_t range_start,uint32_t range_end) const286 BitVector BitVector::IntersectRange(uint32_t range_start,
287                                     uint32_t range_end) const {
288   uint32_t total_set_bits = CountSetBits();
289   if (total_set_bits == 0 || range_start >= range_end)
290     return BitVector();
291 
292   // We should skip all bits until the index of first set bit bigger than
293   // |range_start|.
294   uint32_t start_idx = std::max(range_start, IndexOfNthSet(0));
295   uint32_t end_idx = std::min(range_end, size());
296 
297   if (start_idx >= end_idx)
298     return BitVector();
299 
300   Builder builder(end_idx);
301 
302   // All bits before start should be empty.
303   builder.Skip(start_idx);
304 
305   uint32_t front_bits = builder.BitsUntilWordBoundaryOrFull();
306   uint32_t cur_index = start_idx;
307   for (uint32_t i = 0; i < front_bits; ++i, ++cur_index) {
308     builder.Append(IsSet(cur_index));
309   }
310 
311   PERFETTO_DCHECK(cur_index == end_idx || cur_index % BitWord::kBits == 0);
312   uint32_t cur_words = cur_index / BitWord::kBits;
313   uint32_t full_words = builder.BitsInCompleteWordsUntilFull() / BitWord::kBits;
314   uint32_t total_full_words = cur_words + full_words;
315   for (; cur_words < total_full_words; ++cur_words) {
316     builder.AppendWord(words_[cur_words]);
317   }
318 
319   uint32_t last_bits = builder.BitsUntilFull();
320   cur_index += full_words * BitWord::kBits;
321   for (uint32_t i = 0; i < last_bits; ++i, ++cur_index) {
322     builder.Append(IsSet(cur_index));
323   }
324 
325   return std::move(builder).Build();
326 }
327 
328 }  // namespace trace_processor
329 }  // namespace perfetto
330