/**************************************************************************
 *
 * Copyright 2008 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#ifndef BITSCAN_H
#define BITSCAN_H

#include <assert.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#if defined(_MSC_VER)
#include <intrin.h>
#endif

#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif

#include "macros.h"

#ifdef __cplusplus
extern "C" {
#endif


/**
 * Find first bit set in a word. The least significant bit is 1.
 * Return 0 if no bits are set.
 */
#ifdef HAVE___BUILTIN_FFS
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
static inline
int ffs(int i)
{
   unsigned long index;
   if (_BitScanForward(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern
int ffs(int i);
#endif

#ifdef HAVE___BUILTIN_FFSLL
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
   unsigned long index;
   if (_BitScanForward64(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern int
ffsll(long long int val);
#endif
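
/* Worked examples of the 1-based convention (illustrative):
 *
 *    ffs(0x00000000) == 0      no bits set
 *    ffs(0x00000001) == 1      bit 0 is the first set bit
 *    ffs(0x00000018) == 4      lowest set bit of 0b11000 is bit 3
 *    ffsll(1ull << 40) == 41
 */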


/* Destructively loop over all of the bits in a mask as in:
 *
 * while (mymask) {
 *    int i = u_bit_scan(&mymask);
 *    ... process element i
 * }
 *
 */
static inline int
u_bit_scan(unsigned *mask)
{
   const int i = ffs(*mask) - 1;
   *mask ^= (1u << i);
   return i;
}

#define u_foreach_bit(b, dword) \
   for (uint32_t __dword = (dword), b; \
        ((b) = ffs(__dword) - 1, __dword); \
        __dword &= ~(1u << (b)))
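
/* Example (an illustrative sketch): visit the set bits of 0xa in
 * ascending order. The macro iterates over a local copy, so the
 * expression passed as `dword` is evaluated once and not modified:
 *
 *    uint32_t mask = 0xa;
 *    u_foreach_bit(b, mask) {
 *       printf("bit %u is set\n", b);   // prints 1, then 3
 *    }
 */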

static inline int
u_bit_scan64(uint64_t *mask)
{
   const int i = ffsll(*mask) - 1;
   *mask ^= (((uint64_t)1) << i);
   return i;
}

#define u_foreach_bit64(b, dword) \
   for (uint64_t __dword = (dword), b; \
        ((b) = ffsll(__dword) - 1, __dword); \
        __dword &= ~(1ull << (b)))

/* Determine if a uint32_t value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero(uint32_t v)
{
   return IS_POT(v);
}
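
/* Illustrative values: util_is_power_of_two_or_zero(0) == true,
 * (1) == true, (4) == true, (6) == false. IS_POT() (from macros.h)
 * is the classic v & (v - 1) test: clearing the lowest set bit
 * yields 0 exactly for powers of two and for zero.
 */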

/* Determine if a uint64_t value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
   return IS_POT(v);
}

/* Determine if a uint32_t value is a power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 */
static inline bool
util_is_power_of_two_nonzero(uint32_t v)
{
   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT. The latter
    * indicates the existence of the __builtin_popcount function. The former
    * indicates that _mm_popcnt_u32 exists and is a native instruction.
    *
    * The other alternative is to use SSE 4.2 compile-time flags. This has
    * two drawbacks. First, there is currently no build infrastructure for
    * SSE 4.2 (only 4.1), so that would have to be added. Second, some AMD
    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
    */
#ifdef __POPCNT__
   return _mm_popcnt_u32(v) == 1;
#else
   return v != 0 && IS_POT(v);
#endif
}

/* Determine if a uint64_t value is a power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 */
static inline bool
util_is_power_of_two_nonzero64(uint64_t v)
{
   return v != 0 && IS_POT(v);
}

/* For looping over a bitmask when you want to loop over consecutive bits
 * manually, for example:
 *
 * while (mask) {
 *    int start, count, i;
 *
 *    u_bit_scan_consecutive_range(&mask, &start, &count);
 *
 *    for (i = 0; i < count; i++)
 *       ... process element (start+i)
 * }
 */
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
   if (*mask == 0xffffffff) {
      *start = 0;
      *count = 32;
      *mask = 0;
      return;
   }
   *start = ffs(*mask) - 1;
   *count = ffs(~(*mask >> *start)) - 1;
   *mask &= ~(((1u << *count) - 1) << *start);
}
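
/* Worked example (illustrative): for *mask == 0x66 (0b1100110), the
 * first call yields *start == 1 and *count == 2, clears that run, and
 * leaves *mask == 0x60; the second call yields *start == 5, *count == 2
 * and leaves *mask == 0, ending the loop. The all-ones case is handled
 * up front because ffs(~(*mask >> *start)) would otherwise return 0.
 */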

static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
   if (*mask == ~0ull) {
      *start = 0;
      *count = 64;
      *mask = 0;
      return;
   }
   *start = ffsll(*mask) - 1;
   *count = ffsll(~(*mask >> *start)) - 1;
   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
}


/**
 * Find last bit set in a word. The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffs() in the reverse direction.
 */
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
   return u == 0 ? 0 : 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}
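
/* Illustrative values: util_last_bit(0) == 0, util_last_bit(1) == 1,
 * util_last_bit(0x10) == 5, util_last_bit(0x80000000) == 32. Note that
 * util_last_bit(x) also equals the number of bits needed to represent x.
 */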

/**
 * Find last bit set in a word. The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffsll() in the reverse direction.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   return u == 0 ? 0 : 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse64(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}

/**
 * Find last bit in a word that does not match the sign bit. The least
 * significant bit is 1.
 * Return 0 if no bits are set.
 */
static inline unsigned
util_last_bit_signed(int i)
{
   if (i >= 0)
      return util_last_bit(i);
   else
      return util_last_bit(~(unsigned)i);
}
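
/* Illustrative values: util_last_bit_signed(5) == 3 (0b101),
 * util_last_bit_signed(-1) == 0 (every bit matches the sign bit),
 * util_last_bit_signed(-2) == 1 (only bit 0 differs).
 */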

/* Returns a bitfield in which the first count bits starting at start are
 * set.
 */
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
   assert(start + count <= 32);
   if (count == 32)
      return ~0;
   return ((1u << count) - 1) << start;
}
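
/* Example: u_bit_consecutive(2, 3) == 0x1c (0b11100). The count == 32
 * case is special-cased because shifting a 32-bit value by 32 is
 * undefined behavior in C.
 */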

static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
   assert(start + count <= 64);
   if (count == 64)
      return ~(uint64_t)0;
   return (((uint64_t)1 << count) - 1) << start;
}

/**
 * Return number of bits set in n.
 */
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
   return __builtin_popcount(n);
#else
   /* K&R classic bitcount.
    *
    * For each iteration, clear the LSB from the bitfield.
    * Requires only one iteration per set bit, instead of
    * one iteration per bit less than highest set bit.
    */
   unsigned bits;
   for (bits = 0; n; bits++) {
      n &= n - 1;
   }
   return bits;
#endif
}
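
/* Worked trace of the fallback (illustrative): for n == 0b1100,
 * n &= n - 1 clears bit 2 (leaving 0b1000), then bit 3 (leaving 0),
 * so the loop runs exactly twice and returns 2.
 */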

/**
 * Return the number of bits set in n using the native popcnt instruction.
 * The caller is responsible for ensuring that popcnt is supported by the CPU.
 *
 * gcc will not emit the instruction unless -mpopcnt (or an -march= that
 * implies it) is enabled, hence the inline assembly.
 */
static inline unsigned
util_popcnt_inline_asm(unsigned n)
{
#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM)
   uint32_t out;
   __asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n));
   return out;
#else
   /* We should never get here by accident, but I'm sure it'll happen. */
   return util_bitcount(n);
#endif
}

static inline unsigned
util_bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
   return __builtin_popcountll(n);
#else
   return util_bitcount(n) + util_bitcount(n >> 32);
#endif
}
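
/* The fallback counts the two 32-bit halves separately; e.g. for
 * n == 0xf0000000fULL it returns 4 + 4 == 8 (illustrative).
 */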

/**
 * Widens the given bit mask by a multiplier, meaning that it will
 * replicate each bit by that amount.
 *
 * For example:
 * 0b101 widened by 2 will become: 0b110011
 *
 * This is typically used in shader I/O to transform a 64-bit
 * writemask to a 32-bit writemask.
 */
static inline uint32_t
util_widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t new_mask = 0;
   u_foreach_bit(i, mask)
      new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   return new_mask;
}

#ifdef __cplusplus
}

/* util_bitcount has large measurable overhead (~2%), so it's recommended to
 * use the POPCNT instruction via inline assembly if the CPU supports it.
 */
enum util_popcnt {
   POPCNT_NO,
   POPCNT_YES,
};

/* Convenience function to select popcnt through a C++ template argument.
 * This should be used as part of larger functions that are optimized
 * as a whole.
 */
template<util_popcnt POPCNT> inline unsigned
util_bitcount_fast(unsigned n)
{
   if (POPCNT == POPCNT_YES)
      return util_popcnt_inline_asm(n);
   else
      return util_bitcount(n);
}
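
/* Usage sketch (illustrative; `cpu_has_popcnt` is a hypothetical
 * runtime-detection predicate supplied by the caller):
 *
 *    template<util_popcnt POPCNT>
 *    static unsigned count_all(const unsigned *masks, unsigned n)
 *    {
 *       unsigned total = 0;
 *       for (unsigned i = 0; i < n; i++)
 *          total += util_bitcount_fast<POPCNT>(masks[i]);
 *       return total;
 *    }
 *
 *    unsigned total = cpu_has_popcnt ? count_all<POPCNT_YES>(masks, n)
 *                                    : count_all<POPCNT_NO>(masks, n);
 */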

#endif /* __cplusplus */

#endif /* BITSCAN_H */