/// A trait for describing vector operations used by vectorized searchers.
///
/// The trait is highly constrained to the low level vector operations needed.
/// In general, it was invented mostly to be generic over x86's __m128i and
/// __m256i types. At time of writing, it also supports wasm and aarch64
/// 128-bit vector types.
///
/// # Safety
///
/// All methods are `unsafe` since they are intended to be implemented using
/// vendor intrinsics, which are also `unsafe`. Callers must ensure that the
/// appropriate target features are enabled in the calling function, and that
/// the current CPU supports them. All implementations should avoid marking the
/// routines with #[target_feature] and instead mark them as #[inline(always)]
/// to ensure they get appropriately inlined. (inline(always) cannot be used
/// with target_feature.)
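///
/// As a rough sketch of the intended calling pattern (hypothetical code for
/// illustration only, not a routine defined in this crate; it assumes the
/// `Vector` and `MoveMask` traits are in scope):
///
/// ```ignore
/// use core::arch::x86_64::__m256i;
///
/// // The *caller* carries the target feature and must have already verified
/// // at runtime (e.g. via `is_x86_feature_detected!("avx2")`) that the CPU
/// // supports it. The `Vector` methods themselves stay #[inline(always)].
/// #[target_feature(enable = "avx2")]
/// unsafe fn first_match_in_chunk(chunk: *const u8, needle: u8) -> Option<usize> {
///     let vneedle = __m256i::splat(needle);
///     // SAFETY: the caller must guarantee at least __m256i::BYTES readable
///     // bytes at `chunk`.
///     let haystack = __m256i::load_unaligned(chunk);
///     let mask = haystack.cmpeq(vneedle).movemask();
///     if mask.has_non_zero() { Some(mask.first_offset()) } else { None }
/// }
/// ```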
pub(crate) trait Vector: Copy + core::fmt::Debug {
    /// The number of bytes in the vector. That is, this is the size of the
    /// vector in memory.
    const BYTES: usize;
    /// The bits that must be zero in order for a `*const u8` pointer to be
    /// correctly aligned to read vector values.
    const ALIGN: usize;

    /// The type of the value returned by `Vector::movemask`.
    ///
    /// This supports abstracting over the specific representation used in
    /// order to accommodate different representations in different ISAs.
    type Mask: MoveMask;

    /// Create a vector with 8-bit lanes with the given byte repeated into each
    /// lane.
    unsafe fn splat(byte: u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// must be aligned to the size of the vector.
    ///
    /// # Safety
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data` and that `data` is aligned to a `BYTES` boundary.
    unsafe fn load_aligned(data: *const u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// does not need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data`.
    unsafe fn load_unaligned(data: *const u8) -> Self;

    /// _mm_movemask_epi8 or _mm256_movemask_epi8
    unsafe fn movemask(self) -> Self::Mask;
    /// _mm_cmpeq_epi8 or _mm256_cmpeq_epi8
    unsafe fn cmpeq(self, vector2: Self) -> Self;
    /// _mm_and_si128 or _mm256_and_si256
    unsafe fn and(self, vector2: Self) -> Self;
    /// _mm_or_si128 or _mm256_or_si256
    unsafe fn or(self, vector2: Self) -> Self;
    /// Returns true if and only if `Self::movemask` would return a mask that
    /// contains at least one non-zero bit.
    unsafe fn movemask_will_have_non_zero(self) -> bool {
        self.movemask().has_non_zero()
    }
}

/// A trait that abstracts over a vector-to-scalar operation called
/// "move mask."
///
/// On x86-64, this is `_mm_movemask_epi8` for SSE2 and `_mm256_movemask_epi8`
/// for AVX2. It takes a vector of `u8` lanes and returns a scalar where the
/// `i`th bit is set if and only if the most significant bit in the `i`th lane
/// of the vector is set. The simd128 ISA for wasm32 also supports this
/// exact same operation natively.
///
/// ... But aarch64 doesn't. So we have to fake it with more instructions and
/// a slightly different representation. We could do extra work to unify the
/// representations, but that would require additional cost in the hot path
/// for `memchr` and `packedpair`. So instead, we abstract over the specific
/// representation with this trait and define the operations we actually need.
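///
/// As an illustrative example of the x86-64/wasm semantics, a 16-lane vector
/// with `0xFF` in lanes 1 and 4 (and `0x00` everywhere else) maps to:
///
/// ```text
/// lanes: 00 FF 00 00 FF 00 00 00 00 00 00 00 00 00 00 00
/// mask:   0  1  0  0  1  0  0  0  0  0  0  0  0  0  0  0   (= 0b10010)
/// ```
///
/// so `first_offset` returns `1` and `last_offset` returns `4`.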
pub(crate) trait MoveMask: Copy + core::fmt::Debug {
    /// Return a mask that is all zeros except for the least significant `n`
    /// lanes in a corresponding vector.
    fn all_zeros_except_least_significant(n: usize) -> Self;

    /// Returns true if and only if this mask has a non-zero bit anywhere.
    fn has_non_zero(self) -> bool;

    /// Returns the number of bits set to 1 in this mask.
    fn count_ones(self) -> usize;

    /// Does a bitwise `and` operation between `self` and `other`.
    fn and(self, other: Self) -> Self;

    /// Does a bitwise `or` operation between `self` and `other`.
    fn or(self, other: Self) -> Self;

    /// Returns a mask that is equivalent to `self` but with the least
    /// significant 1-bit set to 0.
    fn clear_least_significant_bit(self) -> Self;

    /// Returns the offset of the first non-zero lane this mask represents.
    fn first_offset(self) -> usize;

    /// Returns the offset of the last non-zero lane this mask represents.
    fn last_offset(self) -> usize;
}

/// This is a "sensible" movemask implementation where each bit represents
/// whether the most significant bit is set in each corresponding lane of a
/// vector. This is used on x86-64 and wasm, but such a mask is more expensive
/// to get on aarch64 so we use something a little different.
///
/// We call this "sensible" because this is what we get using native sse/avx
/// movemask instructions. But neon has no such native equivalent.
#[derive(Clone, Copy, Debug)]
pub(crate) struct SensibleMoveMask(u32);

impl SensibleMoveMask {
    /// Get the mask in a form suitable for computing offsets.
    ///
    /// Basically, this normalizes to little endian. On big endian, this swaps
    /// the bytes.
    #[inline(always)]
    fn get_for_offset(self) -> u32 {
        #[cfg(target_endian = "big")]
        {
            self.0.swap_bytes()
        }
        #[cfg(target_endian = "little")]
        {
            self.0
        }
    }
}

impl MoveMask for SensibleMoveMask {
    #[inline(always)]
    fn all_zeros_except_least_significant(n: usize) -> SensibleMoveMask {
        debug_assert!(n < 32);
        SensibleMoveMask(!((1 << n) - 1))
    }

    #[inline(always)]
    fn has_non_zero(self) -> bool {
        self.0 != 0
    }

    #[inline(always)]
    fn count_ones(self) -> usize {
        self.0.count_ones() as usize
    }

    #[inline(always)]
    fn and(self, other: SensibleMoveMask) -> SensibleMoveMask {
        SensibleMoveMask(self.0 & other.0)
    }

    #[inline(always)]
    fn or(self, other: SensibleMoveMask) -> SensibleMoveMask {
        SensibleMoveMask(self.0 | other.0)
    }

    #[inline(always)]
    fn clear_least_significant_bit(self) -> SensibleMoveMask {
        SensibleMoveMask(self.0 & (self.0 - 1))
    }

    #[inline(always)]
    fn first_offset(self) -> usize {
        // We are dealing with little endian here (and if we aren't, we swap
        // the bytes so we are in practice), where the most significant byte
        // is at a higher address. That means the least significant bit that
        // is set corresponds to the position of our first matching byte.
        // That position corresponds to the number of zeros after the least
        // significant bit.
        self.get_for_offset().trailing_zeros() as usize
    }

    #[inline(always)]
    fn last_offset(self) -> usize {
        // We are dealing with little endian here (and if we aren't, we swap
        // the bytes so we are in practice), where the most significant byte is
        // at a higher address. That means the most significant bit that is set
        // corresponds to the position of our last matching byte. The position
        // from the end of the mask is therefore the number of leading zeros
        // in a 32 bit integer, and the position from the start of the mask is
        // therefore 32 - (leading zeros) - 1.
        32 - self.get_for_offset().leading_zeros() as usize - 1
    }
}
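
// A small sanity sketch of the offset math above (an illustrative addition,
// not part of the upstream test suite). It constructs masks directly from
// integers, so it is only meaningful on little endian targets, where
// `get_for_offset` is the identity.
#[cfg(all(test, target_endian = "little"))]
mod sensible_move_mask_sketch {
    use super::{MoveMask, SensibleMoveMask};

    #[test]
    fn offsets_from_mask_bits() {
        // Bits 1 and 4 set: lanes 1 and 4 of the corresponding vector matched.
        let mask = SensibleMoveMask(0b10010);
        assert!(mask.has_non_zero());
        assert_eq!(mask.count_ones(), 2);
        assert_eq!(mask.first_offset(), 1);
        assert_eq!(mask.last_offset(), 4);
        // Clearing the least significant set bit moves the first offset to
        // the next matching lane.
        assert_eq!(mask.clear_least_significant_bit().first_offset(), 4);
    }
}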

#[cfg(target_arch = "x86_64")]
mod x86sse2 {
    use core::arch::x86_64::*;

    use super::{SensibleMoveMask, Vector};

    impl Vector for __m128i {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = SensibleMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m128i {
            _mm_set1_epi8(byte as i8)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> __m128i {
            _mm_load_si128(data as *const __m128i)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m128i {
            _mm_loadu_si128(data as *const __m128i)
        }

        #[inline(always)]
        unsafe fn movemask(self) -> SensibleMoveMask {
            SensibleMoveMask(_mm_movemask_epi8(self) as u32)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m128i {
            _mm_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m128i {
            _mm_and_si128(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m128i {
            _mm_or_si128(self, vector2)
        }
    }
}

#[cfg(target_arch = "x86_64")]
mod x86avx2 {
    use core::arch::x86_64::*;

    use super::{SensibleMoveMask, Vector};

    impl Vector for __m256i {
        const BYTES: usize = 32;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = SensibleMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m256i {
            _mm256_set1_epi8(byte as i8)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> __m256i {
            _mm256_load_si256(data as *const __m256i)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m256i {
            _mm256_loadu_si256(data as *const __m256i)
        }

        #[inline(always)]
        unsafe fn movemask(self) -> SensibleMoveMask {
            SensibleMoveMask(_mm256_movemask_epi8(self) as u32)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m256i {
            _mm256_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m256i {
            _mm256_and_si256(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m256i {
            _mm256_or_si256(self, vector2)
        }
    }
}

#[cfg(target_arch = "aarch64")]
mod aarch64neon {
    use core::arch::aarch64::*;

    use super::{MoveMask, Vector};

    impl Vector for uint8x16_t {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = NeonMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> uint8x16_t {
            vdupq_n_u8(byte)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> uint8x16_t {
            // I've tried `data.cast::<uint8x16_t>().read()` instead, but
            // couldn't observe any benchmark differences.
            Self::load_unaligned(data)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
            vld1q_u8(data)
        }

        #[inline(always)]
        unsafe fn movemask(self) -> NeonMoveMask {
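            // A descriptive note on the steps below (an added explanation,
            // not upstream commentary): the 16 byte lanes are reinterpreted
            // as 8 u16 lanes, each u16 is shifted right by 4 and narrowed
            // back to u8, so every byte of the comparison mask (each lane is
            // 0x00 or 0xFF) contributes one nibble to a single u64. Masking
            // with 0x8888888888888888 then keeps one bit per lane, spaced 4
            // bits apart. See `NeonMoveMask` below for why this
            // representation is used.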
            let asu16s = vreinterpretq_u16_u8(self);
            let mask = vshrn_n_u16(asu16s, 4);
            let asu64 = vreinterpret_u64_u8(mask);
            let scalar64 = vget_lane_u64(asu64, 0);
            NeonMoveMask(scalar64 & 0x8888888888888888)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
            vceqq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> uint8x16_t {
            vandq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> uint8x16_t {
            vorrq_u8(self, vector2)
        }

        /// This is the only interesting implementation of this routine.
        /// Basically, instead of doing the "shift right narrow" dance, we use
        /// adjacent folding max to determine whether there are any non-zero
        /// bytes in our mask. If there are, *then* we'll do the "shift right
        /// narrow" dance. In benchmarks, this does lead to slightly better
        /// throughput, but the win doesn't appear huge.
        #[inline(always)]
        unsafe fn movemask_will_have_non_zero(self) -> bool {
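            // Note on the trick below (an added explanation, not upstream
            // commentary): `vpmaxq_u8(self, self)` computes the pairwise max
            // of adjacent lanes, and with both operands equal to `self`, its
            // low 64 bits cover all 16 lanes. That half is non-zero if and
            // only if some lane of `self` is non-zero, i.e., at least one
            // lane of the comparison mask matched.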
            let low = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
            vgetq_lane_u64(low, 0) != 0
        }
    }

    /// Neon doesn't have a `movemask` that works like the one in x86-64, so we
    /// wind up using a different method[1]. The different method also produces
    /// a mask, but 4 bits are set in the neon case instead of a single bit set
    /// in the x86-64 case. We do an extra step to zero out 3 of the 4 bits,
    /// but we still wind up with at least 3 zeroes between each set bit. This
    /// generally means that we need to do some division by 4 before extracting
    /// offsets.
    ///
    /// In fact, the existence of this type is the entire reason that we have
    /// the `MoveMask` trait in the first place. This basically lets us keep
    /// the different representations of masks without being forced to unify
    /// them into a single representation, which could result in extra and
    /// unnecessary work.
    ///
    /// [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
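    ///
    /// As a concrete (illustrative) restatement of the comment in
    /// `first_offset` below: a match in lane 2 sets bit `2 * 4 + 3 = 11` of
    /// this mask, and `11 >> 2 == 2` recovers the lane index.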
    #[derive(Clone, Copy, Debug)]
    pub(crate) struct NeonMoveMask(u64);

    impl NeonMoveMask {
        /// Get the mask in a form suitable for computing offsets.
        ///
        /// Basically, this normalizes to little endian. On big endian, this
        /// swaps the bytes.
        #[inline(always)]
        fn get_for_offset(self) -> u64 {
            #[cfg(target_endian = "big")]
            {
                self.0.swap_bytes()
            }
            #[cfg(target_endian = "little")]
            {
                self.0
            }
        }
    }

    impl MoveMask for NeonMoveMask {
        #[inline(always)]
        fn all_zeros_except_least_significant(n: usize) -> NeonMoveMask {
            debug_assert!(n < 16);
            NeonMoveMask(!(((1 << n) << 2) - 1))
        }

        #[inline(always)]
        fn has_non_zero(self) -> bool {
            self.0 != 0
        }

        #[inline(always)]
        fn count_ones(self) -> usize {
            self.0.count_ones() as usize
        }

        #[inline(always)]
        fn and(self, other: NeonMoveMask) -> NeonMoveMask {
            NeonMoveMask(self.0 & other.0)
        }

        #[inline(always)]
        fn or(self, other: NeonMoveMask) -> NeonMoveMask {
            NeonMoveMask(self.0 | other.0)
        }

        #[inline(always)]
        fn clear_least_significant_bit(self) -> NeonMoveMask {
            NeonMoveMask(self.0 & (self.0 - 1))
        }

        #[inline(always)]
        fn first_offset(self) -> usize {
            // We are dealing with little endian here (and if we aren't,
            // we swap the bytes so we are in practice), where the most
            // significant byte is at a higher address. That means the least
            // significant bit that is set corresponds to the position of our
            // first matching byte. That position corresponds to the number of
            // zeros after the least significant bit.
            //
            // Note that unlike `SensibleMoveMask`, this mask has its bits
            // spread out over 64 bits instead of 16 bits (for a 128 bit
            // vector). Namely, whereas x86-64 will turn
            //
            //   0x00 0xFF 0x00 0x00 0xFF
            //
            // into 10010, our neon approach will turn it into
            //
            //   10000000000010000000
            //
            // And this happens because neon doesn't have a native `movemask`
            // instruction, so we kind of fake it[1]. Thus, we divide the
            // number of trailing zeros by 4 to get the "real" offset.
            //
            // [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
            (self.get_for_offset().trailing_zeros() >> 2) as usize
        }

        #[inline(always)]
        fn last_offset(self) -> usize {
            // See comment in `first_offset` above. This is basically the same,
            // but coming from the other direction.
            16 - (self.get_for_offset().leading_zeros() >> 2) as usize - 1
        }
    }
}

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
mod wasm_simd128 {
    use core::arch::wasm32::*;

    use super::{SensibleMoveMask, Vector};

    impl Vector for v128 {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = SensibleMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> v128 {
            u8x16_splat(byte)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> v128 {
            *data.cast()
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> v128 {
            v128_load(data.cast())
        }

        #[inline(always)]
        unsafe fn movemask(self) -> SensibleMoveMask {
            SensibleMoveMask(u8x16_bitmask(self).into())
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> v128 {
            u8x16_eq(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> v128 {
            v128_and(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> v128 {
            v128_or(self, vector2)
        }
    }
}