1 // This file contains a set of fairly generic utility functions when working
2 // with SIMD vectors.
3 //
4 // SAFETY: All of the routines below are unsafe to call because they assume
5 // the necessary CPU target features in order to use particular vendor
6 // intrinsics. Calling these routines when the underlying CPU does not support
7 // the appropriate target features is NOT safe. Callers must ensure this
8 // themselves.
9 //
10 // Note that it may not look like this safety invariant is being upheld when
11 // these routines are called. Namely, the CPU feature check is typically pretty
12 // far away from when these routines are used. Instead, we rely on the fact
13 // that certain types serve as a guaranteed receipt that pertinent target
14 // features are enabled. For example, the only way TeddySlim3Mask256 can be
15 // constructed is if the AVX2 CPU feature is available. Thus, any code running
16 // inside of TeddySlim3Mask256 can use any of the functions below without any
17 // additional checks: its very existence *is* the check.
18 
19 use std::arch::x86_64::*;
20 
/// Computes `(a << 2) | (b >> 30)`, where the shifts are measured in bytes
/// and operate on the full 256-bit vectors.
///
/// Put differently: `a` is shifted left by two bytes (dropping its two most
/// significant bytes), and the two least significant bytes of the result are
/// filled with the two most significant bytes of `b`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
    // The technique used here was worked out by jneem:
    // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
    //
    // The short version: the AVX2 flavor of PALIGNR does not operate on a
    // 256-bit value as a whole. It is really two independent 128-bit PALIGNR
    // operations, one per lane. To emulate a true 256-bit PALIGNR, we first
    // need to arrange for each lane to see the 16 bytes that logically sit
    // directly "below" it.

    // Build the vector of "lower neighbors". Writing the most significant
    // bytes first, this is `bridge = a[15:0] b[31:16]`. That is, its high
    // lane holds the low 16 bytes of `a` and its low lane holds the high 16
    // bytes of `b`.
    let bridge = _mm256_permute2x128_si256(b, a, 0x21);
    // The per-lane PALIGNR then computes the following, using byte indices,
    // byte shifts and inclusive ranges:
    //
    //   out[15:0]  := ((a[15:0] << 16) | bridge[15:0]) >> 14
    //               = ((a[15:0] << 16) | b[31:16]) >> 14
    //   out[31:16] := ((a[31:16] << 16) | bridge[31:16]) >> 14
    //               = ((a[31:16] << 16) | a[15:0]) >> 14
    //
    // Putting both lanes together gives:
    //
    //   out[31:0]  := a[29:16] a[15:14] a[13:0] b[31:30]
    //
    // which is precisely:
    //
    //   (a << 2) | (b >> 30)
    //
    // If `A` and `B` are instead viewed as strings, with the start of each
    // string living in the least significant bits, then the operation on
    // the strings is:
    //
    //   (A >> 2) | (B << 30)
    //
    // The directions flip because we are on a little-endian target.
    _mm256_alignr_epi8(a, bridge, 14)
}
61 
/// Computes `(a << 1) | (b >> 31)`, where the shifts are measured in bytes
/// and operate on the full 256-bit vectors.
///
/// Put differently: `a` is shifted left by one byte (dropping its most
/// significant byte), and the least significant byte of the result is filled
/// with the most significant byte of `b`.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
    // AVX2's PALIGNR works on each 128-bit lane independently, so every lane
    // must first be paired with the 16 bytes that logically precede it: the
    // high lane of `b` for the low lane of `a`, and the low lane of `a` for
    // the high lane of `a`. This permute builds exactly that vector.
    let bridge = _mm256_permute2x128_si256(b, a, 0x21);
    // Each lane now computes `((a_lane << 16) | bridge_lane) >> 15`.
    _mm256_alignr_epi8(a, bridge, 15)
}
70 
/// Splits a 128-bit vector into its two 64-bit lanes.
///
/// Index `0` of the returned array holds the least significant 64 bits of
/// `a`, and index `1` holds the most significant 64 bits.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSSE3.
#[target_feature(enable = "ssse3")]
pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] {
    // The low lane can be read out directly.
    let low = _mm_cvtsi128_si64(a);
    // Shift the vector right by 8 bytes so that the high lane lands in the
    // low position, and then read it out the same way.
    let high = _mm_cvtsi128_si64(_mm_srli_si128(a, 8));
    [low as u64, high as u64]
}
81 
/// Splits a 256-bit vector into its four 64-bit lanes.
///
/// The returned array is ordered from the least significant lane of `a`
/// (index `0`) to the most significant lane of `a` (index `3`).
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
    // A transmute would produce exactly the same result, but it has been
    // observed to be slower (for reasons that are not well understood), so
    // the lanes are extracted explicitly instead.
    let lower_half = _mm256_extracti128_si256(a, 0);
    let upper_half = _mm256_extracti128_si256(a, 1);

    // Within each 128-bit half, the low lane is read directly while the high
    // lane is first shifted down by 8 bytes.
    let lane0 = _mm_cvtsi128_si64(lower_half);
    let lane1 = _mm_cvtsi128_si64(_mm_srli_si128(lower_half, 8));
    let lane2 = _mm_cvtsi128_si64(upper_half);
    let lane3 = _mm_cvtsi128_si64(_mm_srli_si128(upper_half, 8));
    [lane0 as u64, lane1 as u64, lane2 as u64, lane3 as u64]
}
98 
/// Extracts the low 128 bits of both `a` and `b` as four 64-bit integers.
///
/// Concretely, write a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where every
/// element is a 64-bit integer and a1/b1 are the least significant 64 bits.
/// Then the value returned is `b2 b1 a2 a1`, i.e., the array
/// `[a1, a2, b1, b2]`. The upper halves of `a` and `b` are ignored.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] {
    // Only the low 128 bits of each input participate.
    let a_low = _mm256_castsi256_si128(a);
    let b_low = _mm256_castsi256_si128(b);

    // For each half, read the low lane directly and shift the high lane down
    // by 8 bytes before reading it.
    let a1 = _mm_cvtsi128_si64(a_low);
    let a2 = _mm_cvtsi128_si64(_mm_srli_si128(a_low, 8));
    let b1 = _mm_cvtsi128_si64(b_low);
    let b2 = _mm_cvtsi128_si64(_mm_srli_si128(b_low, 8));
    [a1 as u64, a2 as u64, b1 as u64, b2 as u64]
}
116 
/// Reports whether every bit of the given 128-bit vector is 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSSE3.
#[target_feature(enable = "ssse3")]
pub unsafe fn is_all_zeroes128(a: __m128i) -> bool {
    let zero = _mm_set1_epi8(0);
    // Compare each byte against zero and gather the per-byte results into a
    // bitmask with one bit per byte. The vector is entirely zero precisely
    // when all 16 bits of that mask are set.
    let zero_bytes = _mm_movemask_epi8(_mm_cmpeq_epi8(a, zero)) as u32;
    zero_bytes == 0xFFFF
}
123 
/// Reports whether every bit of the given 256-bit vector is 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn is_all_zeroes256(a: __m256i) -> bool {
    let zero = _mm256_set1_epi8(0);
    // Compare each byte against zero and gather the per-byte results into a
    // bitmask with one bit per byte. The vector is entirely zero precisely
    // when all 32 bits of that mask are set.
    let zero_bytes = _mm256_movemask_epi8(_mm256_cmpeq_epi8(a, zero)) as u32;
    zero_bytes == 0xFFFF_FFFF
}
130 
/// Load a 128-bit vector from `slice` starting at byte offset `at`. The
/// slice does not need to be aligned.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
///
/// # Safety
///
/// In addition to requiring that the current CPU supports SSE2, the caller
/// must guarantee that `at + 16 <= slice.len()`. No bounds check is performed
/// in release builds, so violating this reads past the end of the slice.
/// Builds with debug assertions enabled check this precondition and panic
/// when it is violated.
#[target_feature(enable = "sse2")]
pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i {
    // Phrased with a subtraction so that the check itself cannot overflow.
    debug_assert!(
        slice.len() >= 16 && at <= slice.len() - 16,
        "loadu128 needs 16 readable bytes at offset {}, but slice length is {}",
        at,
        slice.len()
    );
    // SAFETY: The caller guarantees that `at + 16 <= slice.len()`, so the
    // offset pointer and the 16 bytes read through it are within `slice`.
    // The load intrinsic used has no alignment requirement.
    let ptr = slice.as_ptr().add(at);
    _mm_loadu_si128(ptr as *const __m128i)
}
144 
/// Load a 256-bit vector from `slice` starting at byte offset `at`. The
/// slice does not need to be aligned.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
///
/// # Safety
///
/// In addition to requiring that the current CPU supports AVX2, the caller
/// must guarantee that `at + 32 <= slice.len()`. No bounds check is performed
/// in release builds, so violating this reads past the end of the slice.
/// Builds with debug assertions enabled check this precondition and panic
/// when it is violated.
#[target_feature(enable = "avx2")]
pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i {
    // Phrased with a subtraction so that the check itself cannot overflow.
    debug_assert!(
        slice.len() >= 32 && at <= slice.len() - 32,
        "loadu256 needs 32 readable bytes at offset {}, but slice length is {}",
        at,
        slice.len()
    );
    // SAFETY: The caller guarantees that `at + 32 <= slice.len()`, so the
    // offset pointer and the 32 bytes read through it are within `slice`.
    // The load intrinsic used has no alignment requirement.
    let ptr = slice.as_ptr().add(at);
    _mm256_loadu_si256(ptr as *const __m256i)
}
158 
/// Builds a 128-bit vector in which every bit is 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn zeroes128() -> __m128i {
    _mm_setzero_si128()
}
164 
/// Builds a 256-bit vector in which every bit is 0.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn zeroes256() -> __m256i {
    _mm256_setzero_si256()
}
170 
/// Builds a 128-bit vector in which every bit is 1.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn ones128() -> __m128i {
    // As a signed byte, -1 is the bit pattern 0xFF.
    _mm_set1_epi8(-1)
}
176 
/// Builds a 256-bit vector in which every bit is 1.
///
/// # Safety
///
/// The caller must ensure that the current CPU supports AVX2.
#[target_feature(enable = "avx2")]
pub unsafe fn ones256() -> __m256i {
    // As a signed byte, -1 is the bit pattern 0xFF.
    _mm256_set1_epi8(-1)
}
182