// crate minimums: sse2, x86_64
2 
3 use crate::types::*;
4 use core::arch::x86_64::{__m128i, __m256i};
5 
6 mod sse2;
7 
// Zero-sized marker types used as type-level flags on the machine/vector types in this module.
// Each `Yes*`/`No*` pair encodes whether an optional instruction-set extension is enabled.

/// Marker: the `S3` extension is enabled (used by the `SSSE3`, `SSE41` and `AVX` aliases).
#[derive(Copy, Clone)]
pub struct YesS3;
/// Marker: the `S3` extension is not enabled (used by the baseline `SSE2` alias).
#[derive(Copy, Clone)]
pub struct NoS3;

/// Marker: the `S4` extension is enabled (used by the `SSE41` and `AVX` aliases).
#[derive(Copy, Clone)]
pub struct YesS4;
/// Marker: the `S4` extension is not enabled (used by the `SSE2` and `SSSE3` aliases).
#[derive(Copy, Clone)]
pub struct NoS4;

/// Marker: the `A1` extension is enabled.
// NOTE(review): presumably AVX; not referenced elsewhere in this file — confirm.
#[derive(Copy, Clone)]
pub struct YesA1;
/// Marker: the `A1` extension is not enabled.
#[derive(Copy, Clone)]
pub struct NoA1;

/// Marker: the `A2` extension is enabled.
// NOTE(review): presumably AVX2; not referenced elsewhere in this file — confirm.
#[derive(Copy, Clone)]
pub struct YesA2;
/// Marker: the `A2` extension is not enabled.
#[derive(Copy, Clone)]
pub struct NoA2;

/// Marker: the `NI` extension is enabled.
// NOTE(review): the meaning of "NI" is not established in this file — confirm.
#[derive(Copy, Clone)]
pub struct YesNI;
/// Marker: the `NI` extension is not enabled (used by every machine alias in this file).
#[derive(Copy, Clone)]
pub struct NoNI;
32 
33 use core::marker::PhantomData;
34 
/// Zero-sized machine token whose vector types are all built from 128-bit `sse2` types.
///
/// The `S3`, `S4` and `NI` parameters are type-level flags (the `Yes*`/`No*` markers) selecting
/// which optional extensions the vector implementations may use.
#[derive(Copy, Clone)]
pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
38 where
39     sse2::u128x1_sse2<S3, S4, NI>: Swap64,
40     sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
41     sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
42     sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
43     sse2::u128x1_sse2<S3, S4, NI>: BSwap,
44     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
45     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
46     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
47     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
48     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
49 {
50     type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
51     type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
52     type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
53 
54     type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
55     type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
56     type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
57     type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
58 
59     type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
60     type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
61     type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
62 
63     #[inline(always)]
instance() -> Self64     unsafe fn instance() -> Self {
65         SseMachine(PhantomData)
66     }
67 }
68 
/// Zero-sized machine token that uses AVX2-specific types for its multi-register `u32` vectors.
///
/// `NI` is a type-level flag (one of the `YesNI`/`NoNI` markers).
#[derive(Copy, Clone)]
pub struct Avx2Machine<NI>(PhantomData<NI>);
71 impl<NI: Copy> Machine for Avx2Machine<NI>
72 where
73     sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
74     sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
75     sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
76     sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
77 {
78     type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
79     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
80     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
81 
82     type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
83     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
84     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
85     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
86 
87     type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
88     type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
89     type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
90 
91     #[inline(always)]
instance() -> Self92     unsafe fn instance() -> Self {
93         Avx2Machine(PhantomData)
94     }
95 }
96 
97 pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
98 pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
99 pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
100 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
101 /// to avoid expensive SSE/VEX conflicts.
102 pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
103 pub type AVX2 = Avx2Machine<NoNI>;
104 
105 zerocopy::cryptocorrosion_derive_traits! {
106     #[repr(C)]
107     /// Generic wrapper for unparameterized storage of any of the possible impls.
108     /// Converting into and out of this type should be essentially free, although it may be more
109     /// aligned than a particular impl requires.
110     #[allow(non_camel_case_types)]
111     #[derive(Copy, Clone)]
112     pub union vec128_storage {
113         u32x4: [u32; 4],
114         u64x2: [u64; 2],
115         u128x1: [u128; 1],
116         sse2: __m128i,
117     }
118 }
119 
120 impl Store<vec128_storage> for vec128_storage {
121     #[inline(always)]
unpack(p: vec128_storage) -> Self122     unsafe fn unpack(p: vec128_storage) -> Self {
123         p
124     }
125 }
126 impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
127     #[inline(always)]
from(x: &'a vec128_storage) -> Self128     fn from(x: &'a vec128_storage) -> Self {
129         unsafe { &x.u32x4 }
130     }
131 }
132 impl From<[u32; 4]> for vec128_storage {
133     #[inline(always)]
from(u32x4: [u32; 4]) -> Self134     fn from(u32x4: [u32; 4]) -> Self {
135         vec128_storage { u32x4 }
136     }
137 }
138 impl Default for vec128_storage {
139     #[inline(always)]
default() -> Self140     fn default() -> Self {
141         vec128_storage { u128x1: [0] }
142     }
143 }
144 impl Eq for vec128_storage {}
145 impl PartialEq for vec128_storage {
146     #[inline(always)]
eq(&self, rhs: &Self) -> bool147     fn eq(&self, rhs: &Self) -> bool {
148         unsafe { self.u128x1 == rhs.u128x1 }
149     }
150 }
151 
152 #[allow(non_camel_case_types)]
153 #[derive(Copy, Clone)]
154 pub union vec256_storage {
155     u32x8: [u32; 8],
156     u64x4: [u64; 4],
157     u128x2: [u128; 2],
158     sse2: [vec128_storage; 2],
159     avx: __m256i,
160 }
161 impl From<[u64; 4]> for vec256_storage {
162     #[inline(always)]
from(u64x4: [u64; 4]) -> Self163     fn from(u64x4: [u64; 4]) -> Self {
164         vec256_storage { u64x4 }
165     }
166 }
167 impl Default for vec256_storage {
168     #[inline(always)]
default() -> Self169     fn default() -> Self {
170         vec256_storage { u128x2: [0, 0] }
171     }
172 }
173 impl vec256_storage {
174     #[inline(always)]
new128(xs: [vec128_storage; 2]) -> Self175     pub fn new128(xs: [vec128_storage; 2]) -> Self {
176         Self { sse2: xs }
177     }
178     #[inline(always)]
split128(self) -> [vec128_storage; 2]179     pub fn split128(self) -> [vec128_storage; 2] {
180         unsafe { self.sse2 }
181     }
182 }
183 impl Eq for vec256_storage {}
184 impl PartialEq for vec256_storage {
185     #[inline(always)]
eq(&self, rhs: &Self) -> bool186     fn eq(&self, rhs: &Self) -> bool {
187         unsafe { self.sse2 == rhs.sse2 }
188     }
189 }
190 
191 #[allow(non_camel_case_types)]
192 #[derive(Copy, Clone)]
193 pub union vec512_storage {
194     u32x16: [u32; 16],
195     u64x8: [u64; 8],
196     u128x4: [u128; 4],
197     sse2: [vec128_storage; 4],
198     avx: [vec256_storage; 2],
199 }
200 impl Default for vec512_storage {
201     #[inline(always)]
default() -> Self202     fn default() -> Self {
203         vec512_storage {
204             u128x4: [0, 0, 0, 0],
205         }
206     }
207 }
208 impl vec512_storage {
209     #[inline(always)]
new128(xs: [vec128_storage; 4]) -> Self210     pub fn new128(xs: [vec128_storage; 4]) -> Self {
211         Self { sse2: xs }
212     }
213     #[inline(always)]
split128(self) -> [vec128_storage; 4]214     pub fn split128(self) -> [vec128_storage; 4] {
215         unsafe { self.sse2 }
216     }
217 }
218 impl Eq for vec512_storage {}
219 impl PartialEq for vec512_storage {
220     #[inline(always)]
eq(&self, rhs: &Self) -> bool221     fn eq(&self, rhs: &Self) -> bool {
222         unsafe { self.avx == rhs.avx }
223     }
224 }
225 
// Implements `From<$storage> for $array` by reading the union field `$name` of the storage.
// `$array` must be the exact type of that field.
macro_rules! impl_into {
    ($storage:ident, $array:ty, $name:ident) => {
        impl From<$storage> for $array {
            #[inline(always)]
            fn from(vec: $storage) -> Self {
                // SAFETY: relies on every field of the storage union being plain integer/vector
                // data of the same size, so any stored value is valid when read as `$name`.
                unsafe { vec.$name }
            }
        }
    };
}
236 impl_into!(vec128_storage, [u32; 4], u32x4);
237 impl_into!(vec128_storage, [u64; 2], u64x2);
238 impl_into!(vec128_storage, [u128; 1], u128x1);
239 impl_into!(vec256_storage, [u32; 8], u32x8);
240 impl_into!(vec256_storage, [u64; 4], u64x4);
241 impl_into!(vec256_storage, [u128; 2], u128x2);
242 impl_into!(vec512_storage, [u32; 16], u32x16);
243 impl_into!(vec512_storage, [u64; 8], u64x8);
244 impl_into!(vec512_storage, [u128; 4], u128x4);
245 
/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
///
/// Invocation shape: `dispatch!(mach, MTy, { [vis] fn name(arg: Ty, ...) -> Ret { body } })`,
/// where `mach` names the machine value visible to `body`, `MTy` names its generic type, and
/// the bracketed visibility is optional. A second arm accepts functions without a return type.
#[macro_export]
macro_rules! dispatch {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // With `std`: one copy of the body per feature level, selected at runtime.
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Each variant is only called after the matching runtime feature check, trying the
            // most capable level first.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Without `std` there is no runtime detection: pick the best level enabled at compile
        // time, falling back to SSE2.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type given: forward to the main arm with an explicit `()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
326 
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation shape is the same as for `dispatch!`.
#[macro_export]
macro_rules! dispatch_light128 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // With `std`: only an AVX and an SSE2 copy of the body, selected at runtime.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Without `std` there is no runtime detection: pick the best level enabled at compile
        // time, falling back to SSE2.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type given: forward to the main arm with an explicit `()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
384 
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation shape is the same as for `dispatch!`.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // With `std`: an AVX and an SSE2 copy of the body, selected at runtime.
        // NOTE(review): the documentation above also lists AVX2, but no AVX2 variant is
        // generated on this path — confirm which one is intended.
        //
        // The visibility is emitted without the surrounding brackets of the input syntax;
        // emitting `[pub] fn ...` would not parse as an item.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Without `std` there is no runtime detection: pick the best level enabled at compile
        // time, falling back to SSE2.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type given: forward to the main arm with an explicit `()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
442