1 // crate minimums: sse2, x86_64 2 3 use crate::types::*; 4 use core::arch::x86_64::{__m128i, __m256i}; 5 6 mod sse2; 7 8 #[derive(Copy, Clone)] 9 pub struct YesS3; 10 #[derive(Copy, Clone)] 11 pub struct NoS3; 12 13 #[derive(Copy, Clone)] 14 pub struct YesS4; 15 #[derive(Copy, Clone)] 16 pub struct NoS4; 17 18 #[derive(Copy, Clone)] 19 pub struct YesA1; 20 #[derive(Copy, Clone)] 21 pub struct NoA1; 22 23 #[derive(Copy, Clone)] 24 pub struct YesA2; 25 #[derive(Copy, Clone)] 26 pub struct NoA2; 27 28 #[derive(Copy, Clone)] 29 pub struct YesNI; 30 #[derive(Copy, Clone)] 31 pub struct NoNI; 32 33 use core::marker::PhantomData; 34 35 #[derive(Copy, Clone)] 36 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>); 37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI> 38 where 39 sse2::u128x1_sse2<S3, S4, NI>: Swap64, 40 sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, 41 sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, 42 sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4, 43 sse2::u128x1_sse2<S3, S4, NI>: BSwap, 44 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>, 45 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>, 46 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>, 47 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>, 48 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>, 49 { 50 type u32x4 = sse2::u32x4_sse2<S3, S4, NI>; 51 type u64x2 = sse2::u64x2_sse2<S3, S4, NI>; 52 type u128x1 = sse2::u128x1_sse2<S3, S4, NI>; 53 54 type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>; 55 type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>; 56 type u64x4 = sse2::u64x4_sse2<S3, S4, NI>; 57 type u128x2 = sse2::u128x2_sse2<S3, S4, NI>; 58 59 type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>; 60 type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>; 61 type u128x4 = sse2::u128x4_sse2<S3, S4, NI>; 62 63 #[inline(always)] instance() -> Self64 unsafe fn instance() -> Self { 65 SseMachine(PhantomData) 66 } 67 } 68 69 #[derive(Copy, Clone)] 70 pub struct Avx2Machine<NI>(PhantomData<NI>); 71 impl<NI: Copy> Machine for Avx2Machine<NI> 72 where 73 sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64, 74 sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, 75 sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, 76 sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4, 77 { 78 type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>; 79 type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; 80 type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; 81 82 type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>; 83 type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; 84 type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; 85 type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; 86 87 type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>; 88 type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>; 89 type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>; 90 91 #[inline(always)] instance() -> Self92 unsafe fn instance() -> Self { 93 Avx2Machine(PhantomData) 94 } 95 } 96 97 pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>; 98 pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>; 99 pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>; 100 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything 101 /// to avoid expensive SSE/VEX conflicts. 102 pub type AVX = SseMachine<YesS3, YesS4, NoNI>; 103 pub type AVX2 = Avx2Machine<NoNI>; 104 105 zerocopy::cryptocorrosion_derive_traits! { 106 #[repr(C)] 107 /// Generic wrapper for unparameterized storage of any of the possible impls. 108 /// Converting into and out of this type should be essentially free, although it may be more 109 /// aligned than a particular impl requires. 110 #[allow(non_camel_case_types)] 111 #[derive(Copy, Clone)] 112 pub union vec128_storage { 113 u32x4: [u32; 4], 114 u64x2: [u64; 2], 115 u128x1: [u128; 1], 116 sse2: __m128i, 117 } 118 } 119 120 impl Store<vec128_storage> for vec128_storage { 121 #[inline(always)] unpack(p: vec128_storage) -> Self122 unsafe fn unpack(p: vec128_storage) -> Self { 123 p 124 } 125 } 126 impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { 127 #[inline(always)] from(x: &'a vec128_storage) -> Self128 fn from(x: &'a vec128_storage) -> Self { 129 unsafe { &x.u32x4 } 130 } 131 } 132 impl From<[u32; 4]> for vec128_storage { 133 #[inline(always)] from(u32x4: [u32; 4]) -> Self134 fn from(u32x4: [u32; 4]) -> Self { 135 vec128_storage { u32x4 } 136 } 137 } 138 impl Default for vec128_storage { 139 #[inline(always)] default() -> Self140 fn default() -> Self { 141 vec128_storage { u128x1: [0] } 142 } 143 } 144 impl Eq for vec128_storage {} 145 impl PartialEq for vec128_storage { 146 #[inline(always)] eq(&self, rhs: &Self) -> bool147 fn eq(&self, rhs: &Self) -> bool { 148 unsafe { self.u128x1 == rhs.u128x1 } 149 } 150 } 151 152 #[allow(non_camel_case_types)] 153 #[derive(Copy, Clone)] 154 pub union vec256_storage { 155 u32x8: [u32; 8], 156 u64x4: [u64; 4], 157 u128x2: [u128; 2], 158 sse2: [vec128_storage; 2], 159 avx: __m256i, 160 } 161 impl From<[u64; 4]> for vec256_storage { 162 #[inline(always)] from(u64x4: [u64; 4]) -> Self163 fn from(u64x4: [u64; 4]) -> Self { 164 vec256_storage { u64x4 } 165 } 166 } 167 impl Default for vec256_storage { 168 #[inline(always)] default() -> Self169 fn default() -> Self { 170 vec256_storage { u128x2: [0, 0] } 171 } 172 } 173 impl vec256_storage { 174 #[inline(always)] new128(xs: [vec128_storage; 2]) -> Self175 pub fn new128(xs: [vec128_storage; 2]) -> Self { 176 Self { sse2: xs } 177 } 178 #[inline(always)] split128(self) -> [vec128_storage; 2]179 pub fn split128(self) -> [vec128_storage; 2] { 180 unsafe { self.sse2 } 181 } 182 } 183 impl Eq for vec256_storage {} 184 impl PartialEq for vec256_storage { 185 #[inline(always)] eq(&self, rhs: &Self) -> bool186 fn eq(&self, rhs: &Self) -> bool { 187 unsafe { self.sse2 == rhs.sse2 } 188 } 189 } 190 191 #[allow(non_camel_case_types)] 192 #[derive(Copy, Clone)] 193 pub union vec512_storage { 194 u32x16: [u32; 16], 195 u64x8: [u64; 8], 196 u128x4: [u128; 4], 197 sse2: [vec128_storage; 4], 198 avx: [vec256_storage; 2], 199 } 200 impl Default for vec512_storage { 201 #[inline(always)] default() -> Self202 fn default() -> Self { 203 vec512_storage { 204 u128x4: [0, 0, 0, 0], 205 } 206 } 207 } 208 impl vec512_storage { 209 #[inline(always)] new128(xs: [vec128_storage; 4]) -> Self210 pub fn new128(xs: [vec128_storage; 4]) -> Self { 211 Self { sse2: xs } 212 } 213 #[inline(always)] split128(self) -> [vec128_storage; 4]214 pub fn split128(self) -> [vec128_storage; 4] { 215 unsafe { self.sse2 } 216 } 217 } 218 impl Eq for vec512_storage {} 219 impl PartialEq for vec512_storage { 220 #[inline(always)] eq(&self, rhs: &Self) -> bool221 fn eq(&self, rhs: &Self) -> bool { 222 unsafe { self.avx == rhs.avx } 223 } 224 } 225 226 macro_rules! impl_into { 227 ($storage:ident, $array:ty, $name:ident) => { 228 impl From<$storage> for $array { 229 #[inline(always)] 230 fn from(vec: $storage) -> Self { 231 unsafe { vec.$name } 232 } 233 } 234 }; 235 } 236 impl_into!(vec128_storage, [u32; 4], u32x4); 237 impl_into!(vec128_storage, [u64; 2], u64x2); 238 impl_into!(vec128_storage, [u128; 1], u128x1); 239 impl_into!(vec256_storage, [u32; 8], u32x8); 240 impl_into!(vec256_storage, [u64; 4], u64x4); 241 impl_into!(vec256_storage, [u128; 2], u128x2); 242 impl_into!(vec512_storage, [u32; 16], u32x16); 243 impl_into!(vec512_storage, [u64; 8], u64x8); 244 impl_into!(vec512_storage, [u128; 4], u128x4); 245 246 /// Generate the full set of optimized implementations to take advantage of the most important 247 /// hardware feature sets. 248 /// 249 /// This dispatcher is suitable for maximizing throughput. 250 #[macro_export] 251 macro_rules! dispatch { 252 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 253 #[cfg(feature = "std")] 254 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 255 #[inline(always)] 256 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 257 use std::arch::x86_64::*; 258 #[target_feature(enable = "avx2")] 259 unsafe fn impl_avx2($($arg: $argty),*) -> $ret { 260 let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*); 261 _mm256_zeroupper(); 262 ret 263 } 264 #[target_feature(enable = "avx")] 265 #[target_feature(enable = "sse4.1")] 266 #[target_feature(enable = "ssse3")] 267 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 268 let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*); 269 _mm256_zeroupper(); 270 ret 271 } 272 #[target_feature(enable = "sse4.1")] 273 #[target_feature(enable = "ssse3")] 274 unsafe fn impl_sse41($($arg: $argty),*) -> $ret { 275 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 276 } 277 #[target_feature(enable = "ssse3")] 278 unsafe fn impl_ssse3($($arg: $argty),*) -> $ret { 279 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 280 } 281 #[target_feature(enable = "sse2")] 282 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 283 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 284 } 285 unsafe { 286 if is_x86_feature_detected!("avx2") { 287 impl_avx2($($arg),*) 288 } else if is_x86_feature_detected!("avx") { 289 impl_avx($($arg),*) 290 } else if is_x86_feature_detected!("sse4.1") { 291 impl_sse41($($arg),*) 292 } else if is_x86_feature_detected!("ssse3") { 293 impl_ssse3($($arg),*) 294 } else if is_x86_feature_detected!("sse2") { 295 impl_sse2($($arg),*) 296 } else { 297 unimplemented!() 298 } 299 } 300 } 301 #[cfg(not(feature = "std"))] 302 #[inline(always)] 303 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 304 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 305 unsafe { 306 if cfg!(target_feature = "avx2") { 307 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 308 } else if cfg!(target_feature = "avx") { 309 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 310 } else if cfg!(target_feature = "sse4.1") { 311 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 312 } else if cfg!(target_feature = "ssse3") { 313 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 314 } else { 315 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 316 } 317 } 318 } 319 }; 320 ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 321 dispatch!($mach, $MTy, { 322 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 323 }); 324 } 325 } 326 327 /// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit 328 /// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX. 329 /// 330 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware 331 /// features (e.g. because they are done infrequently), so minimizing their contribution to code 332 /// size is more important. 333 #[macro_export] 334 macro_rules! dispatch_light128 { 335 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 336 #[cfg(feature = "std")] 337 $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 338 #[inline(always)] 339 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 340 use std::arch::x86_64::*; 341 #[target_feature(enable = "avx")] 342 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 343 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 344 } 345 #[target_feature(enable = "sse2")] 346 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 347 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 348 } 349 unsafe { 350 if is_x86_feature_detected!("avx") { 351 impl_avx($($arg),*) 352 } else if is_x86_feature_detected!("sse2") { 353 impl_sse2($($arg),*) 354 } else { 355 unimplemented!() 356 } 357 } 358 } 359 #[cfg(not(feature = "std"))] 360 #[inline(always)] 361 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 362 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 363 unsafe { 364 if cfg!(target_feature = "avx2") { 365 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 366 } else if cfg!(target_feature = "avx") { 367 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 368 } else if cfg!(target_feature = "sse4.1") { 369 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 370 } else if cfg!(target_feature = "ssse3") { 371 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 372 } else { 373 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 374 } 375 } 376 } 377 }; 378 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 379 dispatch_light128!($mach, $MTy, { 380 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 381 }); 382 } 383 } 384 385 /// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit 386 /// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2. 387 /// 388 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware 389 /// features (e.g. because they are done infrequently), so minimizing their contribution to code 390 /// size is more important. 391 #[macro_export] 392 macro_rules! dispatch_light256 { 393 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 394 #[cfg(feature = "std")] 395 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret { 396 #[inline(always)] 397 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 398 use std::arch::x86_64::*; 399 #[target_feature(enable = "avx")] 400 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 401 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 402 } 403 #[target_feature(enable = "sse2")] 404 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 405 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 406 } 407 unsafe { 408 if is_x86_feature_detected!("avx") { 409 impl_avx($($arg),*) 410 } else if is_x86_feature_detected!("sse2") { 411 impl_sse2($($arg),*) 412 } else { 413 unimplemented!() 414 } 415 } 416 } 417 #[cfg(not(feature = "std"))] 418 #[inline(always)] 419 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 420 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 421 unsafe { 422 if cfg!(target_feature = "avx2") { 423 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 424 } else if cfg!(target_feature = "avx") { 425 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 426 } else if cfg!(target_feature = "sse4.1") { 427 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 428 } else if cfg!(target_feature = "ssse3") { 429 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 430 } else { 431 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 432 } 433 } 434 } 435 }; 436 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 437 dispatch_light256!($mach, $MTy, { 438 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 439 }); 440 } 441 } 442