// crate minimums: sse2, x86_64

use crate::types::*;
use core::arch::x86_64::{__m128i, __m256i};

mod sse2;

// Type-level capability flags: each Yes*/No* pair is a zero-sized marker that
// selects, at compile time, whether the generic impls in `sse2` may use a
// given instruction-set extension. From the machine type aliases below:
// S3 <-> SSSE3 and S4 <-> SSE4.1. NI, A1, and A2 are threaded through as
// parameters but not otherwise exercised in this file (NI is presumably
// AES-NI, A1/A2 presumably AVX/AVX2 — confirm in the `sse2` module).
#[derive(Copy, Clone)]
pub struct YesS3;
#[derive(Copy, Clone)]
pub struct NoS3;

#[derive(Copy, Clone)]
pub struct YesS4;
#[derive(Copy, Clone)]
pub struct NoS4;

#[derive(Copy, Clone)]
pub struct YesA1;
#[derive(Copy, Clone)]
pub struct NoA1;

#[derive(Copy, Clone)]
pub struct YesA2;
#[derive(Copy, Clone)]
pub struct NoA2;

#[derive(Copy, Clone)]
pub struct YesNI;
#[derive(Copy, Clone)]
pub struct NoNI;

use core::marker::PhantomData;

/// Zero-sized token standing for an SSE-family machine whose available
/// extensions are encoded in the `S3`/`S4`/`NI` type parameters. All vector
/// associated types delegate to the generic SSE2-based impls in `sse2`.
#[derive(Copy, Clone)]
pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
// The where-clause spells out exactly the trait surface the associated types
// must provide; the `sse2` module supplies these impls per capability flag.
where
    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
{
    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;

    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;

    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;

    // NOTE(review): the dispatch macros below only call this after the
    // matching target features have been detected/enabled — presumably that
    // is the safety contract; confirm against the `Machine` trait docs.
    #[inline(always)]
    unsafe fn instance() -> Self {
        SseMachine(PhantomData)
    }
}

/// Zero-sized token for an AVX2-capable machine. AVX2 implies SSSE3 and
/// SSE4.1, so the 128-bit associated types reuse the SSE impls instantiated
/// with `YesS3, YesS4`, while the wide u32x4x2/u32x4x4 types use the
/// dedicated AVX2 impls.
#[derive(Copy, Clone)]
pub struct Avx2Machine<NI>(PhantomData<NI>);
impl<NI: Copy> Machine for Avx2Machine<NI>
where
    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
{
    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;

    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;

    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;

    // NOTE(review): same presumed safety contract as SseMachine::instance —
    // only call once AVX2 availability has been established.
    #[inline(always)]
    unsafe fn instance() -> Self {
        Avx2Machine(PhantomData)
    }
}

// Concrete machine types, one per feature tier targeted by the dispatchers.
pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
/// to avoid expensive SSE/VEX conflicts.
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
pub type AVX2 = Avx2Machine<NoNI>;

/// Generic wrapper for unparameterized storage of any of the possible impls.
/// Converting into and out of this type should be essentially free, although it may be more
/// aligned than a particular impl requires.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec128_storage {
    // All fields are 16 bytes of plain integer data, so reading any field of
    // an initialized value reinterprets the same bytes and is sound.
    u32x4: [u32; 4],
    u64x2: [u64; 2],
    u128x1: [u128; 1],
    sse2: __m128i,
}
impl Store<vec128_storage> for vec128_storage {
    // Identity unpack: the storage type stores itself.
    #[inline(always)]
    unsafe fn unpack(p: vec128_storage) -> Self {
        p
    }
}
impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
    #[inline(always)]
    fn from(x: &'a vec128_storage) -> Self {
        // SAFETY: all fields share the same 16 initialized bytes, so viewing
        // them as [u32; 4] is always valid.
        unsafe { &x.u32x4 }
    }
}
impl From<[u32; 4]> for vec128_storage {
    #[inline(always)]
    fn from(u32x4: [u32; 4]) -> Self {
        vec128_storage { u32x4 }
    }
}
impl Default for vec128_storage {
    // All-zero bytes, expressed through the widest lane.
    #[inline(always)]
    fn default() -> Self {
        vec128_storage { u128x1: [0] }
    }
}
impl Eq for vec128_storage {}
impl PartialEq for vec128_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // SAFETY: comparing as u128 lanes compares all 16 bytes; any view of
        // the union is fully initialized.
        unsafe { self.u128x1 == rhs.u128x1 }
    }
}

/// 256-bit counterpart of `vec128_storage`; again every field covers the same
/// 32 bytes of plain integer data.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec256_storage {
    u32x8: [u32; 8],
    u64x4: [u64; 4],
    u128x2: [u128; 2],
    sse2: [vec128_storage; 2],
    avx: __m256i,
}
impl From<[u64; 4]> for vec256_storage {
    #[inline(always)]
    fn from(u64x4: [u64; 4]) -> Self {
        vec256_storage { u64x4 }
    }
}
impl Default for vec256_storage {
    #[inline(always)]
    fn default() -> Self {
        vec256_storage { u128x2: [0, 0] }
    }
}
impl vec256_storage {
    /// Build a 256-bit storage from two 128-bit halves.
    #[inline(always)]
    pub fn new128(xs: [vec128_storage; 2]) -> Self {
        Self { sse2: xs }
    }
    /// Split into two 128-bit halves.
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 2] {
        // SAFETY: the `sse2` view covers the same fully-initialized 32 bytes
        // as every other field.
        unsafe { self.sse2 }
    }
}
impl Eq for vec256_storage {}
impl PartialEq for vec256_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // SAFETY: the two 128-bit halves cover all 32 initialized bytes;
        // vec128_storage's PartialEq compares them as u128 lanes.
        unsafe { self.sse2 == rhs.sse2 }
    }
}

/// 512-bit counterpart of `vec128_storage`; every field covers the same
/// 64 bytes of plain integer data.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec512_storage {
    u32x16: [u32; 16],
    u64x8: [u64; 8],
    u128x4: [u128; 4],
    sse2: [vec128_storage; 4],
    avx: [vec256_storage; 2],
}
impl Default for vec512_storage {
    #[inline(always)]
    fn default() -> Self {
        vec512_storage {
            u128x4: [0, 0, 0, 0],
        }
    }
}
impl vec512_storage {
    /// Build a 512-bit storage from four 128-bit quarters.
    #[inline(always)]
    pub fn new128(xs: [vec128_storage; 4]) -> Self {
        Self { sse2: xs }
    }
    /// Split into four 128-bit quarters.
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 4] {
        // SAFETY: the `sse2` view covers the same fully-initialized 64 bytes
        // as every other field.
        unsafe { self.sse2 }
    }
}
impl Eq for vec512_storage {}
impl PartialEq for vec512_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // SAFETY: compares all 64 bytes via the two vec256_storage halves.
        unsafe { self.avx == rhs.avx }
    }
}

// Generates `From<$storage> for $array` by reading the named union field.
// The `unsafe` read in the expansion is sound for the same reason as above:
// every field of these unions is plain integer data of the full width.
macro_rules! impl_into {
    ($storage:ident, $array:ty, $name:ident) => {
        impl From<$storage> for $array {
            #[inline(always)]
            fn from(vec: $storage) -> Self {
                unsafe { vec.$name }
            }
        }
    };
}
impl_into!(vec128_storage, [u32; 4], u32x4);
impl_into!(vec128_storage, [u64; 2], u64x2);
impl_into!(vec128_storage, [u128; 1], u128x1);
impl_into!(vec256_storage, [u32; 8], u32x8);
impl_into!(vec256_storage, [u64; 4], u64x4);
impl_into!(vec256_storage, [u128; 2], u128x2);
impl_into!(vec512_storage, [u32; 16], u32x16);
impl_into!(vec512_storage, [u64; 8], u64x8);
impl_into!(vec512_storage, [u128; 4], u128x4);

/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
#[macro_export]
macro_rules! dispatch {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // std build: runtime CPU feature detection, dispatching to a
        // #[target_feature]-specialized copy of the body per feature tier.
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                // Clear the upper YMM state before returning to (possibly
                // non-VEX) SSE code, avoiding AVX-SSE transition penalties.
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Probe from the richest feature set down; each impl_* is only
            // called after its features were detected.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // no_std build: no runtime detection available, so select the
        // implementation from compile-time target features instead.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Unit-return convenience arm: forward to the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
#[macro_export]
macro_rules! dispatch_light128 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // std build: detect at runtime, but only generate the SSE2 and AVX
        // tiers to keep code size down.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // no_std build: compile-time selection; all tiers are free here since
        // only one branch survives cfg! evaluation.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Unit-return convenience arm.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        // Bug fix: this previously expanded as `$([$pub $(($krate))*])*`,
        // which emits literal square brackets around the visibility (e.g.
        // `[pub] fn name`) and fails to compile whenever a visibility is
        // supplied. The sibling dispatchers use the bracket-free form.
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Unit-return convenience arm.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}