use core::arch::x86_64::*;

use super::{scalar, Vector};

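/// Marker type proving that the SSE2 target feature is available;
/// it can only be constructed when that has been ensured.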
#[derive(Copy, Clone)]
pub struct Impl(());

impl Impl {
    /// # Safety
    ///
    /// You must ensure that the CPU has the SSE2 feature
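    ///
    /// A typical pattern is to detect the feature at runtime before
    /// calling this (a sketch using the standard library's
    /// `is_x86_feature_detected!` macro; the exact construction site
    /// is up to the caller):
    ///
    /// ```ignore
    /// let imp = if is_x86_feature_detected!("sse2") {
    ///     // Safety: we just confirmed the SSE2 feature is present
    ///     Some(unsafe { Impl::new_unchecked() })
    /// } else {
    ///     None
    /// };
    /// ```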
    #[inline]
    #[cfg(feature = "std")]
    pub unsafe fn new_unchecked() -> Impl {
        Impl(())
    }
}

impl Vector for Impl {
    #[inline]
    fn round_scramble(&self, acc: &mut [u64; 8], secret_end: &[u8; 64]) {
        // Safety: Type can only be constructed when SSE2 feature is present
        unsafe { round_scramble_sse2(acc, secret_end) }
    }

    #[inline]
    fn accumulate(&self, acc: &mut [u64; 8], stripe: &[u8; 64], secret: &[u8; 64]) {
        // Safety: Type can only be constructed when SSE2 feature is present
        unsafe { accumulate_sse2(acc, stripe, secret) }
    }
}

/// # Safety
///
/// You must ensure that the CPU has the SSE2 feature
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn round_scramble_sse2(acc: &mut [u64; 8], secret_end: &[u8; 64]) {
    // The scalar implementation is autovectorized nicely enough
    scalar::Impl.round_scramble(acc, secret_end)
}

/// # Safety
///
/// You must ensure that the CPU has the SSE2 feature
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn accumulate_sse2(acc: &mut [u64; 8], stripe: &[u8; 64], secret: &[u8; 64]) {
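    // Reinterpret each 64-byte array as four 16-byte SSE2 vectors.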
    let acc = acc.as_mut_ptr().cast::<__m128i>();
    let stripe = stripe.as_ptr().cast::<__m128i>();
    let secret = secret.as_ptr().cast::<__m128i>();

    // Safety: The caller has ensured we have the SSE2
    // feature. We load from and store to references so we
    // know that data is valid. We use unaligned loads /
    // stores. Data manipulation is otherwise done on
    // intermediate values.
    unsafe {
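        // Each `__m128i` holds two of the eight `u64` accumulators,
        // so four iterations cover the full 64-byte stripe.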
        for i in 0..4 {
            // See [align-acc].
            let mut acc_0 = _mm_loadu_si128(acc.add(i));
            let stripe_0 = _mm_loadu_si128(stripe.add(i));
            let secret_0 = _mm_loadu_si128(secret.add(i));

            // value[i] = stripe[i] ^ secret[i]
            let value_0 = _mm_xor_si128(stripe_0, secret_0);

            // stripe_swap[i] = stripe[i ^ 1]
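            //
            // The immediate 0b01_00_11_10 selects the 32-bit lanes in
            // the order (2, 3, 0, 1), swapping the two 64-bit halves.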
            let stripe_swap_0 = _mm_shuffle_epi32::<0b01_00_11_10>(stripe_0);

            // acc[i] += stripe_swap[i]
            acc_0 = _mm_add_epi64(acc_0, stripe_swap_0);

            // value_shift[i] = value[i] >> 32
            let value_shift_0 = _mm_srli_epi64::<32>(value_0);

            // product[i] = lower_32_bit(value[i]) * lower_32_bit(value_shift[i])
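            //
            // `_mm_mul_epu32` multiplies the low 32 bits of each
            // 64-bit lane, producing two full 64-bit products.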
            let product_0 = _mm_mul_epu32(value_0, value_shift_0);

            // acc[i] += product[i]
            acc_0 = _mm_add_epi64(acc_0, product_0);

            _mm_storeu_si128(acc.add(i), acc_0);
        }
    }
}