use core::arch::x86_64::*;

use super::{scalar, Vector};

#[derive(Copy, Clone)]
pub struct Impl(());

impl Impl {
    /// # Safety
    ///
    /// You must ensure that the CPU has the SSE2 feature
    #[inline]
    #[cfg(feature = "std")]
    pub unsafe fn new_unchecked() -> Impl {
        Impl(())
    }
}
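
// A hedged usage sketch (not part of the original source): `new_unchecked` is
// only sound once SSE2 support is known to be present, so a caller built with
// `std` might pair it with runtime detection along these lines (the
// `new_checked` name is hypothetical):
//
//     #[cfg(feature = "std")]
//     fn new_checked() -> Option<Impl> {
//         if std::is_x86_feature_detected!("sse2") {
//             // Safety: we just confirmed the CPU supports SSE2.
//             Some(unsafe { Impl::new_unchecked() })
//         } else {
//             None
//         }
//     }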

impl Vector for Impl {
    #[inline]
    fn round_scramble(&self, acc: &mut [u64; 8], secret_end: &[u8; 64]) {
        // Safety: Type can only be constructed when the SSE2 feature is present
        unsafe { round_scramble_sse2(acc, secret_end) }
    }

    #[inline]
    fn accumulate(&self, acc: &mut [u64; 8], stripe: &[u8; 64], secret: &[u8; 64]) {
        // Safety: Type can only be constructed when the SSE2 feature is present
        unsafe { accumulate_sse2(acc, stripe, secret) }
    }
}

/// # Safety
///
/// You must ensure that the CPU has the SSE2 feature
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn round_scramble_sse2(acc: &mut [u64; 8], secret_end: &[u8; 64]) {
    // The scalar implementation is autovectorized nicely enough
    scalar::Impl.round_scramble(acc, secret_end)
}

/// # Safety
///
/// You must ensure that the CPU has the SSE2 feature
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn accumulate_sse2(acc: &mut [u64; 8], stripe: &[u8; 64], secret: &[u8; 64]) {
    let acc = acc.as_mut_ptr().cast::<__m128i>();
    let stripe = stripe.as_ptr().cast::<__m128i>();
    let secret = secret.as_ptr().cast::<__m128i>();

    // Safety: The caller has ensured we have the SSE2
    // feature. We load from and store to references so we
    // know that data is valid. We use unaligned loads /
    // stores. Data manipulation is otherwise done on
    // intermediate values.
    unsafe {
        for i in 0..4 {
            // See [align-acc].
            let mut acc_0 = _mm_loadu_si128(acc.add(i));
            let stripe_0 = _mm_loadu_si128(stripe.add(i));
            let secret_0 = _mm_loadu_si128(secret.add(i));

            // value[i] = stripe[i] ^ secret[i]
            let value_0 = _mm_xor_si128(stripe_0, secret_0);

            // stripe_swap[i] = stripe[i ^ 1]
            let stripe_swap_0 = _mm_shuffle_epi32::<0b01_00_11_10>(stripe_0);

            // acc[i] += stripe_swap[i]
            acc_0 = _mm_add_epi64(acc_0, stripe_swap_0);

            // value_shift[i] = value[i] >> 32
            let value_shift_0 = _mm_srli_epi64::<32>(value_0);

            // product[i] = lower_32_bit(value[i]) * lower_32_bit(value_shift[i])
            let product_0 = _mm_mul_epu32(value_0, value_shift_0);

            // acc[i] += product[i]
            acc_0 = _mm_add_epi64(acc_0, product_0);

            _mm_storeu_si128(acc.add(i), acc_0);
        }
    }
}
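
// A hedged summary (not part of the original source): viewing `stripe` and
// `secret` as eight little-endian u64 lanes (`stripe64` and `secret64` below
// are hypothetical names for those views), the loop above updates each lane
// `j` of `acc` roughly as:
//
//     let value = stripe64[j] ^ secret64[j];
//     acc[j] = acc[j]
//         .wrapping_add(stripe64[j ^ 1])
//         .wrapping_add((value & 0xFFFF_FFFF) * (value >> 32));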
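
// A hedged test sketch (not part of the original source): `scalar::Impl`
// implements the same `Vector` trait, so one way to sanity-check this path is
// to compare an accumulate step against the scalar fallback. The module and
// test names are hypothetical, and the test assumes it is compiled with `std`:
//
//     #[cfg(all(test, feature = "std"))]
//     mod tests {
//         use super::*;
//
//         #[test]
//         fn sse2_accumulate_matches_scalar() {
//             if !std::is_x86_feature_detected!("sse2") {
//                 return;
//             }
//
//             let stripe = [0x5A; 64];
//             let secret = [0xA5; 64];
//             let mut acc_sse2 = [1, 2, 3, 4, 5, 6, 7, 8];
//             let mut acc_scalar = acc_sse2;
//
//             // Safety: SSE2 support was verified just above.
//             unsafe { Impl::new_unchecked() }.accumulate(&mut acc_sse2, &stripe, &secret);
//             scalar::Impl.accumulate(&mut acc_scalar, &stripe, &secret);
//
//             assert_eq!(acc_sse2, acc_scalar);
//         }
//     }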