use super::Adler32Imp;

/// Resolves update implementation if CPU supports avx512f and avx512bw instructions.
pub fn get_imp() -> Option<Adler32Imp> {
  get_imp_inner()
}

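// Illustrative usage sketch (not part of this file): the resolved implementation
// has the `Adler32Imp` shape `fn(u16, u16, &[u8]) -> (u16, u16)`, taking the
// current `a`/`b` sums and returning the updated pair. A caller could combine
// them into the final checksum roughly like this:
//
//   if let Some(update) = get_imp() {
//     let (a, b) = update(1, 0, b"Wikipedia");
//     let checksum = u32::from(b) << 16 | u32::from(a);
//   }
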
#[inline]
#[cfg(all(feature = "std", feature = "nightly", target_arch = "x86"))]
fn get_imp_inner() -> Option<Adler32Imp> {
  // Runtime detection: `imp::update` needs both avx512f and avx512bw.
  if std::is_x86_feature_detected!("avx512f") && std::is_x86_feature_detected!("avx512bw") {
    Some(imp::update)
  } else {
    None
  }
}

#[inline]
#[cfg(all(feature = "std", feature = "nightly", target_arch = "x86_64"))]
fn get_imp_inner() -> Option<Adler32Imp> {
  // Runtime detection: `imp::update` needs both avx512f and avx512bw.
  if std::is_x86_feature_detected!("avx512f") && std::is_x86_feature_detected!("avx512bw") {
    Some(imp::update)
  } else {
    None
  }
}

#[inline]
#[cfg(all(
  feature = "nightly",
  target_feature = "avx512f",
  target_feature = "avx512bw",
  not(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))
))]
fn get_imp_inner() -> Option<Adler32Imp> {
  // The required features are enabled at compile time, so no runtime check is needed.
  Some(imp::update)
}

#[inline]
#[cfg(all(
  not(all(target_feature = "avx512f", target_feature = "avx512bw")),
  not(all(
    feature = "std",
    feature = "nightly",
    any(target_arch = "x86", target_arch = "x86_64")
  ))
))]
fn get_imp_inner() -> Option<Adler32Imp> {
  None
}

#[cfg(all(
  feature = "nightly",
  any(target_arch = "x86", target_arch = "x86_64"),
  any(
    feature = "std",
    all(target_feature = "avx512f", target_feature = "avx512bw")
  )
))]
mod imp {
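  // Adler-32 keeps two running sums over the input: `a` is 1 plus the sum of all
  // bytes, and `b` is the sum of every intermediate value of `a`; the checksum is
  // `b << 16 | a`, with both sums reduced modulo MOD (65521, the largest prime
  // below 2^16). NMAX is the standard zlib bound: the most bytes that can be
  // accumulated into 32-bit sums before a reduction modulo MOD is required to
  // avoid overflow. BLOCK_SIZE is one 512-bit register's worth of bytes, and
  // CHUNK_SIZE rounds NMAX down to a whole number of such blocks.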
  const MOD: u32 = 65521;
  const NMAX: usize = 5552;
  const BLOCK_SIZE: usize = 64;
  const CHUNK_SIZE: usize = NMAX / BLOCK_SIZE * BLOCK_SIZE;

  #[cfg(target_arch = "x86")]
  use core::arch::x86::*;
  #[cfg(target_arch = "x86_64")]
  use core::arch::x86_64::*;

  // `update_imp` requires avx512f/avx512bw; the resolver in `get_imp` only hands
  // this function out when those features are available (detected at runtime or
  // enabled at compile time), so the `unsafe` call below is expected to run only
  // on supporting CPUs.
  pub fn update(a: u16, b: u16, data: &[u8]) -> (u16, u16) {
    unsafe { update_imp(a, b, data) }
  }

  #[inline]
  #[target_feature(enable = "avx512f")]
  #[target_feature(enable = "avx512bw")]
  unsafe fn update_imp(a: u16, b: u16, data: &[u8]) -> (u16, u16) {
    let mut a = a as u32;
    let mut b = b as u32;

    // Process chunks of CHUNK_SIZE (at most NMAX) bytes so the 32-bit accumulators
    // cannot overflow before each chunk's modulo reduction; the tail is handled
    // separately.
    let chunks = data.chunks_exact(CHUNK_SIZE);
    let remainder = chunks.remainder();
    for chunk in chunks {
      update_chunk_block(&mut a, &mut b, chunk);
    }

    update_block(&mut a, &mut b, remainder);

    (a as u16, b as u16)
  }

  #[inline]
  unsafe fn update_chunk_block(a: &mut u32, b: &mut u32, chunk: &[u8]) {
    debug_assert_eq!(
      chunk.len(),
      CHUNK_SIZE,
      "Unexpected chunk size (expected {}, got {})",
      CHUNK_SIZE,
      chunk.len()
    );

    reduce_add_blocks(a, b, chunk);

    *a %= MOD;
    *b %= MOD;
  }

  #[inline]
  unsafe fn update_block(a: &mut u32, b: &mut u32, chunk: &[u8]) {
    debug_assert!(
      chunk.len() <= CHUNK_SIZE,
      "Unexpected chunk size (expected <= {}, got {})",
      CHUNK_SIZE,
      chunk.len()
    );

    // `reduce_add_blocks` consumes whole 64-byte blocks with SIMD and returns the
    // unprocessed tail, which is folded in with the plain scalar recurrence.
    for byte in reduce_add_blocks(a, b, chunk) {
      *a += *byte as u32;
      *b += *a;
    }

    *a %= MOD;
    *b %= MOD;
  }

  #[inline(always)]
  unsafe fn reduce_add_blocks<'a>(a: &mut u32, b: &mut u32, chunk: &'a [u8]) -> &'a [u8] {
    if chunk.len() < BLOCK_SIZE {
      return chunk;
    }

    let blocks = chunk.chunks_exact(BLOCK_SIZE);
    let blocks_remainder = blocks.remainder();

    let one_v = _mm512_set1_epi16(1);
    let zero_v = _mm512_setzero_si512();
    let weights = get_weights();

    let p_v = (*a * blocks.len() as u32) as _;
    let mut p_v = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, p_v);
    let mut a_v = _mm512_setzero_si512();
    let mut b_v = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *b as _);

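    // What the loop below does with each 64-byte block:
    // `_mm512_sad_epu8(block, zero)` sums the bytes of each 8-byte group into a
    // 64-bit lane, accumulating the plain byte sum needed for `a`.
    // `_mm512_maddubs_epi16(block, weights)` multiplies each byte by its positional
    // weight (64 down to 1) and adds adjacent pairs into i16 lanes, and
    // `_mm512_madd_epi16` with an all-ones vector widens those into i32 lanes,
    // accumulating the position-weighted sum needed for `b`. `p_v` picks up the
    // running `a_v` once per block so previously seen bytes can later be credited
    // to `b` once for every additional block they precede.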
    for block in blocks {
      let block_ptr = block.as_ptr() as *const _;
      let block = _mm512_loadu_si512(block_ptr);

      p_v = _mm512_add_epi32(p_v, a_v);

      a_v = _mm512_add_epi32(a_v, _mm512_sad_epu8(block, zero_v));
      let mad = _mm512_maddubs_epi16(block, weights);
      b_v = _mm512_add_epi32(b_v, _mm512_madd_epi16(mad, one_v));
    }

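    // Each block accumulated into `p_v` stands in for BLOCK_SIZE (64) additions of
    // the running `a` into `b`, hence the shift left by 6 (a multiply by 64).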
    b_v = _mm512_add_epi32(b_v, _mm512_slli_epi32(p_v, 6));

    *a += reduce_add(a_v);
    *b = reduce_add(b_v);

    blocks_remainder
  }

  #[inline(always)]
  unsafe fn reduce_add(v: __m512i) -> u32 {
    let v: [__m256i; 2] = core::mem::transmute(v);

    reduce_add_256(v[0]) + reduce_add_256(v[1])
  }

  #[inline(always)]
  unsafe fn reduce_add_256(v: __m256i) -> u32 {
    let v: [__m128i; 2] = core::mem::transmute(v);
    let sum = _mm_add_epi32(v[0], v[1]);
    let hi = _mm_unpackhi_epi64(sum, sum);

    let sum = _mm_add_epi32(hi, sum);
    let hi = _mm_shuffle_epi32(sum, crate::imp::_MM_SHUFFLE(2, 3, 0, 1));

    let sum = _mm_add_epi32(sum, hi);
    let sum = _mm_cvtsi128_si32(sum) as _;

    sum
  }

  // `_mm512_set_epi8` lists lanes from the highest byte down to byte 0, so this
  // gives the first byte of a loaded block a weight of 64 and the last a weight
  // of 1: each byte contributes to `b` once for every byte from its position to
  // the end of the block.
  #[inline(always)]
  unsafe fn get_weights() -> __m512i {
    _mm512_set_epi8(
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
      24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
      45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
    )
  }
}

#[cfg(test)]
mod tests {
  use rand::Rng;

  #[test]
  fn zeroes() {
    assert_sum_eq(&[]);
    assert_sum_eq(&[0]);
    assert_sum_eq(&[0, 0]);
    assert_sum_eq(&[0; 100]);
    assert_sum_eq(&[0; 1024]);
    assert_sum_eq(&[0; 1024 * 1024]);
  }

  #[test]
  fn ones() {
    assert_sum_eq(&[]);
    assert_sum_eq(&[1]);
    assert_sum_eq(&[1, 1]);
    assert_sum_eq(&[1; 100]);
    assert_sum_eq(&[1; 1024]);
    assert_sum_eq(&[1; 1024 * 1024]);
  }

  #[test]
  fn random() {
    let mut random = [0; 1024 * 1024];
    rand::thread_rng().fill(&mut random[..]);

    assert_sum_eq(&random[..1]);
    assert_sum_eq(&random[..100]);
    assert_sum_eq(&random[..1024]);
    assert_sum_eq(&random[..1024 * 1024]);
  }

  /// Example calculation from <https://en.wikipedia.org/wiki/Adler-32>.
  #[test]
  fn wiki() {
    assert_sum_eq(b"Wikipedia");
  }

  /// Checks the SIMD implementation against the reference `adler` crate.
  fn assert_sum_eq(data: &[u8]) {
    if let Some(update) = super::get_imp() {
      let (a, b) = update(1, 0, data);
      let left = u32::from(b) << 16 | u32::from(a);
      let right = adler::adler32_slice(data);

      assert_eq!(left, right, "len({})", data.len());
    }
  }
}