1 use super::Adler32Imp;
2
3 /// Resolves update implementation if CPU supports avx512f and avx512bw instructions.
get_imp() -> Option<Adler32Imp>4 pub fn get_imp() -> Option<Adler32Imp> {
5 get_imp_inner()
6 }
7
/// Runtime selection for 32-bit ARM: hands out the NEON implementation only
/// when the running CPU reports NEON support.
#[inline]
#[cfg(all(feature = "std", feature = "nightly", target_arch = "arm"))]
fn get_imp_inner() -> Option<Adler32Imp> {
    // Feature detection is a macro exported from `std::arch`; it has to be
    // invoked with `!` rather than called like a function.
    // NOTE(review): the 32-bit ARM detection macro is still unstable; this
    // assumes the crate's `nightly` feature enables the required gate — confirm.
    if std::arch::is_arm_feature_detected!("neon") {
        Some(imp::update)
    } else {
        None
    }
}
17
/// Runtime selection for AArch64: hands out the NEON implementation only when
/// the running CPU reports NEON support.
#[inline]
#[cfg(all(feature = "std", feature = "nightly", target_arch = "aarch64"))]
fn get_imp_inner() -> Option<Adler32Imp> {
    // Feature detection is a macro exported from `std::arch`; it has to be
    // invoked with `!` rather than called like a function.
    if std::arch::is_aarch64_feature_detected!("neon") {
        Some(imp::update)
    } else {
        None
    }
}
27
/// Compile-time selection: NEON is enabled for the whole build and the
/// runtime-detecting variants are not compiled, so the NEON implementation is
/// returned unconditionally.
#[inline]
#[cfg(all(
    feature = "nightly",
    target_feature = "neon",
    not(all(feature = "std", any(target_arch = "arm", target_arch = "aarch64")))
))]
fn get_imp_inner() -> Option<Adler32Imp> {
    let neon_update: Adler32Imp = imp::update;
    Some(neon_update)
}
37
38 #[inline]
39 #[cfg(all(
40 not(target_feature = "neon"),
41 not(all(
42 feature = "std",
43 feature = "nightly",
44 any(target_arch = "arm", target_arch = "aarch64")
45 ))
46 ))]
get_imp_inner() -> Option<Adler32Imp>47 fn get_imp_inner() -> Option<Adler32Imp> {
48 None
49 }
50
/// NEON implementation of the Adler-32 update step.
///
/// Compiled only on ARM/AArch64 with the `nightly` feature, when NEON is
/// either detectable at runtime (`std`) or enabled at compile time.
#[cfg(all(
    feature = "nightly",
    any(target_arch = "arm", target_arch = "aarch64"),
    any(feature = "std", target_feature = "neon")
))]
mod imp {
    /// Modulus applied to both running sums.
    const MOD: u32 = 65521;
    /// Upper bound on the number of bytes folded in between two reductions.
    const NMAX: usize = 5552;
    /// Bytes consumed per iteration of the vector loop.
    const BLOCK_SIZE: usize = 64;
    /// Bytes held by one 128-bit NEON register.
    const LANE_SIZE: usize = 16;
    /// Largest multiple of `BLOCK_SIZE` that does not exceed `NMAX`.
    const CHUNK_SIZE: usize = NMAX / BLOCK_SIZE * BLOCK_SIZE;

    /// Contribution of each byte of a block to `b`: the byte at offset `i`
    /// is counted `BLOCK_SIZE - i` times.
    const WEIGHTS: [u8; BLOCK_SIZE] = [
        64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, //
        48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, //
        32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, //
        16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
    ];

    #[cfg(target_arch = "aarch64")]
    use core::arch::aarch64::*;
    #[cfg(target_arch = "arm")]
    use core::arch::arm::*;

    /// Folds `data` into the running sums `a` and `b` and returns the new
    /// pair, each reduced modulo `MOD`.
    pub fn update(a: u16, b: u16, data: &[u8]) -> (u16, u16) {
        // SAFETY: requires NEON. The `get_imp_inner` variants in this file
        // only hand this function out after detecting NEON at runtime or when
        // it is enabled as a compile-time target feature.
        unsafe { update_imp(a, b, data) }
    }

    /// Processes `data` in `CHUNK_SIZE` pieces, reducing after each one.
    ///
    /// # Safety
    ///
    /// The CPU must support NEON.
    #[inline]
    #[target_feature(enable = "neon")]
    unsafe fn update_imp(a: u16, b: u16, data: &[u8]) -> (u16, u16) {
        let mut a = a as u32;
        let mut b = b as u32;

        let chunks = data.chunks_exact(CHUNK_SIZE);
        let remainder = chunks.remainder();
        for chunk in chunks {
            update_chunk_block(&mut a, &mut b, chunk);
        }

        update_block(&mut a, &mut b, remainder);

        // Both sums were just reduced modulo `MOD`, so they fit in 16 bits.
        (a as u16, b as u16)
    }

    /// Folds one full chunk (exactly `CHUNK_SIZE` bytes) into the sums.
    ///
    /// # Safety
    ///
    /// The CPU must support NEON.
    #[inline]
    unsafe fn update_chunk_block(a: &mut u32, b: &mut u32, chunk: &[u8]) {
        debug_assert_eq!(
            chunk.len(),
            CHUNK_SIZE,
            "Unexpected chunk size (expected {}, got {})",
            CHUNK_SIZE,
            chunk.len()
        );

        // `CHUNK_SIZE` is a multiple of `BLOCK_SIZE`, so nothing is left over.
        reduce_add_blocks(a, b, chunk);

        *a %= MOD;
        *b %= MOD;
    }

    /// Folds a trailing chunk (at most `CHUNK_SIZE` bytes) into the sums:
    /// whole blocks go through the vector path, leftover bytes are added one
    /// by one.
    ///
    /// # Safety
    ///
    /// The CPU must support NEON.
    #[inline]
    unsafe fn update_block(a: &mut u32, b: &mut u32, chunk: &[u8]) {
        debug_assert!(
            chunk.len() <= CHUNK_SIZE,
            "Unexpected chunk size (expected <= {}, got {})",
            CHUNK_SIZE,
            chunk.len()
        );

        for byte in reduce_add_blocks(a, b, chunk) {
            *a += *byte as u32;
            *b += *a;
        }

        *a %= MOD;
        *b %= MOD;
    }

    /// Adds every complete `BLOCK_SIZE` block of `chunk` to `a` and `b`
    /// (without reducing) and returns the bytes that did not fill a block.
    ///
    /// For one block with bytes `d[0..64]` the scalar recurrence gives
    /// `a' = a + sum(d)` and `b' = b + 64 * a + sum(d[i] * (64 - i))`; the
    /// loop below accumulates these three terms across all blocks.
    ///
    /// # Safety
    ///
    /// The CPU must support NEON. `chunk` must be at most `CHUNK_SIZE` bytes
    /// so the 32-bit accumulators cannot overflow.
    #[inline(always)]
    unsafe fn reduce_add_blocks<'a>(a: &mut u32, b: &mut u32, chunk: &'a [u8]) -> &'a [u8] {
        if chunk.len() < BLOCK_SIZE {
            return chunk;
        }

        let blocks = chunk.chunks_exact(BLOCK_SIZE);
        let blocks_remainder = blocks.remainder();
        let block_count = blocks.len() as u32;

        let weights = get_weights();

        // p_v: per-lane sum of `a_v` as it was at the start of every block.
        // a_v: per-lane sum of all bytes seen so far.
        // b_v: per-lane sum of all weighted bytes seen so far.
        let mut p_v = vdupq_n_u32(0);
        let mut a_v = vdupq_n_u32(0);
        let mut b_v = vdupq_n_u32(0);

        for block in blocks {
            p_v = vaddq_u32(p_v, a_v);

            for (lane, &weight) in block.chunks_exact(LANE_SIZE).zip(weights.iter()) {
                let bytes = vld1q_u8(lane.as_ptr());

                // Widen u8 -> u16 pairwise, then accumulate into u32 lanes.
                a_v = vpadalq_u16(a_v, vpaddlq_u8(bytes));

                // byte * weight is at most 255 * 64 and fits in u16.
                let low = vmull_u8(vget_low_u8(bytes), vget_low_u8(weight));
                let high = vmull_u8(vget_high_u8(bytes), vget_high_u8(weight));
                b_v = vpadalq_u16(b_v, low);
                b_v = vpadalq_u16(b_v, high);
            }
        }

        // Sum over all blocks of the value `a` had right before that block.
        let a_before_blocks = *a * block_count + reduce_add(p_v);

        *b += a_before_blocks * BLOCK_SIZE as u32 + reduce_add(b_v);
        *a += reduce_add(a_v);

        blocks_remainder
    }

    /// Horizontal sum of the four 32-bit lanes of `v`.
    ///
    /// # Safety
    ///
    /// The CPU must support NEON.
    #[inline(always)]
    unsafe fn reduce_add(v: uint32x4_t) -> u32 {
        let mut lanes = [0u32; 4];
        vst1q_u32(lanes.as_mut_ptr(), v);

        lanes[0] + lanes[1] + lanes[2] + lanes[3]
    }

    /// Loads `WEIGHTS` as four 16-byte vectors, in block order.
    ///
    /// # Safety
    ///
    /// The CPU must support NEON.
    #[inline(always)]
    unsafe fn get_weights() -> [uint8x16_t; BLOCK_SIZE / LANE_SIZE] {
        let base = WEIGHTS.as_ptr();

        [
            vld1q_u8(base),
            vld1q_u8(base.add(LANE_SIZE)),
            vld1q_u8(base.add(2 * LANE_SIZE)),
            vld1q_u8(base.add(3 * LANE_SIZE)),
        ]
    }
}
190
/// Compares the resolved implementation against the `adler` crate on a range
/// of input sizes. Every check is skipped when `get_imp` resolves to `None`.
#[cfg(test)]
mod tests {
    use rand::Rng;

    #[test]
    fn zeroes() {
        static ZEROES: [u8; 1024 * 1024] = [0; 1024 * 1024];

        for &len in [0, 1, 2, 100, 1024, 1024 * 1024].iter() {
            assert_sum_eq(&ZEROES[..len]);
        }
    }

    #[test]
    fn ones() {
        static ONES: [u8; 1024 * 1024] = [1; 1024 * 1024];

        for &len in [0, 1, 2, 100, 1024, 1024 * 1024].iter() {
            assert_sum_eq(&ONES[..len]);
        }
    }

    #[test]
    fn random() {
        let mut bytes = [0u8; 1024 * 1024];
        rand::thread_rng().fill(&mut bytes[..]);

        for &len in [1, 100, 1024, 1024 * 1024].iter() {
            assert_sum_eq(&bytes[..len]);
        }
    }

    /// Example calculation from https://en.wikipedia.org/wiki/Adler-32.
    #[test]
    fn wiki() {
        assert_sum_eq(b"Wikipedia");
    }

    /// Checks `data` against the reference checksum, starting from the
    /// initial state `(a, b) = (1, 0)`.
    fn assert_sum_eq(data: &[u8]) {
        let update = match super::get_imp() {
            Some(update) => update,
            // No implementation on this target/CPU: nothing to compare.
            None => return,
        };

        let (a, b) = update(1, 0, data);
        let actual = (u32::from(b) << 16) | u32::from(a);
        let expected = adler::adler32_slice(data);

        assert_eq!(actual, expected, "len({})", data.len());
    }
}
242