// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_

#include <cstdint>

#include "absl/base/config.h"

// -------------------------------------------------------------------------
// Many x86 and ARM machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
// We define a translation layer for both x86 and ARM for ease of use and to
// capture most of the available performance gains.

// This implementation requires 64-bit CRC instructions (part of SSE 4.2) and
// PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the
// __x86_64__ condition is necessary.
#if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__)

#include <x86intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && \
    defined(_M_AMD64)

// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
#include <intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
    defined(__ARM_FEATURE_CRC32) && defined(ABSL_INTERNAL_HAVE_ARM_NEON) && \
    defined(__ARM_FEATURE_CRYPTO)

#include <arm_acle.h>
#include <arm_neon.h>
#define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD

#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
#else
// Note: Do not use __m128i_u, it is not portable.
// Use V128_LoadU() to perform an unaligned load from __m128i*.
using V128 = __m128i;
#endif

// Starting with the initial value in |crc|, accumulates a CRC32 value for
// unsigned integers of different sizes.
uint32_t CRC32_u8(uint32_t crc, uint8_t v);

uint32_t CRC32_u16(uint32_t crc, uint16_t v);

uint32_t CRC32_u32(uint32_t crc, uint32_t v);

uint32_t CRC32_u64(uint32_t crc, uint64_t v);

// Loads 128 bits of integer data. |src| must be 16-byte aligned.
V128 V128_Load(const V128* src);

// Loads 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src);

// Stores 128 bits of integer data. |dst| must be 16-byte aligned.
void V128_Store(V128* dst, V128 data);

// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and |r|.
V128 V128_PMulLow(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
V128 V128_PMul01(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
V128 V128_PMul10(const V128 l, const V128 r);

// Produces the bitwise XOR of |l| and |r|.
V128 V128_Xor(const V128 l, const V128 r);

// Produces the bitwise AND of |l| and |r|.
V128 V128_And(const V128 l, const V128 r);

// Sets the lower half of a 128 bit register to the given 64-bit value and
// zeroes the upper half.
// dst[63:0] := |r|
// dst[127:64] := |0|
V128 V128_From64WithZeroFill(const uint64_t r);

// Shifts |l| right by |imm| bytes while shifting in zeros.
template <int imm>
V128 V128_ShiftRight(const V128 l);

// Extracts a 32-bit integer from |l|, selected with |imm|.
template <int imm>
int V128_Extract32(const V128 l);

// Extracts a 64-bit integer from |l|, selected with |imm|.
template <int imm>
uint64_t V128_Extract64(const V128 l);

// Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l);

// Add packed 64-bit integers in |l| and |r|.
V128 V128_Add64(const V128 l, const V128 r);

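// Illustrative sketch only (not part of the Abseil API): a hypothetical helper
// showing how the scalar primitives above can be chained to accumulate a
// CRC32C over a sequence of 64-bit words, one hardware instruction per word.
// The name and signature are ours; real callers go through Extend().
inline uint32_t CRC32AccumulateWordsExample(uint32_t crc, const uint64_t* data,
                                            int num_words) {
  for (int i = 0; i < num_words; ++i) {
    crc = CRC32_u64(crc, data[i]);  // One CRC32 instruction per 8 bytes.
  }
  return crc;
}
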
#endif

#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
  return _mm_crc32_u8(crc, v);
}

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return _mm_crc32_u16(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return _mm_crc32_u32(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}

inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }

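// Note on the selectors below: in _mm_clmulepi64_si128(l, r, imm), bit 0 of
// |imm| picks the 64-bit half of |l| (0 = low, 1 = high) and bit 4 picks the
// half of |r|, which is how 0x11, 0x00, 0x01 and 0x10 map to the four
// V128_PMul* variants.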
inline V128 V128_PMulHi(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x11);
}

inline V128 V128_PMulLow(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x00);
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x01);
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x10);
}

inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }

inline V128 V128_From64WithZeroFill(const uint64_t r) {
  return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return _mm_srli_si128(l, imm);
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return _mm_extract_epi32(l, imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
}

inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }

inline V128 V128_Add64(const V128 l, const V128 r) {
  return _mm_add_epi64(l, r);
}

#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return __crc32ch(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return __crc32cw(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return __crc32cd(crc, v);
}

inline V128 V128_Load(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline void V128_Store(V128* dst, V128 data) {
  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
}

// We use inline assembly because clang does not generate the pmull2
// instruction, and performance drops by 15-20% without it.
// TODO(b/193678732): Investigate why there is a slight performance hit when
// using intrinsics instead of inline assembly.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}
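
// For reference, a plausible intrinsics-based equivalent of V128_PMulHi (not
// used; see the TODO above) would be:
//   return reinterpret_cast<V128>(vmull_high_p64(
//       vreinterpretq_p64_u64(l), vreinterpretq_p64_u64(r)));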

// TODO(b/193678732): Investigate why the compiler decides to move the constant
// loop multiplicands from GPR to Neon registers every loop iteration.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }

inline V128 V128_From64WithZeroFill(const uint64_t r) {
  constexpr uint64x2_t kZero = {0, 0};
  return vsetq_lane_u64(r, kZero, 0);
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return vreinterpretq_u64_s8(
      vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
  return vgetq_lane_u64(l, imm);
}

inline int64_t V128_Low64(const V128 l) {
  return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}

inline V128 V128_Add64(const V128 l, const V128 r) { return vaddq_u64(l, r); }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_