// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_

#include <cstdint>

#include "absl/base/config.h"

// -------------------------------------------------------------------------
// Many x86 and ARM machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
// We define a translation layer over both the x86 and ARM intrinsics for ease
// of use and to capture most of the available performance gains.

// This implementation requires 64-bit CRC instructions (part of SSE 4.2) and
// PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the
// __x86_64__ condition is necessary.
#if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__)

#include <x86intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__)

// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
#include <intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
    defined(__ARM_FEATURE_CRC32) && defined(ABSL_INTERNAL_HAVE_ARM_NEON) &&  \
    defined(__ARM_FEATURE_CRYPTO)

#include <arm_acle.h>
#include <arm_neon.h>
#define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD

#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
#else
// Note: Do not use __m128i_u, it is not portable.
// Use V128_LoadU() to perform an unaligned load from __m128i*.
using V128 = __m128i;
#endif

// Starting with the initial value in |crc|, accumulates a CRC32 value for
// unsigned integers of different sizes.
uint32_t CRC32_u8(uint32_t crc, uint8_t v);

uint32_t CRC32_u16(uint32_t crc, uint16_t v);

uint32_t CRC32_u32(uint32_t crc, uint32_t v);

uint32_t CRC32_u64(uint32_t crc, uint64_t v);
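
// Example (illustrative only, not part of this header): accumulating a CRC
// over a byte buffer with the wrappers above, where `data`, `size`, and
// `kInitialCrc` are hypothetical caller-supplied values.
//
//   uint32_t crc = kInitialCrc;
//   for (size_t i = 0; i < size; ++i) {
//     crc = CRC32_u8(crc, data[i]);
//   }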

// Loads 128 bits of integer data. |src| must be 16-byte aligned.
V128 V128_Load(const V128* src);

// Loads 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src);

// Stores 128 bits of integer data. |dst| must be 16-byte aligned.
void V128_Store(V128* dst, V128 data);

// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and |r|.
V128 V128_PMulLow(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
V128 V128_PMul01(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
V128 V128_PMul10(const V128 l, const V128 r);
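
// For example (illustrative), writing |l| as 64-bit halves l1:l0 and |r| as
// r1:r0, the four variants above compute the carry-less products:
//   V128_PMulLow(l, r) == clmul(l0, r0)
//   V128_PMulHi(l, r)  == clmul(l1, r1)
//   V128_PMul01(l, r)  == clmul(l1, r0)
//   V128_PMul10(l, r)  == clmul(l0, r1)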

// Produces the bitwise XOR of |l| and |r|.
V128 V128_Xor(const V128 l, const V128 r);

// Produces the bitwise AND of |l| and |r|.
V128 V128_And(const V128 l, const V128 r);

// Combines two 64-bit integers into one 128-bit vector. The order is reversed:
// dst[63:0] := |r|
// dst[127:64] := |l|
V128 V128_From2x64(const uint64_t l, const uint64_t r);
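
// For example (illustrative): given `V128 v = V128_From2x64(l, r);`,
// V128_Extract64<0>(v) == r and V128_Extract64<1>(v) == l (V128_Extract64 is
// declared below).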

// Shifts |l| right by |imm| bytes while shifting in zeros.
template <int imm>
V128 V128_ShiftRight(const V128 l);
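
// For example (illustrative): V128_ShiftRight<8>(v) moves the upper 64 bits of
// a vector |v| into the lower 64 bits and zero-fills the upper half.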

// Extracts a 32-bit integer from |l|, selected with |imm|.
template <int imm>
int V128_Extract32(const V128 l);

// Extracts a 64-bit integer from |l|, selected with |imm|.
template <int imm>
uint64_t V128_Extract64(const V128 l);

// Extracts the low 64 bits from |l|.
int64_t V128_Low64(const V128 l);

// Left-shifts packed 64-bit integers in |l| by |r|.
V128 V128_ShiftLeft64(const V128 l, const V128 r);
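
// Example (illustrative sketch only, not an API defined in this header): a
// typical CLMUL-based folding step built from the helpers above, assuming |k|
// holds precomputed folding constants in its two 64-bit halves.
//
//   inline V128 FoldStep(V128 x, V128 k, V128 data) {
//     return V128_Xor(V128_Xor(V128_PMulLow(x, k), V128_PMulHi(x, k)), data);
//   }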

#endif

#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
  return _mm_crc32_u8(crc, v);
}

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return _mm_crc32_u16(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return _mm_crc32_u32(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}

inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }

inline V128 V128_PMulHi(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x11);
}

inline V128 V128_PMulLow(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x00);
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x01);
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x10);
}

inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }

inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
  return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return _mm_srli_si128(l, imm);
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return _mm_extract_epi32(l, imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
}

inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }

inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
  return _mm_sll_epi64(l, r);
}

#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return __crc32ch(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return __crc32cw(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return __crc32cd(crc, v);
}

inline V128 V128_Load(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline void V128_Store(V128* dst, V128 data) {
  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
}

// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
// TODO(b/193678732): Investigate why there is a slight performance hit when
// using intrinsics instead of inline assembly.
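// For reference (illustrative only), the intrinsics-based form alluded to
// above would look like:
//   return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(l),
//                                                vreinterpretq_p64_u64(r)));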
inline V128 V128_PMulHi(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}

// TODO(b/193678732): Investigate why the compiler decides to move the constant
// loop multiplicands from GPR to Neon registers every loop iteration.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }

inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
  return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return vreinterpretq_u64_s8(
      vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
  return vgetq_lane_u64(l, imm);
}

inline int64_t V128_Low64(const V128 l) {
  return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}

inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
  return vshlq_u64(l, vreinterpretq_s64_u64(r));
}

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_