// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
// registers because XMM registers do not support the CRC instruction (yet).
// While copying, compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true.
//
// The CRC instruction has a 3-cycle latency, and we are stressing the ALU
// ports here (unlike a traditional memcopy, which has almost no ALU use), so
// we will need to copy in such a way that the CRC unit is used efficiently.
// We have two regimes in this code:
//  1. For operations of size < kCrcSmallSize, do the CRC then the memcpy.
//  2. For operations of size >= kCrcSmallSize:
//     a) Compute an initial CRC + copy on a small amount of data to align the
//        destination pointer on a 16-byte boundary.
//     b) Split the data into 3 main regions and a tail (smaller than 48
//        bytes).
//     c) Do the copy and CRC of the 3 main regions, interleaving (start with
//        full cache line copies for each region, then move to single 16-byte
//        pieces per region).
//     d) Combine the CRCs with CRC32C::Concat.
//     e) Copy the tail and extend the CRC with the CRC of the tail.
// This method is not ideal for op sizes between ~1k and ~8k because
// CRC32C::Concat takes a significant amount of time. A medium-sized approach
// could be added using 3 CRCs over fixed-size blocks where the
// zero-extensions required for CRC32C::Concat can be precomputed.
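//
// Conceptually, the large-copy regime computes the same result as the
// following sketch built from the public helpers in absl/crc/crc32c.h
// (illustrative only; `p`, `r`, and `tail` are placeholder names, and the
// real code below interleaves the copies and CRCs and works on raw,
// non-finalized CRC values):
//
//   // Split the aligned data at `p` into 3 regions of `r` bytes plus a tail.
//   crc32c_t c0 = ExtendCrc32c(initial_crc, absl::string_view(p, r));
//   crc32c_t c1 = ComputeCrc32c(absl::string_view(p + r, r));
//   crc32c_t c2 = ComputeCrc32c(absl::string_view(p + 2 * r, r + tail));
//   crc32c_t crc = ConcatCrc32c(ConcatCrc32c(c0, c1, r), c2, r + tail);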

#ifdef __SSE4_2__
#include <immintrin.h>
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"

#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

namespace {

inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
                             crc32c_t crc) {
  // Small copy: just go 1 byte at a time: being nice to the branch predictor
  // is more important here than anything else
  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
  for (std::size_t i = 0; i < length; i++) {
    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
    crc_uint32 = CRC32_u8(crc_uint32, data);
    *reinterpret_cast<uint8_t*>(dst) = data;
    ++src;
    ++dst;
  }
  return crc32c_t{crc_uint32};
}

constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);

// Common function for copying the tails of multiple large regions.
// Disable ubsan for benign unaligned access. See b/254108538.
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED inline void LargeTailCopy(
    crc32c_t* crcs, char** dst, const char** src, size_t region_size,
    size_t copy_rounds) {
  std::array<V128, vec_regions> data;
  std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;

  while (copy_rounds > 0) {
    for (size_t i = 0; i < vec_regions; i++) {
      size_t region = i;

      auto* vsrc = reinterpret_cast<const V128*>(*src + region_size * region);
      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);

      // Load the blocks, unaligned
      data[i] = V128_LoadU(vsrc);

      // Store the blocks, aligned
      V128_Store(vdst, data[i]);

      // Compute the running CRC
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          CRC32_u64(static_cast<uint32_t>(crcs[region]),
                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          CRC32_u64(static_cast<uint32_t>(crcs[region]),
                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
    }

    for (size_t i = 0; i < int_regions; i++) {
      size_t region = vec_regions + i;

      auto* usrc =
          reinterpret_cast<const uint64_t*>(*src + region_size * region);
      auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);

      for (size_t j = 0; j < kIntLoadsPerVec; j++) {
        size_t data_index = i * kIntLoadsPerVec + j;

        int_data[data_index] = *(usrc + j);
        crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
                                          int_data[data_index])};

        *(udst + j) = int_data[data_index];
      }
    }

    // Increment pointers
    *src += sizeof(V128);
    *dst += sizeof(V128);
    --copy_rounds;
  }
}

}  // namespace

template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
 public:
  AcceleratedCrcMemcpyEngine() = default;
  AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
  AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
      delete;

  crc32c_t Compute(void* __restrict dst, const void* __restrict src,
                   std::size_t length, crc32c_t initial_crc) const override;
};

// Disable ubsan for benign unaligned access. See b/254108538.
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED crc32c_t
AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
    void* __restrict dst, const void* __restrict src, std::size_t length,
    crc32c_t initial_crc) const {
  constexpr std::size_t kRegions = vec_regions + int_regions;
  static_assert(kRegions > 0, "Must specify at least one region.");
  constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
  constexpr std::size_t kBlockSize = sizeof(V128);
  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;

  // Number of blocks per cacheline.
  constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;
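  // (With the typical 64-byte ABSL_CACHELINE_SIZE and 16-byte V128 blocks,
  // this works out to 4 blocks per cache line.)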

  char* dst_bytes = static_cast<char*>(dst);
  const char* src_bytes = static_cast<const char*>(src);

  // Make sure that one prefetch per big block is enough to cover the whole
  // dataset, and we don't prefetch too much.
  static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
                "Cache lines are not divided evenly into blocks, may have "
                "unintended behavior!");

  // Experimentally-determined boundary between a small and large copy.
  // Below this number, spin-up and concatenation of CRCs takes enough time
  // that it kills the throughput gains of using 3 regions and wide vectors.
  constexpr size_t kCrcSmallSize = 256;

  // Experimentally-determined prefetch distance. Main loop copies will
  // prefetch data 2 cache lines ahead.
  constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;

  // Small-size CRC-memcpy: just do CRC + memcpy.
  if (length < kCrcSmallSize) {
    crc32c_t crc =
        ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
    memcpy(dst, src, length);
    return crc;
  }

  // Start work on the CRC: undo the XOR from the previous calculation or set
  // up the initial value of the CRC.
  initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};

  // Do an initial alignment copy, so we can use aligned store instructions to
  // the destination pointer. We align the destination pointer because the
  // penalty for an unaligned load is small compared to the penalty of an
  // unaligned store on modern CPUs.
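  // For example (illustrative): if `dst` ends in 0x9, bytes_from_last_aligned
  // is 9 and the short copy below handles kBlockSize - 9 == 7 bytes, after
  // which dst_bytes is 16-byte aligned.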
  std::size_t bytes_from_last_aligned =
      reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
  if (bytes_from_last_aligned != 0) {
    std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;

    // Do the short-sized copy and CRC.
    initial_crc =
        ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
    src_bytes += bytes_for_alignment;
    dst_bytes += bytes_for_alignment;
    length -= bytes_for_alignment;
  }

  // We are going to do the copy and CRC in kRegions regions to make sure that
  // we can saturate the CRC unit. The CRCs will be combined at the end of the
  // run. Copying will use the SSE registers, and we will extract words from
  // the SSE registers to add to the CRC. Initially, we run the loop one full
  // cache line per region at a time, in order to insert prefetches.

  // Initialize CRCs for kRegions regions.
  crc32c_t crcs[kRegions];
  crcs[0] = initial_crc;
  for (size_t i = 1; i < kRegions; i++) {
    crcs[i] = crc32c_t{kCrcDataXor};
  }
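  // Note: regions other than the first start from kCrcDataXor so that, once
  // the finalizing XOR near the end of this function is applied, each holds
  // the standalone CRC32C of its region, which is the form ConcatCrc32c
  // expects for its second argument.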

  // Find the number of rounds to copy and the region size. Also compute the
  // tail size here.
  size_t copy_rounds = length / kCopyRoundSize;

  // Find the size of each region and the size of the tail.
  const std::size_t region_size = copy_rounds * kBlockSize;
  const std::size_t tail_size = length - (kRegions * region_size);
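  // Worked example (illustrative, assuming kRegions == 3 and kBlockSize ==
  // 16): a post-alignment length of 1024 gives copy_rounds == 1024 / 48 == 21,
  // region_size == 21 * 16 == 336 bytes, and tail_size == 1024 - 3 * 336 == 16
  // bytes.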

  // Holding registers for data in each region.
  std::array<V128, vec_regions> vec_data;
  std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;

  // Main loop.
  while (copy_rounds > kBlocksPerCacheLine) {
    // Prefetch kPrefetchAhead bytes ahead of each pointer.
    for (size_t i = 0; i < kRegions; i++) {
      absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
      // TODO(b/297082454): investigate dropping prefetch on x86.
      absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
#endif
    }

    // Load and store data, computing CRC on the way.
    for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
      // Copy and CRC the data for the CRC regions.
      for (size_t j = 0; j < vec_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + i) % kRegions;

        auto* vsrc =
            reinterpret_cast<const V128*>(src_bytes + region_size * region);
        auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);

        // Load and CRC data.
        vec_data[j] = V128_LoadU(vsrc + i);
        crcs[region] = crc32c_t{static_cast<uint32_t>(
            CRC32_u64(static_cast<uint32_t>(crcs[region]),
                      static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
        crcs[region] = crc32c_t{static_cast<uint32_t>(
            CRC32_u64(static_cast<uint32_t>(crcs[region]),
                      static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};

        // Store the data.
        V128_Store(vdst + i, vec_data[j]);
      }

      // Copy and CRC the data for the integer regions.
      for (size_t j = 0; j < int_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + vec_regions + i) % kRegions;

        auto* usrc =
            reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
        auto* udst =
            reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);

        for (size_t k = 0; k < kIntLoadsPerVec; k++) {
          size_t data_index = j * kIntLoadsPerVec + k;

          // Load and CRC the data.
          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
          crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
                                            int_data[data_index])};

          // Store the data.
          *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
        }
      }
    }

    // Increment pointers
    src_bytes += kBlockSize * kBlocksPerCacheLine;
    dst_bytes += kBlockSize * kBlocksPerCacheLine;
    copy_rounds -= kBlocksPerCacheLine;
  }

  // Copy and CRC the tails of each region.
  LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
                                          region_size, copy_rounds);

  // Move the source and destination pointers to the end of the region
  src_bytes += region_size * (kRegions - 1);
  dst_bytes += region_size * (kRegions - 1);

  // Copy and CRC the tail through the XMM registers.
  std::size_t tail_blocks = tail_size / kBlockSize;
  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
                      tail_blocks);

  // Final tail copy for under 16 bytes.
  crcs[kRegions - 1] =
      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
                   crcs[kRegions - 1]);

  if (kRegions == 1) {
    // If there is only one region, finalize and return its CRC.
    return crc32c_t{static_cast<uint32_t>(crcs[0]) ^ kCrcDataXor};
  }

  // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
  // XOR done before doing block copy + CRCs.
  for (size_t i = 0; i + 1 < kRegions; i++) {
    crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
  }

  // Build a CRC of the first kRegions - 1 regions.
  crc32c_t full_crc = crcs[0];
  for (size_t i = 1; i + 1 < kRegions; i++) {
    full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
  }

  // Finalize and concatenate the final CRC, then return.
  crcs[kRegions - 1] =
      crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
  return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}

CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
#ifdef UNDEFINED_BEHAVIOR_SANITIZER
  // UBSAN does not play nicely with unaligned loads (which we use a lot).
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
    case CpuType::kAmdGenoa:
    case CpuType::kAmdRyzenV3000:
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          /*.temporal=*/new FallbackCrcMemcpyEngine(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          /*.temporal=*/new FallbackCrcMemcpyEngine(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
  }
#else
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store
    // port, which causes data movement from the vector registers to the
    // integer registers (where CRC32C happens) to crowd the same units as
    // vector stores. As a result, using that path exclusively causes
    // bottlenecking on this port. We can avoid this bottleneck by using the
    // integer side of the CPU for most operations rather than the vector side.
    // We keep a vector region to engage some of the prefetching logic in the
    // cache hierarchy which seems to give vector instructions special
    // treatment. These prefetch units see strided access to each region, and
    // do the right thing.
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
    case CpuType::kAmdGenoa:
    case CpuType::kAmdRyzenV3000:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // PCLMULQDQ is slow and we don't have wide enough issue width to take
    // advantage of it. For an unknown architecture, don't risk using CLMULs.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
  }
#endif  // UNDEFINED_BEHAVIOR_SANITIZER
}

// For testing, allow the user to specify which engine they want.
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
                                                          int integer) {
  if (vector == 3 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
  } else if (vector == 1 && integer == 2) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
  } else if (vector == 1 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
  }
  return nullptr;
}

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE