1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
#include <stddef.h>
#include <stdint.h>
#include <string.h>
18
19 // SSE4.2 accelerated CRC32c.
20
// Decide at compile time whether the SSE4.2 crc32c code path can be built.
// USE_SSE_CRC32C ends up defined (to 1) only when all of the following hold:
//   * the translation unit is compiled with SSE4.2 enabled (__SSE4_2__),
//   * the target is x86-64, and
//   * the compiler is GCC >= 4.8, or clang providing __builtin_cpu_supports
//     (the builtin used by the runtime check in CanAccelerate()).
// Start from a clean slate in case the macro was defined externally.
#undef USE_SSE_CRC32C
#ifdef __SSE4_2__
#if defined(__x86_64__) && defined(__GNUC__) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define USE_SSE_CRC32C 1
#elif defined(__x86_64__) && defined(__clang__)
// Reached only when the GCC version test above failed: probe for the builtin
// directly instead of relying on a version number.
#if __has_builtin(__builtin_cpu_supports)
#define USE_SSE_CRC32C 1
#endif
#endif
#endif /* __SSE4_2__ */

// Apple clang up to and including major version 8 has a bug, so the
// accelerated path is disabled there:
// https://llvm.org/bugs/show_bug.cgi?id=25510
// NOTE(review): __clang_major__ is not guarded by defined(__clang__). An
// undefined macro evaluates to 0 inside #if, so this test also disables the
// accelerated path for any non-clang compiler on Apple platforms -- confirm
// that this is intended.
#if defined(__APPLE__) && (__clang_major__ <= 8)
#undef USE_SSE_CRC32C
#endif
39
40 #ifdef USE_SSE_CRC32C
41 #include <nmmintrin.h>
42 #endif
43
44 namespace tensorflow {
45 namespace crc32c {
46
47 #ifndef USE_SSE_CRC32C
48
// Fallback used when the accelerated implementation is not compiled in:
// acceleration is never available.
bool CanAccelerate() {
  return false;
}
// Placeholder used when the accelerated implementation is not compiled in.
// It is not meant to be called (CanAccelerate() returns false in this
// configuration); it ignores all of its arguments and returns 0.
uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
  // The arguments are intentionally unused.
  static_cast<void>(crc);
  static_cast<void>(buf);
  static_cast<void>(size);
  return 0;
}
54
55 #else
56
57 // SSE4.2 optimized crc32c computation.
// Reports whether the CPU this process is running on supports SSE4.2, as
// determined at run time by the compiler's __builtin_cpu_supports builtin.
bool CanAccelerate() {
  const bool has_sse42 = __builtin_cpu_supports("sse4.2") != 0;
  return has_sse42;
}
59
// Extends the CRC32C value `crc` with the `size` bytes starting at `buf`,
// using the SSE4.2 crc32 instructions, and returns the updated value.
//
// The running state is kept bit-inverted relative to the value callers see,
// which is why `crc` is XORed with 0xffffffff on entry and the result is
// XORed again on exit.
//
// NOTE(review): this executes SSE4.2 instructions unconditionally; presumably
// callers are expected to consult CanAccelerate() first -- confirm.
uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  uint32_t l = crc ^ 0xffffffffu;

  // Number of leading bytes to consume so that p becomes 8-byte aligned
  // (0 when p is already aligned). Working with a byte count instead of a
  // rounded-up pointer avoids ever forming a pointer past the end of the
  // buffer.
  const size_t misalign = (8 - (reinterpret_cast<uintptr_t>(p) & 7)) & 7;
  if (misalign <= size) {
    // Process bytes one at a time until p is 8-byte aligned. When the buffer
    // ends before the alignment boundary this step is skipped and the final
    // byte loop below handles everything.
    for (const uint8_t *aligned = p + misalign; p != aligned; ++p) {
      l = _mm_crc32_u8(l, *p);
    }
  }

  // Process bytes 16 at a time, as two 64-bit steps. The words are loaded
  // with memcpy rather than by dereferencing a casted pointer: reading a byte
  // buffer through a uint64_t lvalue violates the strict-aliasing rules.
  uint64_t l64 = l;
  while ((e - p) >= 16) {
    uint64_t lo;
    uint64_t hi;
    memcpy(&lo, p, sizeof(lo));
    memcpy(&hi, p + 8, sizeof(hi));
    l64 = _mm_crc32_u64(l64, lo);
    l64 = _mm_crc32_u64(l64, hi);
    p += 16;
  }

  // Process remaining bytes one at a time. The CRC state occupies only the
  // low 32 bits of l64, so the narrowing conversion loses nothing.
  l = static_cast<uint32_t>(l64);
  for (; p < e; ++p) {
    l = _mm_crc32_u8(l, *p);
  }

  return l ^ 0xffffffffu;
}
95
96 #endif
97
98 } // namespace crc32c
99 } // namespace tensorflow
100