//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif

// Define stub macros for these intrinsics so the compiler does not fail on
// references to otherwise undefined functions when the corresponding CPU
// extension is not present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm_movemask_epi8(A) 0
#endif

namespace LIBC_NAMESPACE::x86 {

// A set of constants to check compile-time features.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
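// These constants expose the target features to compile-time dispatch. A
// minimal sketch (illustrative only, not code from this file) of how a caller
// might use them:
//   if constexpr (x86::K_AVX512_BW)
//     return compare_with_512_bit_vectors(p1, p2, count);
//   else if constexpr (x86::K_AVX2)
//     return compare_with_256_bit_vectors(p1, p2, count);
// The function names above are hypothetical placeholders.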

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
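// Usage sketch (illustrative only): a higher-level memcpy that has already
// decided to use the string instruction would simply call
//   x86::Memcpy::repmovsb(dst, src, count);
// 'rep movsb' copies RCX bytes from [RSI] to [RDI], advancing both pointers
// and decrementing RCX, which is why all three operands are read-write ("+")
// in the asm constraints.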

} // namespace LIBC_NAMESPACE::x86

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
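// Note on the big-endian loads in 'cmp' above: memcmp orders by the first
// differing byte, so the lower-addressed byte must be the most significant in
// the subtraction. Worked example with p1 = "ab" and p2 = "ba" ('a' = 0x61,
// 'b' = 0x62): load_be gives 0x6162 and 0x6261, and 0x6162 - 0x6261 < 0,
// matching memcmp. A little-endian load would yield 0x6261 - 0x6162 > 0, the
// wrong sign.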
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
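// Unlike the uint16_t case, the difference of two uint32_t values does not fit
// in an int32_t, so the subtraction trick cannot be used; the 'cmp_uint32_t'
// helper turns the ordering of the two big-endian values into a
// MemcmpReturnType without overflow.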
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}
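// For uint64_t, 'cmp_is_expensive' is true: the comparison framework is
// expected to call the cheap 'eq'/'neq' first and only fall back to 'cmp_neq'
// once a difference is known, which is why 'cmp' is left declared but
// unimplemented here. 'cmp_neq_uint64_t' maps the ordering of the two
// big-endian values to a negative or positive MemcmpReturnType.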

// SIMD types are defined with attributes, e.g., '__m128i' is defined as
//   long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specializations, GCC complains:
// "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
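// Note: '_mm_set_epi8' lists bytes from most significant to least significant,
// so the control vector above stores 15 in lane 0, 14 in lane 1, ..., 0 in
// lane 15; '_mm_shuffle_epi8' then writes source byte 15 - i to result byte i,
// i.e., a full byte reversal of the 128-bit value.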
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
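// How the 'vmax'/'ge'/'le' trick above works: 'vmax' is the bytewise maximum
// of 'a' and 'b', so 'ge' has a bit set for every byte where a >= b and 'le'
// for every byte where b >= a. The masks agree on equal bytes and differ
// exactly on differing bytes; because 'big_endian_cmp_mask' puts the
// lowest-addressed byte in the most significant bit, the first differing byte
// decides which of 'ge'/'le' is larger, which is exactly memcmp's ordering.
// With 16-bit masks the difference fits in an int32_t, so a plain subtraction
// yields the sign.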
#endif // __SSE4_1__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
}
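// The XOR above goes through the float domain ('_mm256_xor_ps') because plain
// AVX only provides 256-bit floating-point logic operations; the integer
// '_mm256_xor_si256' requires AVX2. '_mm256_testz_si256' is available with
// AVX, so the zero test itself stays in the integer domain.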
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into a 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
  // leading to:
  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane, leading to a half-reversed vector
  // mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15, //
                                               0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into a uint32_t.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper parts. This is optimized into a single `rorx`
  // instruction.
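  // For example, vector byte 0 (the lowest address) ends up in bit 15 of
  // 'half_reversed_scalar' and vector byte 31 in bit 16; rotating by 16 bits
  // moves them to bits 31 and 0 respectively, completing the big-endian order.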
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
}
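// Same 'ge'/'le' scheme as the __m128i version, but the masks are now 32 bits
// wide, so their difference may not fit in an int32_t; 'cmp_neq_uint64_t'
// compares the (zero-extended) masks instead of subtracting them.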
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
  // The AVX512VBMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable when the clang version in use includes the fix.
#if false && defined(__AVX512VBMI__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
  // So we only reverse groups of 8 bytes; these groups necessarily fall within
  // a single 16-byte lane.
  // zmm = | 16 bytes  | 16 bytes  | 16 bytes  | 16 bytes  |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits within
  // each 8-bit group are already reversed, but the groups themselves (the
  // bytes of the mask) still need to be reversed; this is done with a bswap
  // instruction.
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));
#endif
}
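// Composition check for the fallback path above: the shuffle sends vector byte
// 8*g + j (j in 0..7) to position 8*g + (7 - j), so bit 8*g + (7 - j) of the
// cmpeq mask corresponds to source byte 8*g + j; '__builtin_bswap64' then
// swaps the mask's bytes, moving that bit to 8*(7 - g) + (7 - j), i.e.,
// 63 - (8*g + j). The two steps together form a full 64-bit reversal, giving
// the big-endian ordering.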
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
  return static_cast<uint32_t>(xored >> 32) |
         static_cast<uint32_t>(xored & 0xFFFFFFFF);
}
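// 'neq' only has to report zero vs. non-zero and its return type is uint32_t,
// so the 64-bit "bytes differ" mask is folded by OR-ing its two halves: the
// result is non-zero exactly when at least one byte differs.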
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

#pragma GCC diagnostic pop

} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_X86

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H