1 //===-- x86 implementation of memory function building blocks -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file provides x86 specific building blocks to compose memory functions.
10 //
11 //===----------------------------------------------------------------------===//
12 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
13 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
14
15 #include "src/__support/macros/attributes.h" // LIBC_INLINE
16 #include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
17 #include "src/__support/macros/properties/architectures.h"
18
19 #if defined(LIBC_TARGET_ARCH_IS_X86)
20
21 #include "src/__support/common.h"
22 #include "src/string/memory_utils/op_builtin.h"
23 #include "src/string/memory_utils/op_generic.h"
24
25 #if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) || \
26 defined(__SSE2__)
27 #include <immintrin.h>
28 #endif
29
30 // Define fake functions to prevent the compiler from failing on undefined
31 // functions in case the CPU extension is not present.
32 #if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
33 #undef _mm512_cmpneq_epi8_mask
34 #define _mm512_cmpneq_epi8_mask(A, B) 0
35 #endif
36 #if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
37 #undef _mm256_movemask_epi8
38 #define _mm256_movemask_epi8(A) 0
39 #endif
40 #if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
41 #undef _mm_movemask_epi8
42 #define _mm_movemask_epi8(A) 0
43 #endif
44
45 namespace LIBC_NAMESPACE_DECL {
46 namespace x86 {
47
48 // A set of constants to check compile time features.
49 LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
50 LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
51 LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
52 LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
53 LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
54 LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
55
56 ///////////////////////////////////////////////////////////////////////////////
57 // Memcpy repmovsb implementation
58 struct Memcpy {
repmovsbMemcpy59 LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
60 asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
61 }
62 };
63
64 } // namespace x86
65 } // namespace LIBC_NAMESPACE_DECL
66
67 namespace LIBC_NAMESPACE_DECL {
68 namespace generic {
69
70 // Not equals: returns non-zero iff values at head or tail differ.
71 // This function typically loads more data than necessary when the two buffer
72 // differs.
73 template <typename T>
branchless_head_tail_neq(CPtr p1,CPtr p2,size_t count)74 LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
75 static_assert(cpp::is_integral_v<T>);
76 return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
77 }
78
79 ///////////////////////////////////////////////////////////////////////////////
80 // Specializations for uint16_t
81 template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
82 template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
83 return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
84 }
85 template <>
86 LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
87 return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
88 }
89 template <>
90 LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
91 return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
92 static_cast<int32_t>(load_be<uint16_t>(p2, offset));
93 }
94 template <>
95 LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);
96
97 ///////////////////////////////////////////////////////////////////////////////
98 // Specializations for uint32_t
99 template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
100 template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
101 return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
102 }
103 template <>
104 LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
105 return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
106 }
107 template <>
108 LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
109 const auto a = load_be<uint32_t>(p1, offset);
110 const auto b = load_be<uint32_t>(p2, offset);
111 return cmp_uint32_t(a, b);
112 }
113 template <>
114 LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);
115
116 ///////////////////////////////////////////////////////////////////////////////
117 // Specializations for uint64_t
118 template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
119 template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
120 return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
121 }
122 template <>
123 LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
124 return !eq<uint64_t>(p1, p2, offset);
125 }
126 template <>
127 LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
128 template <>
129 LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
130 size_t offset) {
131 const auto a = load_be<uint64_t>(p1, offset);
132 const auto b = load_be<uint64_t>(p2, offset);
133 return cmp_neq_uint64_t(a, b);
134 }
135
136 // SIMD types are defined with attributes. e.g., '__m128i' is defined as
137 // long long __attribute__((__vector_size__(16), __aligned__(16)))
138 // When we use these SIMD types in template specialization GCC complains:
139 // "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
140 // Therefore, we disable this warning in this file.
141 #pragma GCC diagnostic push
142 #pragma GCC diagnostic ignored "-Wignored-attributes"
143
144 ///////////////////////////////////////////////////////////////////////////////
145 // Specializations for __m128i
146 #if defined(__SSE4_1__)
147 template <> struct is_vector<__m128i> : cpp::true_type {};
148 template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
149 LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
150 const auto a = load<__m128i>(p1, offset);
151 const auto b = load<__m128i>(p2, offset);
152 return _mm_xor_si128(a, b);
153 }
154 LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
155 return _mm_max_epu8(a, b);
156 }
157 LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
158 return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
159 8, 9, 10, 11, 12, 13, 14, 15));
160 }
161 LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
162 return static_cast<uint16_t>(
163 _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
164 }
165 LIBC_INLINE bool is_zero(__m128i value) {
166 return _mm_testz_si128(value, value) == 1;
167 }
168 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
169 return is_zero(load_and_xor_m128i(p1, p2, offset));
170 }
171 template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
172 return !is_zero(load_and_xor_m128i(p1, p2, offset));
173 }
174 template <>
175 LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
176 size_t count) {
177 const __m128i head = load_and_xor_m128i(p1, p2, 0);
178 const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
179 return !is_zero(_mm_or_si128(head, tail));
180 }
181 template <>
182 LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
183 const auto a = load<__m128i>(p1, offset);
184 const auto b = load<__m128i>(p2, offset);
185 const auto vmax = bytewise_max(a, b);
186 const auto le = big_endian_cmp_mask(vmax, b);
187 const auto ge = big_endian_cmp_mask(vmax, a);
188 static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
189 return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
190 }
191 #endif // __SSE4_1__
192
193 ///////////////////////////////////////////////////////////////////////////////
194 // Specializations for __m256i
195 #if defined(__AVX__)
196 template <> struct is_vector<__m256i> : cpp::true_type {};
197 template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
198 LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
199 return _mm256_castps_si256(
200 _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
201 }
202 LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
203 return _mm256_castps_si256(
204 _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
205 }
206 LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
207 const auto a = load<__m256i>(p1, offset);
208 const auto b = load<__m256i>(p2, offset);
209 return xor_m256i(a, b);
210 }
211 LIBC_INLINE bool is_zero(__m256i value) {
212 return _mm256_testz_si256(value, value) == 1;
213 }
214 template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
215 return is_zero(load_and_xor_m256i(p1, p2, offset));
216 }
217 template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
218 return !is_zero(load_and_xor_m256i(p1, p2, offset));
219 }
220 template <>
221 LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
222 size_t count) {
223 const __m256i head = load_and_xor_m256i(p1, p2, 0);
224 const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
225 return !is_zero(or_m256i(head, tail));
226 }
227 #endif // __AVX__
228
229 #if defined(__AVX2__)
230 LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
231 return _mm256_max_epu8(a, b);
232 }
233 LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
234 // Bytewise comparison of 'max' and 'value'.
235 const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
236 // Because x86 is little endian, bytes in the vector must be reversed before
237 // using movemask.
238 #if defined(__AVX512VBMI__) && defined(__AVX512VL__)
239 // When AVX512BMI is available we can completely reverse the vector through
240 // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
241 const __m256i big_endian_byte_mask =
242 _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
243 8, 9, 10, 11, 12, 13, 14, 15, //
244 16, 17, 18, 19, 20, 21, 22, 23, //
245 24, 25, 26, 27, 28, 29, 30, 31),
246 little_endian_byte_mask);
247 // And turn the byte vector mask into an 'uint32_t' for direct scalar
248 // comparison.
249 return _mm256_movemask_epi8(big_endian_byte_mask);
250 #else
251 // We can't byte-reverse '__m256i' in a single instruction with AVX2.
252 // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane
253 // leading to:
254 // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
255 // 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
256 // So we first shuffle each 16-byte lane leading to half-reversed vector mask.
257 const __m256i half_reversed = _mm256_shuffle_epi8(
258 little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
259 8, 9, 10, 11, 12, 13, 14, 15, //
260 0, 1, 2, 3, 4, 5, 6, 7, //
261 8, 9, 10, 11, 12, 13, 14, 15));
262 // Then we turn the vector into an uint32_t.
263 const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
264 // And swap the lower and upper parts. This is optimized into a single `rorx`
265 // instruction.
266 return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
267 #endif
268 }
269 template <>
270 LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
271 const auto a = load<__m256i>(p1, offset);
272 const auto b = load<__m256i>(p2, offset);
273 const auto vmax = bytewise_max(a, b);
274 const auto le = big_endian_cmp_mask(vmax, b);
275 const auto ge = big_endian_cmp_mask(vmax, a);
276 static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
277 return cmp_neq_uint64_t(ge, le);
278 }
279 #endif // __AVX2__
280
281 ///////////////////////////////////////////////////////////////////////////////
282 // Specializations for __m512i
283 #if defined(__AVX512BW__)
284 template <> struct is_vector<__m512i> : cpp::true_type {};
285 template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
286 LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
287 return _mm512_max_epu8(a, b);
288 }
289 LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
290 // The AVX512BMI version is disabled due to bad codegen.
291 // https://github.com/llvm/llvm-project/issues/77459
292 // https://github.com/llvm/llvm-project/pull/77081
293 // TODO: Re-enable when clang version meets the fixed version.
294 #if false && defined(__AVX512VBMI__)
295 // When AVX512BMI is available we can completely reverse the vector through
296 // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
297 const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
298 8, 9, 10, 11, 12, 13, 14, 15, //
299 16, 17, 18, 19, 20, 21, 22, 23, //
300 24, 25, 26, 27, 28, 29, 30, 31, //
301 32, 33, 34, 35, 36, 37, 38, 39, //
302 40, 41, 42, 43, 44, 45, 46, 47, //
303 48, 49, 50, 51, 52, 53, 54, 55, //
304 56, 57, 58, 59, 60, 61, 62, 63);
305 // Then we compute the mask for equal bytes.
306 return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
307 _mm512_permutexvar_epi8(indices, value));
308 #else
309 // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
310 // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
311 // So we only reverse groups of 8 bytes, these groups are necessarily within a
312 // 16-byte lane.
313 // zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
314 // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
315 const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
316 0, 1, 2, 3, 4, 5, 6, 7, //
317 8, 9, 10, 11, 12, 13, 14, 15, //
318 0, 1, 2, 3, 4, 5, 6, 7, //
319 8, 9, 10, 11, 12, 13, 14, 15, //
320 0, 1, 2, 3, 4, 5, 6, 7, //
321 8, 9, 10, 11, 12, 13, 14, 15, //
322 0, 1, 2, 3, 4, 5, 6, 7);
323 // Then we compute the mask for equal bytes. In this mask the bits of each
324 // byte are already reversed but the byte themselves should be reversed, this
325 // is done by using a bswap instruction.
326 return __builtin_bswap64(
327 _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
328 _mm512_shuffle_epi8(value, indices)));
329
330 #endif
331 }
332 template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
333 const auto a = load<__m512i>(p1, offset);
334 const auto b = load<__m512i>(p2, offset);
335 return _mm512_cmpneq_epi8_mask(a, b) == 0;
336 }
337 template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
338 const auto a = load<__m512i>(p1, offset);
339 const auto b = load<__m512i>(p2, offset);
340 return _mm512_cmpneq_epi8_mask(a, b) != 0;
341 }
342 LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
343 const auto a = load<__m512i>(p1, offset);
344 const auto b = load<__m512i>(p2, offset);
345 return _mm512_xor_epi64(a, b);
346 }
347 LIBC_INLINE bool is_zero(__m512i value) {
348 return _mm512_test_epi32_mask(value, value) == 0;
349 }
350 template <>
351 LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
352 size_t count) {
353 const __m512i head = load_and_xor_m512i(p1, p2, 0);
354 const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
355 return !is_zero(_mm512_or_epi64(head, tail));
356 }
357 template <>
358 LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
359 const auto a = load<__m512i>(p1, offset);
360 const auto b = load<__m512i>(p2, offset);
361 const auto vmax = bytewise_max(a, b);
362 const auto le = big_endian_cmp_mask(vmax, b);
363 const auto ge = big_endian_cmp_mask(vmax, a);
364 static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
365 return cmp_neq_uint64_t(ge, le);
366 }
367 #endif // __AVX512BW__
368
369 #pragma GCC diagnostic pop
370
371 } // namespace generic
372 } // namespace LIBC_NAMESPACE_DECL
373
374 #endif // LIBC_TARGET_ARCH_IS_X86
375
376 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
377