//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif

// Define fake functions to prevent the compiler from failing on undefined
// functions in case the CPU extension is not present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm_movemask_epi8(A) 0
#endif

namespace LIBC_NAMESPACE::x86 {

// A set of constants to check compile time features.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
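// Illustrative usage (a sketch, not part of this header's interface): the
// inline assembly forwards 'dst', 'src' and 'count' to 'rep movsb', which
// copies 'count' bytes from '*src' to '*dst' one byte at a time. The "+D",
// "+S" and "+c" constraints pin the operands to RDI, RSI and RCX as the
// instruction requires, and the "memory" clobber keeps the compiler from
// reordering or caching memory accesses across the copy. A caller that has
// already set up 'dst', 'src' and 'count' would simply do:
//   LIBC_NAMESPACE::x86::Memcpy::repmovsb(dst, src, count);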

} // namespace LIBC_NAMESPACE::x86

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
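// Worked example (hypothetical bytes, chosen only for illustration): 'load_be'
// puts the byte at the lowest address in the most significant position, so the
// integer difference above has the sign memcmp requires.
//   p1 -> {0x01, 0xFF}   load_be<uint16_t>(p1, 0) == 0x01FF
//   p2 -> {0x02, 0x00}   load_be<uint16_t>(p2, 0) == 0x0200
//   0x01FF - 0x0200 < 0, matching memcmp since the first byte 0x01 < 0x02.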
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}

// SIMD types are defined with attributes. e.g., '__m128i' is defined as
// long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specialization GCC complains:
// "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
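// Why the 'ge'/'le' masks above give memcmp semantics (a sketch): a byte of
// 'vmax' equals the corresponding byte of 'a' exactly where a >= b, and equals
// the byte of 'b' where a <= b; positions where the bytes are equal set both
// masks and cancel out. Since 'big_endian_cmp_mask' reverses the bytes before
// 'movemask', the lowest address maps to the most significant bit, so the
// first differing byte decides which of 'ge' or 'le' is larger and 'ge - le'
// carries the sign memcmp requires.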
#endif // __SSE4_1__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
}
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into a 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane, leading
  // to:
  //   ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //             31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane, which yields a half-reversed vector
  // mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15,  //
                                               0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into a uint32_t.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper parts. This is optimized into a single `rorx`
  // instruction.
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
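// Bit-tracking sketch for the fallback path above (byte indices refer to the
// original vector): after the in-lane shuffle, movemask yields a scalar where
// bits 15..0 hold bytes 0..15 and bits 31..16 hold bytes 16..31, i.e. each
// 16-bit half is already big-endian but the halves are swapped. Rotating the
// 32-bit value by 16 swaps the halves, giving bit 31 = byte 0 (lowest address)
// down to bit 0 = byte 31, the order callers of 'big_endian_cmp_mask' expect.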
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
  // The AVX512VBMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable when clang version meets the fixed version.
#if false && defined(__AVX512VBMI__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane, so we
  // only reverse groups of 8 bytes; these groups are necessarily within a
  // 16-byte lane.
  // zmm = |   16 bytes  |   16 bytes  |   16 bytes  |   16 bytes  |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits of each
  // byte are already reversed, but the bytes themselves still need to be
  // reversed; this is done with a bswap instruction.
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));
#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
  return static_cast<uint32_t>(xored >> 32) |
         static_cast<uint32_t>(xored & 0xFFFFFFFF);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

#pragma GCC diagnostic pop

} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_X86

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H