//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace LIBC_NAMESPACE::aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

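// Zeroes memory one cache line at a time with the 'dc zva' (data cache zero
// by virtual address) instruction. 'dc zva' can only write zeroes, so this
// block is only meaningful for a zero fill (hence "Bzero"); the uint8_t
// parameter exists to match the generic Memset block interface.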
struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
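    // With 32-bit pointers (ILP32), the 'w' operand modifier selects the
    // 32-bit view of the register holding the address.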
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

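  // Zeroes [dst, dst + count) one cache line at a time. 'dc zva' always
  // clears the whole line containing the address, so the caller is expected
  // to provide a cache-line-aligned dst.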
  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4] is zero.
  // BS, bits [3:0] is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and block
  // size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}
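
// Illustrative dispatch (a sketch, not part of the upstream header): a memset
// implementation might combine these blocks roughly as
//   if (value == 0 && count >= BzeroCacheLine::SIZE && hasZva())
//     return BzeroCacheLine::loop_and_tail(dst, 0, count);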

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
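// Reports only equality: zero means the ranges match, nonzero means they
// differ. Specialized with NEON for Size == 16 and Size == 32, and for any
// larger multiple of BlockSize by comparing one 32-byte chunk at a time.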
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
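      // 'an' is nonzero somewhere iff the blocks differ. Narrow the 2x64b
      // lanes to 2x32b with saturation, then take the horizontal max to get
      // a single 32-bit value that is nonzero on mismatch.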
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o).  At least one byte is nonzero if there is
      // a difference between the two buffers.  We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
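      // Larger sizes delegate to the 32-byte specialization one chunk at a
      // time, returning early on the first mismatching chunk.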
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

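  // Compares the final SIZE bytes, possibly re-checking bytes that a
  // preceding block or loop pass already covered.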
  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

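  // Compares the first and last SIZE bytes of each buffer in a single pass,
  // covering count in [SIZE, 2 * SIZE]; the head and tail loads overlap
  // whenever count < 2 * SIZE.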
  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)).  Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

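  // Compares SIZE-byte blocks until fewer than SIZE bytes remain, then
  // finishes with a single overlapping tail comparison.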
  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};

} // namespace LIBC_NAMESPACE::aarch64

namespace LIBC_NAMESPACE::generic {

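// The specializations below override the element-wise comparison primitives
// from op_generic.h. Big-endian loads (load_be) make lexicographic byte order
// coincide with plain integer order, so memcmp's result follows from a single
// integer comparison per element.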
///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
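  // The difference of two uint16_t values always fits in int32_t, so the
  // widened subtraction cannot overflow.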
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

192 
193 ///////////////////////////////////////////////////////////////////////////////
194 // Specializations for uint32_t
195 template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
196 template <>
197 LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
198   return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
199 }
200 template <>
201 LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
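  // Unlike the uint16_t case, a 32-bit subtraction could overflow int32_t,
  // so compare explicitly instead.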
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}


///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
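  // Checks the 16 bytes as two scalar 64-bit words rather than with NEON;
  // presumably this avoids moving a vector compare result back to a
  // general-purpose register just to branch on it.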
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H