1 /* auto-generated on 2023-06-05 08:58:28 -0400. Do not edit! */
2 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf.cpp
3 /* begin file src/simdutf.cpp */
4 #include "simdutf.h"
5 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=implementation.cpp
6 /* begin file src/implementation.cpp */
7 #include <initializer_list>
8 #include <climits>
9
10 // Useful for debugging purposes
11 namespace simdutf {
12 namespace {
13
/**
 * Renders the bits of an integral value as text, most significant bit first.
 *
 * The result always holds sizeof(T) * CHAR_BIT characters, each '0' or '1'.
 *
 * Bits are read by shifting the value itself instead of walking a mask of
 * type T: for a signed T such a mask starts out negative (only the top bit
 * is set), so a `mask > 0` loop would never run and "" would be returned.
 */
template <typename T>
std::string toBinaryString(T b) {
  const int bit_count = int(sizeof(T) * CHAR_BIT);
  std::string binary;
  binary.reserve(sizeof(T) * CHAR_BIT);
  for (int bit = bit_count - 1; bit >= 0; --bit) {
    // For negative values the shift is arithmetic on supported compilers
    // (and guaranteed to be from C++20 on), so bit `bit` of the two's
    // complement representation ends up in the lowest position.
    binary += (((b >> bit) & 1) != 0) ? '1' : '0';
  }
  return binary;
}
24 }
25 }
26
27 // Implementations
28 // The best choice should always come first!
29 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64.h
30 /* begin file src/simdutf/arm64.h */
31 #ifndef SIMDUTF_ARM64_H
32 #define SIMDUTF_ARM64_H
33
34 #ifdef SIMDUTF_FALLBACK_H
35 #error "arm64.h must be included before fallback.h"
36 #endif
37
38
39 #ifndef SIMDUTF_IMPLEMENTATION_ARM64
40 #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
41 #endif
// Non-zero only when both SIMDUTF_IMPLEMENTATION_ARM64 and SIMDUTF_IS_ARM64 are
// non-zero. The expansion is parenthesized so the macro behaves as a single
// operand when it is negated or combined with other operators.
#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 (SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64)
43
44
45
46 #if SIMDUTF_IMPLEMENTATION_ARM64
47
48 namespace simdutf {
49 /**
50 * Implementation for NEON (ARMv8).
51 */
52 namespace arm64 {
53 } // namespace arm64
54 } // namespace simdutf
55
56 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h
57 /* begin file src/simdutf/arm64/implementation.h */
58 #ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
59 #define SIMDUTF_ARM64_IMPLEMENTATION_H
60
61
62 namespace simdutf {
63 namespace arm64 {
64
65 namespace {
66 using namespace simdutf;
67 }
68
69 class implementation final : public simdutf::implementation {
70 public:
implementation()71 simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
72 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
73 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
74 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
75 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
76 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
77 simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
78 simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
79 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
80 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
81 simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
82 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
83 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
84 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
85 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
86 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
87 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
88 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
89 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
90 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
91 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
92 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
93 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
94 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
95 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
96 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
97 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
98 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
99 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
100 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
101 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
102 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
103 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
104 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
105 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
106 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
107 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
108 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
109 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
110 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
111 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
112 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
113 void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
114 simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
115 simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
116 simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
117 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
118 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
119 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
120 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
121 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
122 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
123 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
124 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
125 };
126
127 } // namespace arm64
128 } // namespace simdutf
129
130 #endif // SIMDUTF_ARM64_IMPLEMENTATION_H
131 /* end file src/simdutf/arm64/implementation.h */
132
133 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
134 /* begin file src/simdutf/arm64/begin.h */
135 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
136 // #define SIMDUTF_IMPLEMENTATION arm64
137 /* end file src/simdutf/arm64/begin.h */
138
139 // Declarations
140 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h
141 /* begin file src/simdutf/arm64/intrinsics.h */
142 #ifndef SIMDUTF_ARM64_INTRINSICS_H
143 #define SIMDUTF_ARM64_INTRINSICS_H
144
145
146 // This should be the correct header whether
147 // you use visual studio or other compilers.
148 #include <arm_neon.h>
149
150 #endif // SIMDUTF_ARM64_INTRINSICS_H
151 /* end file src/simdutf/arm64/intrinsics.h */
152 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h
153 /* begin file src/simdutf/arm64/bitmanipulation.h */
154 #ifndef SIMDUTF_ARM64_BITMANIPULATION_H
155 #define SIMDUTF_ARM64_BITMANIPULATION_H
156
157 namespace simdutf {
158 namespace arm64 {
159 namespace {
160
161 /* result might be undefined when input_num is zero */
count_ones(uint64_t input_num)162 simdutf_really_inline int count_ones(uint64_t input_num) {
163 return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
164 }
165
166 } // unnamed namespace
167 } // namespace arm64
168 } // namespace simdutf
169
170 #endif // SIMDUTF_ARM64_BITMANIPULATION_H
171 /* end file src/simdutf/arm64/bitmanipulation.h */
172 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h
173 /* begin file src/simdutf/arm64/simd.h */
174 #ifndef SIMDUTF_ARM64_SIMD_H
175 #define SIMDUTF_ARM64_SIMD_H
176
177 #include <type_traits>
178
179
180 namespace simdutf {
181 namespace arm64 {
182 namespace {
183 namespace simd {
184
185 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
186 namespace {
187 // Start of private section with Visual Studio workaround
188
189
190 /**
191 * make_uint8x16_t initializes a SIMD register (uint8x16_t).
192 * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
193 * is not recognized under Visual Studio! This is a workaround.
194 * Using a std::initializer_list<uint8_t> as a parameter resulted in
195 * inefficient code. With the current approach, if the parameters are
196 * compile-time constants,
197 * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
198 * You should not use this function except for compile-time constants:
199 * it is not efficient.
200 */
make_uint8x16_t(uint8_t x1,uint8_t x2,uint8_t x3,uint8_t x4,uint8_t x5,uint8_t x6,uint8_t x7,uint8_t x8,uint8_t x9,uint8_t x10,uint8_t x11,uint8_t x12,uint8_t x13,uint8_t x14,uint8_t x15,uint8_t x16)201 simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
202 uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
203 uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
204 uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
205 // Doing a load like so end ups generating worse code.
206 // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
207 // x9, x10,x11,x12,x13,x14,x15,x16};
208 // return vld1q_u8(array);
209 uint8x16_t x{};
210 // incredibly, Visual Studio does not allow x[0] = x1
211 x = vsetq_lane_u8(x1, x, 0);
212 x = vsetq_lane_u8(x2, x, 1);
213 x = vsetq_lane_u8(x3, x, 2);
214 x = vsetq_lane_u8(x4, x, 3);
215 x = vsetq_lane_u8(x5, x, 4);
216 x = vsetq_lane_u8(x6, x, 5);
217 x = vsetq_lane_u8(x7, x, 6);
218 x = vsetq_lane_u8(x8, x, 7);
219 x = vsetq_lane_u8(x9, x, 8);
220 x = vsetq_lane_u8(x10, x, 9);
221 x = vsetq_lane_u8(x11, x, 10);
222 x = vsetq_lane_u8(x12, x, 11);
223 x = vsetq_lane_u8(x13, x, 12);
224 x = vsetq_lane_u8(x14, x, 13);
225 x = vsetq_lane_u8(x15, x, 14);
226 x = vsetq_lane_u8(x16, x, 15);
227 return x;
228 }
229
230 // We have to do the same work for make_int8x16_t
make_int8x16_t(int8_t x1,int8_t x2,int8_t x3,int8_t x4,int8_t x5,int8_t x6,int8_t x7,int8_t x8,int8_t x9,int8_t x10,int8_t x11,int8_t x12,int8_t x13,int8_t x14,int8_t x15,int8_t x16)231 simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
232 int8_t x5, int8_t x6, int8_t x7, int8_t x8,
233 int8_t x9, int8_t x10, int8_t x11, int8_t x12,
234 int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
235 // Doing a load like so end ups generating worse code.
236 // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
237 // x9, x10,x11,x12,x13,x14,x15,x16};
238 // return vld1q_s8(array);
239 int8x16_t x{};
240 // incredibly, Visual Studio does not allow x[0] = x1
241 x = vsetq_lane_s8(x1, x, 0);
242 x = vsetq_lane_s8(x2, x, 1);
243 x = vsetq_lane_s8(x3, x, 2);
244 x = vsetq_lane_s8(x4, x, 3);
245 x = vsetq_lane_s8(x5, x, 4);
246 x = vsetq_lane_s8(x6, x, 5);
247 x = vsetq_lane_s8(x7, x, 6);
248 x = vsetq_lane_s8(x8, x, 7);
249 x = vsetq_lane_s8(x9, x, 8);
250 x = vsetq_lane_s8(x10, x, 9);
251 x = vsetq_lane_s8(x11, x, 10);
252 x = vsetq_lane_s8(x12, x, 11);
253 x = vsetq_lane_s8(x13, x, 12);
254 x = vsetq_lane_s8(x14, x, 13);
255 x = vsetq_lane_s8(x15, x, 14);
256 x = vsetq_lane_s8(x16, x, 15);
257 return x;
258 }
259
make_uint8x8_t(uint8_t x1,uint8_t x2,uint8_t x3,uint8_t x4,uint8_t x5,uint8_t x6,uint8_t x7,uint8_t x8)260 simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
261 uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) {
262 uint8x8_t x{};
263 x = vset_lane_u8(x1, x, 0);
264 x = vset_lane_u8(x2, x, 1);
265 x = vset_lane_u8(x3, x, 2);
266 x = vset_lane_u8(x4, x, 3);
267 x = vset_lane_u8(x5, x, 4);
268 x = vset_lane_u8(x6, x, 5);
269 x = vset_lane_u8(x7, x, 6);
270 x = vset_lane_u8(x8, x, 7);
271 return x;
272 }
273
make_uint16x8_t(uint16_t x1,uint16_t x2,uint16_t x3,uint16_t x4,uint16_t x5,uint16_t x6,uint16_t x7,uint16_t x8)274 simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4,
275 uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8) {
276 uint16x8_t x{};
277 x = vsetq_lane_u16(x1, x, 0);
278 x = vsetq_lane_u16(x2, x, 1);
279 x = vsetq_lane_u16(x3, x, 2);
280 x = vsetq_lane_u16(x4, x, 3);
281 x = vsetq_lane_u16(x5, x, 4);
282 x = vsetq_lane_u16(x6, x, 5);
283 x = vsetq_lane_u16(x7, x, 6);
284 x = vsetq_lane_u16(x8, x, 7);;
285 return x;
286 }
287
make_int16x8_t(int16_t x1,int16_t x2,int16_t x3,int16_t x4,int16_t x5,int16_t x6,int16_t x7,int16_t x8)288 simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4,
289 int16_t x5, int16_t x6, int16_t x7, int16_t x8) {
290 uint16x8_t x{};
291 x = vsetq_lane_s16(x1, x, 0);
292 x = vsetq_lane_s16(x2, x, 1);
293 x = vsetq_lane_s16(x3, x, 2);
294 x = vsetq_lane_s16(x4, x, 3);
295 x = vsetq_lane_s16(x5, x, 4);
296 x = vsetq_lane_s16(x6, x, 5);
297 x = vsetq_lane_s16(x7, x, 6);
298 x = vsetq_lane_s16(x8, x, 7);;
299 return x;
300 }
301
302
303 // End of private section with Visual Studio workaround
304 } // namespace
305 #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
306
307
308 template<typename T>
309 struct simd8;
310
311 //
312 // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
313 //
314 template<typename T, typename Mask=simd8<bool>>
315 struct base_u8 {
316 uint8x16_t value;
317 static const int SIZE = sizeof(value);
318
319 // Conversion from/to SIMD register
base_u8simdutf::arm64::__anone55652eb0411::simd::base_u8320 simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
operator const uint8x16_t&simdutf::arm64::__anone55652eb0411::simd::base_u8321 simdutf_really_inline operator const uint8x16_t&() const { return this->value; }
operator uint8x16_t&simdutf::arm64::__anone55652eb0411::simd::base_u8322 simdutf_really_inline operator uint8x16_t&() { return this->value; }
firstsimdutf::arm64::__anone55652eb0411::simd::base_u8323 simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); }
lastsimdutf::arm64::__anone55652eb0411::simd::base_u8324 simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); }
325
326 // Bit operations
operator |simdutf::arm64::__anone55652eb0411::simd::base_u8327 simdutf_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
operator &simdutf::arm64::__anone55652eb0411::simd::base_u8328 simdutf_really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
operator ^simdutf::arm64::__anone55652eb0411::simd::base_u8329 simdutf_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
bit_andnotsimdutf::arm64::__anone55652eb0411::simd::base_u8330 simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
operator ~simdutf::arm64::__anone55652eb0411::simd::base_u8331 simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
operator |=simdutf::arm64::__anone55652eb0411::simd::base_u8332 simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::arm64::__anone55652eb0411::simd::base_u8333 simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::arm64::__anone55652eb0411::simd::base_u8334 simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
335
operator ==(const simd8<T> lhs,const simd8<T> rhs)336 friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
337
338 template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::base_u8339 simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
340 return vextq_u8(prev_chunk, *this, 16 - N);
341 }
342 };
343
344 // SIMD byte mask type (returned by things like eq and gt)
345 template<>
346 struct simd8<bool>: base_u8<bool> {
347 typedef uint16_t bitmask_t;
348 typedef uint32_t bitmask2_t;
349
splatsimdutf::arm64::__anone55652eb0411::simd::simd8350 static simdutf_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
351
simd8simdutf::arm64::__anone55652eb0411::simd::simd8352 simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
353 // False constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8354 simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
355 // Splat constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8356 simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
storesimdutf::arm64::__anone55652eb0411::simd::simd8357 simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
358
359 // We return uint32_t instead of uint16_t because that seems to be more efficient for most
360 // purposes (cutting it down to uint16_t costs performance in some compilers).
to_bitmasksimdutf::arm64::__anone55652eb0411::simd::simd8361 simdutf_really_inline uint32_t to_bitmask() const {
362 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
363 const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
364 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
365 #else
366 const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
367 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
368 #endif
369 auto minput = *this & bit_mask;
370 uint8x16_t tmp = vpaddq_u8(minput, minput);
371 tmp = vpaddq_u8(tmp, tmp);
372 tmp = vpaddq_u8(tmp, tmp);
373 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
374 }
375
376 // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits
377 // result it is 64 bit.
378 // This method is expected to be faster than none() and is equivalent
379 // when the vector register is the result of a comparison, with byte
380 // values 0xff and 0x00.
to_bitmask64simdutf::arm64::__anone55652eb0411::simd::simd8381 simdutf_really_inline uint64_t to_bitmask64() const {
382 return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
383 }
384
anysimdutf::arm64::__anone55652eb0411::simd::simd8385 simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
nonesimdutf::arm64::__anone55652eb0411::simd::simd8386 simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; }
allsimdutf::arm64::__anone55652eb0411::simd::simd8387 simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; }
388
389
390 };
391
392 // Unsigned bytes
393 template<>
394 struct simd8<uint8_t>: base_u8<uint8_t> {
splatsimdutf::arm64::__anone55652eb0411::simd::simd8395 static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) { return vmovq_n_u8(_value); }
zerosimdutf::arm64::__anone55652eb0411::simd::simd8396 static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
loadsimdutf::arm64::__anone55652eb0411::simd::simd8397 static simdutf_really_inline simd8<uint8_t> load(const uint8_t* values) { return vld1q_u8(values); }
simd8simdutf::arm64::__anone55652eb0411::simd::simd8398 simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
399 // Zero constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8400 simdutf_really_inline simd8() : simd8(zero()) {}
401 // Array constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8402 simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
403 // Splat constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8404 simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
405 // Member-by-member initialization
406 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anone55652eb0411::simd::simd8407 simdutf_really_inline simd8(
408 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
409 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
410 ) : simd8(make_uint8x16_t(
411 v0, v1, v2, v3, v4, v5, v6, v7,
412 v8, v9, v10,v11,v12,v13,v14,v15
413 )) {}
414 #else
simd8simdutf::arm64::__anone55652eb0411::simd::simd8415 simdutf_really_inline simd8(
416 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
417 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
418 ) : simd8(uint8x16_t{
419 v0, v1, v2, v3, v4, v5, v6, v7,
420 v8, v9, v10,v11,v12,v13,v14,v15
421 }) {}
422 #endif
423
424 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::arm64::__anone55652eb0411::simd::simd8425 simdutf_really_inline static simd8<uint8_t> repeat_16(
426 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
427 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
428 ) {
429 return simd8<uint8_t>(
430 v0, v1, v2, v3, v4, v5, v6, v7,
431 v8, v9, v10,v11,v12,v13,v14,v15
432 );
433 }
434
435 // Store to array
storesimdutf::arm64::__anone55652eb0411::simd::simd8436 simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
437
438 // Saturated math
saturating_addsimdutf::arm64::__anone55652eb0411::simd::simd8439 simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
saturating_subsimdutf::arm64::__anone55652eb0411::simd::simd8440 simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
441
442 // Addition/subtraction are the same for signed and unsigned
operator +simdutf::arm64::__anone55652eb0411::simd::simd8443 simdutf_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
operator -simdutf::arm64::__anone55652eb0411::simd::simd8444 simdutf_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
operator +=simdutf::arm64::__anone55652eb0411::simd::simd8445 simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
operator -=simdutf::arm64::__anone55652eb0411::simd::simd8446 simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
447
448 // Order-specific operations
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8449 simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8450 simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); }
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8451 simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8452 simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
operator <=simdutf::arm64::__anone55652eb0411::simd::simd8453 simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
operator >=simdutf::arm64::__anone55652eb0411::simd::simd8454 simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd8455 simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd8456 simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
457 // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
gt_bitssimdutf::arm64::__anone55652eb0411::simd::simd8458 simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
459 // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
lt_bitssimdutf::arm64::__anone55652eb0411::simd::simd8460 simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }
461
462 // Bit-specific operations
any_bits_setsimdutf::arm64::__anone55652eb0411::simd::simd8463 simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd8464 simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; }
465
any_bits_set_anywheresimdutf::arm64::__anone55652eb0411::simd::simd8466 simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
any_bits_set_anywheresimdutf::arm64::__anone55652eb0411::simd::simd8467 simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
468 template<int N>
shrsimdutf::arm64::__anone55652eb0411::simd::simd8469 simdutf_really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
470 template<int N>
shlsimdutf::arm64::__anone55652eb0411::simd::simd8471 simdutf_really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
472
473 // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
474 template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8475 simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
476 return lookup_table.apply_lookup_16_to(*this);
477 }
478
479
480 template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8481 simdutf_really_inline simd8<L> lookup_16(
482 L replace0, L replace1, L replace2, L replace3,
483 L replace4, L replace5, L replace6, L replace7,
484 L replace8, L replace9, L replace10, L replace11,
485 L replace12, L replace13, L replace14, L replace15) const {
486 return lookup_16(simd8<L>::repeat_16(
487 replace0, replace1, replace2, replace3,
488 replace4, replace5, replace6, replace7,
489 replace8, replace9, replace10, replace11,
490 replace12, replace13, replace14, replace15
491 ));
492 }
493
494 template<typename T>
apply_lookup_16_tosimdutf::arm64::__anone55652eb0411::simd::simd8495 simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const {
496 return vqtbl1q_u8(*this, simd8<uint8_t>(original));
497 }
498 };
499
500 // Signed bytes
501 template<>
502 struct simd8<int8_t> {
503 int8x16_t value;
504
splatsimdutf::arm64::__anone55652eb0411::simd::simd8505 static simdutf_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
zerosimdutf::arm64::__anone55652eb0411::simd::simd8506 static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
loadsimdutf::arm64::__anone55652eb0411::simd::simd8507 static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
508 template <endianness big_endian>
store_ascii_as_utf16simdutf::arm64::__anone55652eb0411::simd::simd8509 simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
510 uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)));
511 uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value));
512 if (!match_system(big_endian)) {
513 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
514 const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
515 #else
516 const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
517 #endif
518 first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
519 second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
520 }
521 vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
522 vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second);
523 }
store_ascii_as_utf32simdutf::arm64::__anone55652eb0411::simd::simd8524 simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
525 vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))))));
526 vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))));
527 vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))));
528 vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))));
529 }
530 // Conversion from/to SIMD register
simd8simdutf::arm64::__anone55652eb0411::simd::simd8531 simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
operator const int8x16_t&simdutf::arm64::__anone55652eb0411::simd::simd8532 simdutf_really_inline operator const int8x16_t&() const { return this->value; }
operator const uint8x16_tsimdutf::arm64::__anone55652eb0411::simd::simd8533 simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); }
operator int8x16_t&simdutf::arm64::__anone55652eb0411::simd::simd8534 simdutf_really_inline operator int8x16_t&() { return this->value; }
535
536 // Zero constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8537 simdutf_really_inline simd8() : simd8(zero()) {}
538 // Splat constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8539 simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
540 // Array constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8541 simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
542 // Member-by-member initialization
543 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anone55652eb0411::simd::simd8544 simdutf_really_inline simd8(
545 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
546 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
547 ) : simd8(make_int8x16_t(
548 v0, v1, v2, v3, v4, v5, v6, v7,
549 v8, v9, v10,v11,v12,v13,v14,v15
550 )) {}
551 #else
simd8simdutf::arm64::__anone55652eb0411::simd::simd8552 simdutf_really_inline simd8(
553 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
554 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
555 ) : simd8(int8x16_t{
556 v0, v1, v2, v3, v4, v5, v6, v7,
557 v8, v9, v10,v11,v12,v13,v14,v15
558 }) {}
559 #endif
560 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::arm64::__anone55652eb0411::simd::simd8561 simdutf_really_inline static simd8<int8_t> repeat_16(
562 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
563 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
564 ) {
565 return simd8<int8_t>(
566 v0, v1, v2, v3, v4, v5, v6, v7,
567 v8, v9, v10,v11,v12,v13,v14,v15
568 );
569 }
570
571 // Store to array
storesimdutf::arm64::__anone55652eb0411::simd::simd8572 simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); }
573 // Explicit conversion to/from unsigned
574 //
575 // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
576 // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
577 // and relatively ugly and hard to read.
578 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anone55652eb0411::simd::simd8579 simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
580 #endif
operator simd8<uint8_t>simdutf::arm64::__anone55652eb0411::simd::simd8581 simdutf_really_inline operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
582
operator |simdutf::arm64::__anone55652eb0411::simd::simd8583 simdutf_really_inline simd8<int8_t> operator|(const simd8<int8_t> other) const { return vorrq_s8(value, other.value); }
operator &simdutf::arm64::__anone55652eb0411::simd::simd8584 simdutf_really_inline simd8<int8_t> operator&(const simd8<int8_t> other) const { return vandq_s8(value, other.value); }
operator ^simdutf::arm64::__anone55652eb0411::simd::simd8585 simdutf_really_inline simd8<int8_t> operator^(const simd8<int8_t> other) const { return veorq_s8(value, other.value); }
bit_andnotsimdutf::arm64::__anone55652eb0411::simd::simd8586 simdutf_really_inline simd8<int8_t> bit_andnot(const simd8<int8_t> other) const { return vbicq_s8(value, other.value); }
587
588 // Math
operator +simdutf::arm64::__anone55652eb0411::simd::simd8589 simdutf_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(value, other.value); }
operator -simdutf::arm64::__anone55652eb0411::simd::simd8590 simdutf_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(value, other.value); }
operator +=simdutf::arm64::__anone55652eb0411::simd::simd8591 simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
operator -=simdutf::arm64::__anone55652eb0411::simd::simd8592 simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
593
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8594 simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8595 simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd8596 simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; }
597
598 // Order-sensitive comparisons
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8599 simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(value, other.value); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8600 simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(value, other.value); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd8601 simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(value, other.value); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd8602 simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(value, other.value); }
operator ==simdutf::arm64::__anone55652eb0411::simd::simd8603 simdutf_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(value, other.value); }
604
605 template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::simd8606 simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
607 return vextq_s8(prev_chunk, *this, 16 - N);
608 }
609
610 // Perform a lookup assuming no value is larger than 16
611 template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8612 simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
613 return lookup_table.apply_lookup_16_to(*this);
614 }
615 template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8616 simdutf_really_inline simd8<L> lookup_16(
617 L replace0, L replace1, L replace2, L replace3,
618 L replace4, L replace5, L replace6, L replace7,
619 L replace8, L replace9, L replace10, L replace11,
620 L replace12, L replace13, L replace14, L replace15) const {
621 return lookup_16(simd8<L>::repeat_16(
622 replace0, replace1, replace2, replace3,
623 replace4, replace5, replace6, replace7,
624 replace8, replace9, replace10, replace11,
625 replace12, replace13, replace14, replace15
626 ));
627 }
628
629 template<typename T>
apply_lookup_16_tosimdutf::arm64::__anone55652eb0411::simd::simd8630 simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
631 return vqtbl1q_s8(*this, simd8<uint8_t>(original));
632 }
633 };
634
635 template<typename T>
636 struct simd8x64 {
637 static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
638 static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
639 simd8<T> chunks[NUM_CHUNKS];
640
641 simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
642 simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
643 simd8x64() = delete; // no default constructor allowed
644
simd8x64simdutf::arm64::__anone55652eb0411::simd::simd8x64645 simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd8x64simdutf::arm64::__anone55652eb0411::simd::simd8x64646 simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
647
storesimdutf::arm64::__anone55652eb0411::simd::simd8x64648 simdutf_really_inline void store(T* ptr) const {
649 this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
650 this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
651 this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
652 this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
653 }
654
655
operator |=simdutf::arm64::__anone55652eb0411::simd::simd8x64656 simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
657 this->chunks[0] |= other.chunks[0];
658 this->chunks[1] |= other.chunks[1];
659 this->chunks[2] |= other.chunks[2];
660 this->chunks[3] |= other.chunks[3];
661 return *this;
662 }
663
reduce_orsimdutf::arm64::__anone55652eb0411::simd::simd8x64664 simdutf_really_inline simd8<T> reduce_or() const {
665 return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
666 }
667
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd8x64668 simdutf_really_inline bool is_ascii() const {
669 return reduce_or().is_ascii();
670 }
671
672 template <endianness endian>
store_ascii_as_utf16simdutf::arm64::__anone55652eb0411::simd::simd8x64673 simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
674 this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
675 this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
676 this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
677 this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
678 }
679
store_ascii_as_utf32simdutf::arm64::__anone55652eb0411::simd::simd8x64680 simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
681 this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
682 this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
683 this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
684 this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
685 }
686
to_bitmasksimdutf::arm64::__anone55652eb0411::simd::simd8x64687 simdutf_really_inline uint64_t to_bitmask() const {
688 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
689 const uint8x16_t bit_mask = make_uint8x16_t(
690 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
691 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
692 );
693 #else
694 const uint8x16_t bit_mask = {
695 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
696 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
697 };
698 #endif
699 // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
700 uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
701 uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
702 sum0 = vpaddq_u8(sum0, sum1);
703 sum0 = vpaddq_u8(sum0, sum0);
704 return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
705 }
706
eqsimdutf::arm64::__anone55652eb0411::simd::simd8x64707 simdutf_really_inline uint64_t eq(const T m) const {
708 const simd8<T> mask = simd8<T>::splat(m);
709 return simd8x64<bool>(
710 this->chunks[0] == mask,
711 this->chunks[1] == mask,
712 this->chunks[2] == mask,
713 this->chunks[3] == mask
714 ).to_bitmask();
715 }
716
lteqsimdutf::arm64::__anone55652eb0411::simd::simd8x64717 simdutf_really_inline uint64_t lteq(const T m) const {
718 const simd8<T> mask = simd8<T>::splat(m);
719 return simd8x64<bool>(
720 this->chunks[0] <= mask,
721 this->chunks[1] <= mask,
722 this->chunks[2] <= mask,
723 this->chunks[3] <= mask
724 ).to_bitmask();
725 }
726
in_rangesimdutf::arm64::__anone55652eb0411::simd::simd8x64727 simdutf_really_inline uint64_t in_range(const T low, const T high) const {
728 const simd8<T> mask_low = simd8<T>::splat(low);
729 const simd8<T> mask_high = simd8<T>::splat(high);
730
731 return simd8x64<bool>(
732 (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
733 (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
734 (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
735 (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
736 ).to_bitmask();
737 }
not_in_rangesimdutf::arm64::__anone55652eb0411::simd::simd8x64738 simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
739 const simd8<T> mask_low = simd8<T>::splat(low);
740 const simd8<T> mask_high = simd8<T>::splat(high);
741 return simd8x64<bool>(
742 (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
743 (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
744 (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
745 (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
746 ).to_bitmask();
747 }
ltsimdutf::arm64::__anone55652eb0411::simd::simd8x64748 simdutf_really_inline uint64_t lt(const T m) const {
749 const simd8<T> mask = simd8<T>::splat(m);
750 return simd8x64<bool>(
751 this->chunks[0] < mask,
752 this->chunks[1] < mask,
753 this->chunks[2] < mask,
754 this->chunks[3] < mask
755 ).to_bitmask();
756 }
gtsimdutf::arm64::__anone55652eb0411::simd::simd8x64757 simdutf_really_inline uint64_t gt(const T m) const {
758 const simd8<T> mask = simd8<T>::splat(m);
759 return simd8x64<bool>(
760 this->chunks[0] > mask,
761 this->chunks[1] > mask,
762 this->chunks[2] > mask,
763 this->chunks[3] > mask
764 ).to_bitmask();
765 }
gteqsimdutf::arm64::__anone55652eb0411::simd::simd8x64766 simdutf_really_inline uint64_t gteq(const T m) const {
767 const simd8<T> mask = simd8<T>::splat(m);
768 return simd8x64<bool>(
769 this->chunks[0] >= mask,
770 this->chunks[1] >= mask,
771 this->chunks[2] >= mask,
772 this->chunks[3] >= mask
773 ).to_bitmask();
774 }
gteq_unsignedsimdutf::arm64::__anone55652eb0411::simd::simd8x64775 simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
776 const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
777 return simd8x64<bool>(
778 simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
779 simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
780 simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
781 simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask
782 ).to_bitmask();
783 }
784 }; // struct simd8x64<T>
785 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h
786 /* begin file src/simdutf/arm64/simd16-inl.h */
787 template<typename T>
788 struct simd16;
789
790 template<typename T, typename Mask=simd16<bool>>
791 struct base_u16 {
792 uint16x8_t value;
793 static const int SIZE = sizeof(value);
794
795 // Conversion from/to SIMD register
796 simdutf_really_inline base_u16() = default;
base_u16simdutf::arm64::__anone55652eb0411::simd::base_u16797 simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
operator const uint16x8_t&simdutf::arm64::__anone55652eb0411::simd::base_u16798 simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
operator uint16x8_t&simdutf::arm64::__anone55652eb0411::simd::base_u16799 simdutf_really_inline operator uint16x8_t&() { return this->value; }
800 // Bit operations
operator |simdutf::arm64::__anone55652eb0411::simd::base_u16801 simdutf_really_inline simd16<T> operator|(const simd16<T> other) const { return vorrq_u16(*this, other); }
operator &simdutf::arm64::__anone55652eb0411::simd::base_u16802 simdutf_really_inline simd16<T> operator&(const simd16<T> other) const { return vandq_u16(*this, other); }
operator ^simdutf::arm64::__anone55652eb0411::simd::base_u16803 simdutf_really_inline simd16<T> operator^(const simd16<T> other) const { return veorq_u16(*this, other); }
bit_andnotsimdutf::arm64::__anone55652eb0411::simd::base_u16804 simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const { return vbicq_u16(*this, other); }
operator ~simdutf::arm64::__anone55652eb0411::simd::base_u16805 simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
operator |=simdutf::arm64::__anone55652eb0411::simd::base_u16806 simdutf_really_inline simd16<T>& operator|=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::arm64::__anone55652eb0411::simd::base_u16807 simdutf_really_inline simd16<T>& operator&=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::arm64::__anone55652eb0411::simd::base_u16808 simdutf_really_inline simd16<T>& operator^=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
809
operator ==(const simd16<T> lhs,const simd16<T> rhs)810 friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return vceqq_u16(lhs, rhs); }
811
812 template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::base_u16813 simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
814 return vextq_u18(prev_chunk, *this, 8 - N);
815 }
816 };
817
818 template<typename T, typename Mask=simd16<bool>>
819 struct base16: base_u16<T> {
820 typedef uint16_t bitmask_t;
821 typedef uint32_t bitmask2_t;
822
base16simdutf::arm64::__anone55652eb0411::simd::base16823 simdutf_really_inline base16() : base_u16<T>() {}
base16simdutf::arm64::__anone55652eb0411::simd::base16824 simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
825 template <typename Pointer>
base16simdutf::arm64::__anone55652eb0411::simd::base16826 simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {}
827
828 static const int SIZE = sizeof(base_u16<T>::value);
829
830 template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::base16831 simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
832 return vextq_u18(prev_chunk, *this, 8 - N);
833 }
834 };
835
836 // SIMD byte mask type (returned by things like eq and gt)
837 template<>
838 struct simd16<bool>: base16<bool> {
splatsimdutf::arm64::__anone55652eb0411::simd::simd16839 static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
840
simd16simdutf::arm64::__anone55652eb0411::simd::simd16841 simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16842 simdutf_really_inline simd16<bool>(const uint16x8_t _value) : base16<bool>(_value) {}
843 // Splat constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16844 simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
845
846 };
847
848 template<typename T>
849 struct base16_numeric: base16<T> {
splatsimdutf::arm64::__anone55652eb0411::simd::base16_numeric850 static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
zerosimdutf::arm64::__anone55652eb0411::simd::base16_numeric851 static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
loadsimdutf::arm64::__anone55652eb0411::simd::base16_numeric852 static simdutf_really_inline simd16<T> load(const T values[8]) {
853 return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
854 }
855
base16_numericsimdutf::arm64::__anone55652eb0411::simd::base16_numeric856 simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::arm64::__anone55652eb0411::simd::base16_numeric857 simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16<T>(_value) {}
858
859 // Store to array
storesimdutf::arm64::__anone55652eb0411::simd::base16_numeric860 simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
861
862 // Override to distinguish from bool version
operator ~simdutf::arm64::__anone55652eb0411::simd::base16_numeric863 simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
864
865 // Addition/subtraction are the same for signed and unsigned
operator +simdutf::arm64::__anone55652eb0411::simd::base16_numeric866 simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
operator -simdutf::arm64::__anone55652eb0411::simd::base16_numeric867 simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
operator +=simdutf::arm64::__anone55652eb0411::simd::base16_numeric868 simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::arm64::__anone55652eb0411::simd::base16_numeric869 simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
870 };
871
872 // Signed words
873 template<>
874 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::arm64::__anone55652eb0411::simd::simd16875 simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
876 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
simd16simdutf::arm64::__anone55652eb0411::simd::simd16877 simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<int16_t>(_value) {}
878 #endif
simd16simdutf::arm64::__anone55652eb0411::simd::simd16879 simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
880
881 // Splat constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16882 simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
883 // Array constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16884 simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16885 simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
886 simdutf_really_inline operator simd16<uint16_t>() const;
operator const uint16x8_t&simdutf::arm64::__anone55652eb0411::simd::simd16887 simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
operator const int16x8_tsimdutf::arm64::__anone55652eb0411::simd::simd16888 simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
889
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16890 simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16891 simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
892 // Order-sensitive comparisons
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16893 simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16894 simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd16895 simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd16896 simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
897 };
898
899
900
901
902 // Unsigned words
903 template<>
904 struct simd16<uint16_t>: base16_numeric<uint16_t> {
simd16simdutf::arm64::__anone55652eb0411::simd::simd16905 simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16906 simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<uint16_t>(_value) {}
907
908 // Splat constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16909 simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
910 // Array constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16911 simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16912 simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
913
914
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16915 simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16916 simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
917 // Saturated math
saturating_addsimdutf::arm64::__anone55652eb0411::simd::simd16918 simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
saturating_subsimdutf::arm64::__anone55652eb0411::simd::simd16919 simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
920
921 // Order-specific operations
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16922 simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16923 simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
924 // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::arm64::__anone55652eb0411::simd::simd16925 simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
926 // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::arm64::__anone55652eb0411::simd::simd16927 simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::arm64::__anone55652eb0411::simd::simd16928 simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
operator >=simdutf::arm64::__anone55652eb0411::simd::simd16929 simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd16930 simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return vcgtq_u16(*this, other); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd16931 simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
932
933 // Bit-specific operations
bits_not_setsimdutf::arm64::__anone55652eb0411::simd::simd16934 simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
935 template<int N>
shrsimdutf::arm64::__anone55652eb0411::simd::simd16936 simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
937 template<int N>
shlsimdutf::arm64::__anone55652eb0411::simd::simd16938 simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
939
940 // logical operations
operator |simdutf::arm64::__anone55652eb0411::simd::simd16941 simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
operator &simdutf::arm64::__anone55652eb0411::simd::simd16942 simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
operator ^simdutf::arm64::__anone55652eb0411::simd::simd16943 simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
944
945 // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
packsimdutf::arm64::__anone55652eb0411::simd::simd16946 static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
947 return vqmovn_high_u16(vqmovn_u16(v0), v1);
948 }
949
950 // Change the endianness
swap_bytessimdutf::arm64::__anone55652eb0411::simd::simd16951 simdutf_really_inline simd16<uint16_t> swap_bytes() const {
952 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
953 const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
954 #else
955 const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
956 #endif
957 return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
958 }
959 };
operator simd16<uint16_t>() const960 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
961
962
963 template<typename T>
964 struct simd16x32 {
965 static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
966 static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
967 simd16<T> chunks[NUM_CHUNKS];
968
969 simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
970 simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
971 simd16x32() = delete; // no default constructor allowed
972
simd16x32simdutf::arm64::__anone55652eb0411::simd::simd16x32973 simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd16x32simdutf::arm64::__anone55652eb0411::simd::simd16x32974 simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
975
storesimdutf::arm64::__anone55652eb0411::simd::simd16x32976 simdutf_really_inline void store(T* ptr) const {
977 this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
978 this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
979 this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
980 this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
981 }
982
reduce_orsimdutf::arm64::__anone55652eb0411::simd::simd16x32983 simdutf_really_inline simd16<T> reduce_or() const {
984 return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
985 }
986
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd16x32987 simdutf_really_inline bool is_ascii() const {
988 return reduce_or().is_ascii();
989 }
990
store_ascii_as_utf16simdutf::arm64::__anone55652eb0411::simd::simd16x32991 simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
992 this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
993 this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
994 this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
995 this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
996 }
997
to_bitmasksimdutf::arm64::__anone55652eb0411::simd::simd16x32998 simdutf_really_inline uint64_t to_bitmask() const {
999 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
1000 const uint8x16_t bit_mask = make_uint8x16_t(
1001 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
1002 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
1003 );
1004 #else
1005 const uint8x16_t bit_mask = {
1006 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
1007 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
1008 };
1009 #endif
1010 // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
1011 uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
1012 uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
1013 sum0 = vpaddq_u8(sum0, sum1);
1014 sum0 = vpaddq_u8(sum0, sum0);
1015 return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
1016 }
1017
swap_bytessimdutf::arm64::__anone55652eb0411::simd::simd16x321018 simdutf_really_inline void swap_bytes() {
1019 this->chunks[0] = this->chunks[0].swap_bytes();
1020 this->chunks[1] = this->chunks[1].swap_bytes();
1021 this->chunks[2] = this->chunks[2].swap_bytes();
1022 this->chunks[3] = this->chunks[3].swap_bytes();
1023 }
1024
eqsimdutf::arm64::__anone55652eb0411::simd::simd16x321025 simdutf_really_inline uint64_t eq(const T m) const {
1026 const simd16<T> mask = simd16<T>::splat(m);
1027 return simd16x32<bool>(
1028 this->chunks[0] == mask,
1029 this->chunks[1] == mask,
1030 this->chunks[2] == mask,
1031 this->chunks[3] == mask
1032 ).to_bitmask();
1033 }
1034
lteqsimdutf::arm64::__anone55652eb0411::simd::simd16x321035 simdutf_really_inline uint64_t lteq(const T m) const {
1036 const simd16<T> mask = simd16<T>::splat(m);
1037 return simd16x32<bool>(
1038 this->chunks[0] <= mask,
1039 this->chunks[1] <= mask,
1040 this->chunks[2] <= mask,
1041 this->chunks[3] <= mask
1042 ).to_bitmask();
1043 }
1044
in_rangesimdutf::arm64::__anone55652eb0411::simd::simd16x321045 simdutf_really_inline uint64_t in_range(const T low, const T high) const {
1046 const simd16<T> mask_low = simd16<T>::splat(low);
1047 const simd16<T> mask_high = simd16<T>::splat(high);
1048
1049 return simd16x32<bool>(
1050 (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
1051 (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
1052 (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
1053 (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
1054 ).to_bitmask();
1055 }
not_in_rangesimdutf::arm64::__anone55652eb0411::simd::simd16x321056 simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
1057 const simd16<T> mask_low = simd16<T>::splat(low);
1058 const simd16<T> mask_high = simd16<T>::splat(high);
1059 return simd16x32<bool>(
1060 (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
1061 (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
1062 (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
1063 (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
1064 ).to_bitmask();
1065 }
ltsimdutf::arm64::__anone55652eb0411::simd::simd16x321066 simdutf_really_inline uint64_t lt(const T m) const {
1067 const simd16<T> mask = simd16<T>::splat(m);
1068 return simd16x32<bool>(
1069 this->chunks[0] < mask,
1070 this->chunks[1] < mask,
1071 this->chunks[2] < mask,
1072 this->chunks[3] < mask
1073 ).to_bitmask();
1074 }
1075
1076 }; // struct simd16x32<T>
1077 template<>
not_in_range(const uint16_t low,const uint16_t high) const1078 simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const {
1079 const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
1080 const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
1081 simd16x32<uint16_t> x(
1082 simd16<uint16_t>((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)),
1083 simd16<uint16_t>((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)),
1084 simd16<uint16_t>((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)),
1085 simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
1086 );
1087 return x.to_bitmask();
1088 }
1089 /* end file src/simdutf/arm64/simd16-inl.h */
1090 } // namespace simd
1091 } // unnamed namespace
1092 } // namespace arm64
1093 } // namespace simdutf
1094
1095 #endif // SIMDUTF_ARM64_SIMD_H
1096 /* end file src/simdutf/arm64/simd.h */
1097
1098 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
1099 /* begin file src/simdutf/arm64/end.h */
1100 /* end file src/simdutf/arm64/end.h */
1101
1102 #endif // SIMDUTF_IMPLEMENTATION_ARM64
1103
1104 #endif // SIMDUTF_ARM64_H
1105 /* end file src/simdutf/arm64.h */
1106 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake.h
1107 /* begin file src/simdutf/icelake.h */
1108 #ifndef SIMDUTF_ICELAKE_H
1109 #define SIMDUTF_ICELAKE_H
1110
1111
1112
// Compiler capability probe: SIMDUTF_COMPILER_SUPPORTS_VBMI2 is defined to 1
// when either check below succeeds and is left undefined otherwise.

// Check 1: how do we detect that a compiler supports vbmi2? For sure if the
// VBMI2 intrinsics header is found, we are ok? __has_include is only used
// once the preprocessor is known to provide it.
#if defined(__has_include)
#if __has_include(<avx512vbmi2intrin.h>)
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
#endif
#endif

// Check 2: Visual Studio 2019 and up (_MSC_VER >= 1920) support VBMI2 under
// x64 even if the header avx512vbmi2intrin.h is not found.
#if defined(_MSC_VER) && (_MSC_VER >= 1920)
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
#endif
1128
// We allow icelake on x64 as long as the compiler is known to support VBMI2.
// A value supplied before this point (e.g. by the build) takes precedence.
#if !defined(SIMDUTF_IMPLEMENTATION_ICELAKE)
#define SIMDUTF_IMPLEMENTATION_ICELAKE ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
#endif

// True when the icelake kernels are enabled and the whole translation unit is
// already being compiled for x64 with AVX2 and the listed AVX-512 features.
// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
// https://github.com/simdutf/simdutf/issues/1247
#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE (                                       \
    (SIMDUTF_IMPLEMENTATION_ICELAKE) &&                                        \
    (SIMDUTF_IS_X86_64) &&                                                     \
    (__AVX2__) &&                                                              \
    (SIMDUTF_HAS_AVX512F &&                                                    \
     SIMDUTF_HAS_AVX512DQ &&                                                   \
     SIMDUTF_HAS_AVX512VL &&                                                   \
     SIMDUTF_HAS_AVX512VBMI2) &&                                               \
    (!SIMDUTF_IS_32BITS))
1140
1141 #if SIMDUTF_IMPLEMENTATION_ICELAKE
// SIMDUTF_TARGET_ICELAKE opens a target region listing the instruction sets
// the icelake kernels rely on. It expands to nothing when the build can
// always run icelake.
#if !SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt")
#else
#define SIMDUTF_TARGET_ICELAKE
#endif
1147
1148 namespace simdutf {
1149 namespace icelake {
1150 } // namespace icelake
1151 } // namespace simdutf
1152
1153
1154
1155 //
1156 // These two need to be included outside SIMDUTF_TARGET_REGION
1157 //
1158 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/intrinsics.h
1159 /* begin file src/simdutf/icelake/intrinsics.h */
1160 #ifndef SIMDUTF_ICELAKE_INTRINSICS_H
1161 #define SIMDUTF_ICELAKE_INTRINSICS_H
1162
1163
1164 #ifdef SIMDUTF_VISUAL_STUDIO
1165 // under clang within visual studio, this will include <x86intrin.h>
1166 #include <intrin.h> // visual studio or clang
1167 #include <immintrin.h>
1168 #else
1169
1170 #if SIMDUTF_GCC11ORMORE
1171 // We should not get warnings while including <x86intrin.h> yet we do
1172 // under some versions of GCC.
1173 // If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
1175 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
1176 #endif
1177
1178 #include <x86intrin.h> // elsewhere
1179
1180
1181 #if SIMDUTF_GCC11ORMORE
1182 // cancels the suppression of the -Wuninitialized
1183 SIMDUTF_POP_DISABLE_WARNINGS
1184 #endif
1185
// When _tzcnt_u64 is not available as a macro, map it onto the
// double-underscore spelling.
#if !defined(_tzcnt_u64)
#define _tzcnt_u64(x) __tzcnt_u64(x)
#endif // _tzcnt_u64
1189 #endif // SIMDUTF_VISUAL_STUDIO
1190
1191 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
1192 /**
1193 * You are not supposed, normally, to include these
1194 * headers directly. Instead you should either include intrin.h
1195 * or x86intrin.h. However, when compiling with clang
1196 * under Windows (i.e., when _MSC_VER is set), these headers
1197 * only get included *if* the corresponding features are detected
1198 * from macros:
1199 * e.g., if __AVX2__ is set... in turn, we normally set these
1200 * macros by compiling against the corresponding architecture
1201 * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
1202 * software with these advanced instructions. In simdutf, we
1203 * want to compile the whole program for a generic target,
1204 * and only target our specific kernels. As a workaround,
1205 * we directly include the needed headers. These headers would
1206 * normally guard against such usage, but we carefully included
1207 * <x86intrin.h> (or <intrin.h>) before, so the headers
1208 * are fooled.
1209 */
1210 #include <bmiintrin.h> // for _blsr_u64
1211 #include <bmi2intrin.h> // for _pext_u64, _pdep_u64
1212 #include <lzcntintrin.h> // for __lzcnt64
1213 #include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
1214 #include <smmintrin.h>
1215 #include <tmmintrin.h>
1216 #include <avxintrin.h>
1217 #include <avx2intrin.h>
1218 // Important: we need the AVX-512 headers:
1219 #include <avx512fintrin.h>
1220 #include <avx512dqintrin.h>
1221 #include <avx512cdintrin.h>
1222 #include <avx512bwintrin.h>
1223 #include <avx512vlintrin.h>
1224 #include <avx512vlbwintrin.h>
1225 #include <avx512vbmiintrin.h>
1226 #include <avx512vbmi2intrin.h>
1227 // unfortunately, we may not get _blsr_u64, but, thankfully, clang
1228 // has it as a macro.
#ifndef _blsr_u64
// we roll our own: (n - 1) & n clears the lowest set bit of n.
// The argument is parenthesized so that an expression such as `a | b` is
// evaluated as a whole before the subtraction and the AND are applied;
// without the parentheses `_blsr_u64(a | b)` would expand to
// `((a | b - 1) & a | b)`.
// Note that n is still evaluated twice, so it must not have side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
1233 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
1234
1235
1236
// Flag genuine GCC (not clang, which also defines __GNUC__) at major
// version 8 or 9.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ == 8)
#define SIMDUTF_GCC8 1
#elif defined(__GNUC__) && !defined(__clang__) && (__GNUC__ == 9)
#define SIMDUTF_GCC9 1
#endif // GCC 8 or GCC 9
1246
#if SIMDUTF_GCC8
#pragma GCC push_options
#pragma GCC target("avx512f")
/**
 * GCC 8 fails to provide _mm512_set_epi8. We roll our own.
 *
 * The 64 byte arguments are packed eight at a time into 64-bit words, the
 * first byte of each group landing in the most significant position, and the
 * eight words are handed to _mm512_set_epi64 in the same order.
 */
inline __m512i _mm512_set_epi8(
    uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7,
    uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15,
    uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23,
    uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31,
    uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39,
    uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47,
    uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55,
    uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
  // The byte fields never overlap, so OR-ing them yields the same value as adding them.
  const uint64_t q0 = (uint64_t(a0) << 56) | (uint64_t(a1) << 48) | (uint64_t(a2) << 40) | (uint64_t(a3) << 32) | (uint64_t(a4) << 24) | (uint64_t(a5) << 16) | (uint64_t(a6) << 8) | uint64_t(a7);
  const uint64_t q1 = (uint64_t(a8) << 56) | (uint64_t(a9) << 48) | (uint64_t(a10) << 40) | (uint64_t(a11) << 32) | (uint64_t(a12) << 24) | (uint64_t(a13) << 16) | (uint64_t(a14) << 8) | uint64_t(a15);
  const uint64_t q2 = (uint64_t(a16) << 56) | (uint64_t(a17) << 48) | (uint64_t(a18) << 40) | (uint64_t(a19) << 32) | (uint64_t(a20) << 24) | (uint64_t(a21) << 16) | (uint64_t(a22) << 8) | uint64_t(a23);
  const uint64_t q3 = (uint64_t(a24) << 56) | (uint64_t(a25) << 48) | (uint64_t(a26) << 40) | (uint64_t(a27) << 32) | (uint64_t(a28) << 24) | (uint64_t(a29) << 16) | (uint64_t(a30) << 8) | uint64_t(a31);
  const uint64_t q4 = (uint64_t(a32) << 56) | (uint64_t(a33) << 48) | (uint64_t(a34) << 40) | (uint64_t(a35) << 32) | (uint64_t(a36) << 24) | (uint64_t(a37) << 16) | (uint64_t(a38) << 8) | uint64_t(a39);
  const uint64_t q5 = (uint64_t(a40) << 56) | (uint64_t(a41) << 48) | (uint64_t(a42) << 40) | (uint64_t(a43) << 32) | (uint64_t(a44) << 24) | (uint64_t(a45) << 16) | (uint64_t(a46) << 8) | uint64_t(a47);
  const uint64_t q6 = (uint64_t(a48) << 56) | (uint64_t(a49) << 48) | (uint64_t(a50) << 40) | (uint64_t(a51) << 32) | (uint64_t(a52) << 24) | (uint64_t(a53) << 16) | (uint64_t(a54) << 8) | uint64_t(a55);
  const uint64_t q7 = (uint64_t(a56) << 56) | (uint64_t(a57) << 48) | (uint64_t(a58) << 40) | (uint64_t(a59) << 32) | (uint64_t(a60) << 24) | (uint64_t(a61) << 16) | (uint64_t(a62) << 8) | uint64_t(a63);
  return _mm512_set_epi64(q0, q1, q2, q3, q4, q5, q6, q7);
}
#pragma GCC pop_options
#endif // SIMDUTF_GCC8
1265
#endif // SIMDUTF_ICELAKE_INTRINSICS_H
1267 /* end file src/simdutf/icelake/intrinsics.h */
1268 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/implementation.h
1269 /* begin file src/simdutf/icelake/implementation.h */
1270 #ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
1271 #define SIMDUTF_ICELAKE_IMPLEMENTATION_H
1272
1273
1274 namespace simdutf {
1275 namespace icelake {
1276
1277 namespace {
1278 using namespace simdutf;
1279 }
1280
1281 class implementation final : public simdutf::implementation {
1282 public:
implementation()1283 simdutf_really_inline implementation() : simdutf::implementation(
1284 "icelake",
1285 "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
1286 internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 ) {}
1287 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
1288 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
1289 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
1290 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
1291 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
1292 simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
1293 simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
1294 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
1295 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
1296 simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
1297 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
1298 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1299 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1300 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1301 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1302 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1303 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1304 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1305 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1306 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1307 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1308 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1309 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1310 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1311 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1312 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1313 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1314 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1315 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1316 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1317 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1318 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1319 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1320 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1321 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1322 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1323 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1324 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1325 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1326 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1327 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1328 void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
1329 simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
1330 simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
1331 simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
1332 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1333 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1334 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1335 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1336 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
1337 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1338 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1339 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
1340 };
1341
1342 } // namespace icelake
1343 } // namespace simdutf
1344
1345 #endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H
1346 /* end file src/simdutf/icelake/implementation.h */
1347
1348 //
1349 // The rest need to be inside the region
1350 //
1351 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
1352 /* begin file src/simdutf/icelake/begin.h */
1353 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
1354 // #define SIMDUTF_IMPLEMENTATION icelake
1355
1356 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1357 // nothing needed.
1358 #else
1359 SIMDUTF_TARGET_ICELAKE
1360 #endif
1361
1362 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1363 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
1364 #endif // end of workaround
1365 /* end file src/simdutf/icelake/begin.h */
1366 // Declarations
1367 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/bitmanipulation.h
1368 /* begin file src/simdutf/icelake/bitmanipulation.h */
1369 #ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
1370 #define SIMDUTF_ICELAKE_BITMANIPULATION_H
1371
1372 namespace simdutf {
1373 namespace icelake {
1374 namespace {
1375
1376 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)1377 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
1378 // note: we do not support legacy 32-bit Windows
1379 return __popcnt64(input_num);// Visual Studio wants two underscores
1380 }
1381 #else
1382 simdutf_really_inline long long int count_ones(uint64_t input_num) {
1383 return _popcnt64(input_num);
1384 }
1385 #endif
1386
1387 } // unnamed namespace
1388 } // namespace icelake
1389 } // namespace simdutf
1390
1391 #endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
1392 /* end file src/simdutf/icelake/bitmanipulation.h */
1393 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
1394 /* begin file src/simdutf/icelake/end.h */
1395 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1396 // nothing needed.
1397 #else
1398 SIMDUTF_UNTARGET_REGION
1399 #endif
1400
1401
1402 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1403 SIMDUTF_POP_DISABLE_WARNINGS
1404 #endif // end of workaround
1405 /* end file src/simdutf/icelake/end.h */
1406
1407
1408
1409 #endif // SIMDUTF_IMPLEMENTATION_ICELAKE
1410 #endif // SIMDUTF_ICELAKE_H
1411 /* end file src/simdutf/icelake.h */
1412 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell.h
1413 /* begin file src/simdutf/haswell.h */
1414 #ifndef SIMDUTF_HASWELL_H
1415 #define SIMDUTF_HASWELL_H
1416
// Include-order guards: fail the build if the westmere or fallback header
// guard is already defined when this header is reached.
#if defined(SIMDUTF_WESTMERE_H)
#error "haswell.h must be included before westmere.h"
#endif
#if defined(SIMDUTF_FALLBACK_H)
#error "haswell.h must be included before fallback.h"
#endif
1423
1424
// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
// at runtime. A value supplied before this point takes precedence.
#if !defined(SIMDUTF_IMPLEMENTATION_HASWELL)
//
// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
// because we want to rely on *runtime dispatch*.
//
#if !SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
#else
// Haswell is turned off when the build can always run icelake.
#define SIMDUTF_IMPLEMENTATION_HASWELL 0
#endif

#endif
// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
// https://github.com/simdutf/simdutf/issues/1247
#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL (                                       \
    (SIMDUTF_IMPLEMENTATION_HASWELL) &&                                        \
    (SIMDUTF_IS_X86_64) &&                                                     \
    (__AVX2__))
1442
1443 #if SIMDUTF_IMPLEMENTATION_HASWELL
1444
1445 #define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")
1446
1447 namespace simdutf {
1448 /**
1449 * Implementation for Haswell (Intel AVX2).
1450 */
1451 namespace haswell {
1452 } // namespace haswell
1453 } // namespace simdutf
1454
1455 //
1456 // These two need to be included outside SIMDUTF_TARGET_REGION
1457 //
1458 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h
1459 /* begin file src/simdutf/haswell/implementation.h */
1460 #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
1461 #define SIMDUTF_HASWELL_IMPLEMENTATION_H
1462
1463
1464 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
1465 namespace simdutf {
1466 namespace haswell {
1467
1468 using namespace simdutf;
1469
1470 class implementation final : public simdutf::implementation {
1471 public:
implementation()1472 simdutf_really_inline implementation() : simdutf::implementation(
1473 "haswell",
1474 "Intel/AMD AVX2",
1475 internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
1476 ) {}
1477 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
1478 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
1479 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
1480 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
1481 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
1482 simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
1483 simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
1484 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
1485 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
1486 simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
1487 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
1488 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1489 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1490 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1491 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1492 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1493 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1494 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1495 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1496 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1497 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1498 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1499 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1500 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1501 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1502 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1503 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1504 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1505 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1506 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1507 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1508 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1509 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1510 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1511 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1512 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1513 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1514 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1515 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1516 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1517 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1518 void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
1519 simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
1520 simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
1521 simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
1522 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1523 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1524 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1525 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1526 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
1527 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1528 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1529 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
1530 };
1531
1532 } // namespace haswell
1533 } // namespace simdutf
1534
1535 #endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
1536 /* end file src/simdutf/haswell/implementation.h */
1537 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h
1538 /* begin file src/simdutf/haswell/intrinsics.h */
1539 #ifndef SIMDUTF_HASWELL_INTRINSICS_H
1540 #define SIMDUTF_HASWELL_INTRINSICS_H
1541
1542
1543 #ifdef SIMDUTF_VISUAL_STUDIO
1544 // under clang within visual studio, this will include <x86intrin.h>
1545 #include <intrin.h> // visual studio or clang
1546 #else
1547
1548 #if SIMDUTF_GCC11ORMORE
1549 // We should not get warnings while including <x86intrin.h> yet we do
1550 // under some versions of GCC.
1551 // If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
1553 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
1554 #endif
1555
1556 #include <x86intrin.h> // elsewhere
1557
1558
1559 #if SIMDUTF_GCC11ORMORE
1560 // cancels the suppression of the -Wuninitialized
1561 SIMDUTF_POP_DISABLE_WARNINGS
1562 #endif
1563
1564 #endif // SIMDUTF_VISUAL_STUDIO
1565
1566 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
1567 /**
1568 * You are not supposed, normally, to include these
1569 * headers directly. Instead you should either include intrin.h
1570 * or x86intrin.h. However, when compiling with clang
1571 * under Windows (i.e., when _MSC_VER is set), these headers
1572 * only get included *if* the corresponding features are detected
1573 * from macros:
1574 * e.g., if __AVX2__ is set... in turn, we normally set these
1575 * macros by compiling against the corresponding architecture
1576 * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
1577 * software with these advanced instructions. In simdutf, we
1578 * want to compile the whole program for a generic target,
1579 * and only target our specific kernels. As a workaround,
1580 * we directly include the needed headers. These headers would
1581 * normally guard against such usage, but we carefully included
1582 * <x86intrin.h> (or <intrin.h>) before, so the headers
1583 * are fooled.
1584 */
1585 #include <bmiintrin.h> // for _blsr_u64
1586 #include <lzcntintrin.h> // for __lzcnt64
1587 #include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
1588 #include <smmintrin.h>
1589 #include <tmmintrin.h>
1590 #include <avxintrin.h>
1591 #include <avx2intrin.h>
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
// has it as a macro.
#ifndef _blsr_u64
// We roll our own "reset lowest set bit": n & (n - 1) clears the least
// significant 1-bit of n and yields 0 when n is 0.
// The argument is parenthesized at every use so that expressions built from
// low-precedence operators (e.g. `a | b`) expand correctly.
// Note: the argument is evaluated twice, so it must not have side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
1598 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
1599
1600 #endif // SIMDUTF_HASWELL_INTRINSICS_H
1601 /* end file src/simdutf/haswell/intrinsics.h */
1602
1603 //
1604 // The rest need to be inside the region
1605 //
1606 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
1607 /* begin file src/simdutf/haswell/begin.h */
1608 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
1609 // #define SIMDUTF_IMPLEMENTATION haswell
1610
1611 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
1612 // nothing needed.
1613 #else
1614 SIMDUTF_TARGET_HASWELL
1615 #endif
1616
1617 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1618 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
1619 #endif // end of workaround
1620 /* end file src/simdutf/haswell/begin.h */
1621 // Declarations
1622 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h
1623 /* begin file src/simdutf/haswell/bitmanipulation.h */
1624 #ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
1625 #define SIMDUTF_HASWELL_BITMANIPULATION_H
1626
1627 namespace simdutf {
1628 namespace haswell {
1629 namespace {
1630
1631 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)1632 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
1633 // note: we do not support legacy 32-bit Windows
1634 return __popcnt64(input_num);// Visual Studio wants two underscores
1635 }
1636 #else
1637 simdutf_really_inline long long int count_ones(uint64_t input_num) {
1638 return _popcnt64(input_num);
1639 }
1640 #endif
1641
1642 } // unnamed namespace
1643 } // namespace haswell
1644 } // namespace simdutf
1645
1646 #endif // SIMDUTF_HASWELL_BITMANIPULATION_H
1647 /* end file src/simdutf/haswell/bitmanipulation.h */
1648 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h
1649 /* begin file src/simdutf/haswell/simd.h */
1650 #ifndef SIMDUTF_HASWELL_SIMD_H
1651 #define SIMDUTF_HASWELL_SIMD_H
1652
1653
1654 namespace simdutf {
1655 namespace haswell {
1656 namespace {
1657 namespace simd {
1658
1659 // Forward-declared so they can be used by splat and friends.
1660 template<typename Child>
1661 struct base {
1662 __m256i value;
1663
1664 // Zero constructor
basesimdutf::haswell::__anone55652eb0911::simd::base1665 simdutf_really_inline base() : value{__m256i()} {}
1666
1667 // Conversion from SIMD register
basesimdutf::haswell::__anone55652eb0911::simd::base1668 simdutf_really_inline base(const __m256i _value) : value(_value) {}
1669 // Conversion to SIMD register
operator const __m256i&simdutf::haswell::__anone55652eb0911::simd::base1670 simdutf_really_inline operator const __m256i&() const { return this->value; }
operator __m256i&simdutf::haswell::__anone55652eb0911::simd::base1671 simdutf_really_inline operator __m256i&() { return this->value; }
1672 template <endianness big_endian>
store_ascii_as_utf16simdutf::haswell::__anone55652eb0911::simd::base1673 simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
1674 __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
1675 __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
1676 if (big_endian) {
1677 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
1678 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
1679 first = _mm256_shuffle_epi8(first, swap);
1680 second = _mm256_shuffle_epi8(second, swap);
1681 }
1682 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
1683 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
1684 }
store_ascii_as_utf32simdutf::haswell::__anone55652eb0911::simd::base1685 simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
1686 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
1687 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
1688 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
1689 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
1690 }
1691 // Bit operations
operator |simdutf::haswell::__anone55652eb0911::simd::base1692 simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
operator &simdutf::haswell::__anone55652eb0911::simd::base1693 simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
operator ^simdutf::haswell::__anone55652eb0911::simd::base1694 simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
bit_andnotsimdutf::haswell::__anone55652eb0911::simd::base1695 simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
operator |=simdutf::haswell::__anone55652eb0911::simd::base1696 simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::haswell::__anone55652eb0911::simd::base1697 simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::haswell::__anone55652eb0911::simd::base1698 simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
1699 };
1700
1701 // Forward-declared so they can be used by splat and friends.
1702 template<typename T>
1703 struct simd8;
1704
1705 template<typename T, typename Mask=simd8<bool>>
1706 struct base8: base<simd8<T>> {
1707 typedef uint32_t bitmask_t;
1708 typedef uint64_t bitmask2_t;
1709
base8simdutf::haswell::__anone55652eb0911::simd::base81710 simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::haswell::__anone55652eb0911::simd::base81711 simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
firstsimdutf::haswell::__anone55652eb0911::simd::base81712 simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); }
lastsimdutf::haswell::__anone55652eb0911::simd::base81713 simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); }
operator ==(const simd8<T> lhs,const simd8<T> rhs)1714 friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
1715
1716 static const int SIZE = sizeof(base<T>::value);
1717
1718 template<int N=1>
prevsimdutf::haswell::__anone55652eb0911::simd::base81719 simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
1720 return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
1721 }
1722 };
1723
1724 // SIMD byte mask type (returned by things like eq and gt)
1725 template<>
1726 struct simd8<bool>: base8<bool> {
splatsimdutf::haswell::__anone55652eb0911::simd::simd81727 static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
1728
simd8simdutf::haswell::__anone55652eb0911::simd::simd81729 simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::haswell::__anone55652eb0911::simd::simd81730 simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
1731 // Splat constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81732 simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
1733
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd81734 simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); }
anysimdutf::haswell::__anone55652eb0911::simd::simd81735 simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
nonesimdutf::haswell::__anone55652eb0911::simd::simd81736 simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); }
allsimdutf::haswell::__anone55652eb0911::simd::simd81737 simdutf_really_inline bool all() const { return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; }
operator ~simdutf::haswell::__anone55652eb0911::simd::simd81738 simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
1739 };
1740
1741 template<typename T>
1742 struct base8_numeric: base8<T> {
splatsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1743 static simdutf_really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
zerosimdutf::haswell::__anone55652eb0911::simd::base8_numeric1744 static simdutf_really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
loadsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1745 static simdutf_really_inline simd8<T> load(const T values[32]) {
1746 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
1747 }
1748 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anone55652eb0911::simd::base8_numeric1749 static simdutf_really_inline simd8<T> repeat_16(
1750 T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1751 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
1752 ) {
1753 return simd8<T>(
1754 v0, v1, v2, v3, v4, v5, v6, v7,
1755 v8, v9, v10,v11,v12,v13,v14,v15,
1756 v0, v1, v2, v3, v4, v5, v6, v7,
1757 v8, v9, v10,v11,v12,v13,v14,v15
1758 );
1759 }
1760
base8_numericsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1761 simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1762 simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
1763
1764 // Store to array
storesimdutf::haswell::__anone55652eb0911::simd::base8_numeric1765 simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
1766
1767 // Addition/subtraction are the same for signed and unsigned
operator +simdutf::haswell::__anone55652eb0911::simd::base8_numeric1768 simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
operator -simdutf::haswell::__anone55652eb0911::simd::base8_numeric1769 simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
operator +=simdutf::haswell::__anone55652eb0911::simd::base8_numeric1770 simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
operator -=simdutf::haswell::__anone55652eb0911::simd::base8_numeric1771 simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
1772
1773 // Override to distinguish from bool version
operator ~simdutf::haswell::__anone55652eb0911::simd::base8_numeric1774 simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
1775
1776 // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
1777 template<typename L>
lookup_16simdutf::haswell::__anone55652eb0911::simd::base8_numeric1778 simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
1779 return _mm256_shuffle_epi8(lookup_table, *this);
1780 }
1781
1782 template<typename L>
lookup_16simdutf::haswell::__anone55652eb0911::simd::base8_numeric1783 simdutf_really_inline simd8<L> lookup_16(
1784 L replace0, L replace1, L replace2, L replace3,
1785 L replace4, L replace5, L replace6, L replace7,
1786 L replace8, L replace9, L replace10, L replace11,
1787 L replace12, L replace13, L replace14, L replace15) const {
1788 return lookup_16(simd8<L>::repeat_16(
1789 replace0, replace1, replace2, replace3,
1790 replace4, replace5, replace6, replace7,
1791 replace8, replace9, replace10, replace11,
1792 replace12, replace13, replace14, replace15
1793 ));
1794 }
1795 };
1796
1797
1798 // Signed bytes
1799 template<>
1800 struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::haswell::__anone55652eb0911::simd::simd81801 simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::haswell::__anone55652eb0911::simd::simd81802 simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
1803
1804 // Splat constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81805 simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
1806 // Array constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81807 simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
1808 simdutf_really_inline operator simd8<uint8_t>() const;
1809 // Member-by-member initialization
simd8simdutf::haswell::__anone55652eb0911::simd::simd81810 simdutf_really_inline simd8(
1811 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
1812 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
1813 int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
1814 int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
1815 ) : simd8(_mm256_setr_epi8(
1816 v0, v1, v2, v3, v4, v5, v6, v7,
1817 v8, v9, v10,v11,v12,v13,v14,v15,
1818 v16,v17,v18,v19,v20,v21,v22,v23,
1819 v24,v25,v26,v27,v28,v29,v30,v31
1820 )) {}
1821 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anone55652eb0911::simd::simd81822 simdutf_really_inline static simd8<int8_t> repeat_16(
1823 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
1824 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
1825 ) {
1826 return simd8<int8_t>(
1827 v0, v1, v2, v3, v4, v5, v6, v7,
1828 v8, v9, v10,v11,v12,v13,v14,v15,
1829 v0, v1, v2, v3, v4, v5, v6, v7,
1830 v8, v9, v10,v11,v12,v13,v14,v15
1831 );
1832 }
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd81833 simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
1834 // Order-sensitive comparisons
max_valsimdutf::haswell::__anone55652eb0911::simd::simd81835 simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd81836 simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
operator >simdutf::haswell::__anone55652eb0911::simd::simd81837 simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd81838 simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
1839 };
1840
1841 // Unsigned bytes
1842 template<>
1843 struct simd8<uint8_t>: base8_numeric<uint8_t> {
simd8simdutf::haswell::__anone55652eb0911::simd::simd81844 simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::haswell::__anone55652eb0911::simd::simd81845 simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
1846 // Splat constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81847 simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
1848 // Array constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81849 simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
1850 // Member-by-member initialization
simd8simdutf::haswell::__anone55652eb0911::simd::simd81851 simdutf_really_inline simd8(
1852 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
1853 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
1854 uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
1855 uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
1856 ) : simd8(_mm256_setr_epi8(
1857 v0, v1, v2, v3, v4, v5, v6, v7,
1858 v8, v9, v10,v11,v12,v13,v14,v15,
1859 v16,v17,v18,v19,v20,v21,v22,v23,
1860 v24,v25,v26,v27,v28,v29,v30,v31
1861 )) {}
1862 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anone55652eb0911::simd::simd81863 simdutf_really_inline static simd8<uint8_t> repeat_16(
1864 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
1865 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
1866 ) {
1867 return simd8<uint8_t>(
1868 v0, v1, v2, v3, v4, v5, v6, v7,
1869 v8, v9, v10,v11,v12,v13,v14,v15,
1870 v0, v1, v2, v3, v4, v5, v6, v7,
1871 v8, v9, v10,v11,v12,v13,v14,v15
1872 );
1873 }
1874
1875
1876 // Saturated math
saturating_addsimdutf::haswell::__anone55652eb0911::simd::simd81877 simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
saturating_subsimdutf::haswell::__anone55652eb0911::simd::simd81878 simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
1879
1880 // Order-specific operations
max_valsimdutf::haswell::__anone55652eb0911::simd::simd81881 simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd81882 simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
1883 // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::haswell::__anone55652eb0911::simd::simd81884 simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
1885 // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::haswell::__anone55652eb0911::simd::simd81886 simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::haswell::__anone55652eb0911::simd::simd81887 simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::haswell::__anone55652eb0911::simd::simd81888 simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::haswell::__anone55652eb0911::simd::simd81889 simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd81890 simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
1891
1892 // Bit-specific operations
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd81893 simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd81894 simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd81895 simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd81896 simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd81897 simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81898 simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81899 simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81900 simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81901 simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
1902 template<int N>
shrsimdutf::haswell::__anone55652eb0911::simd::simd81903 simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
1904 template<int N>
shlsimdutf::haswell::__anone55652eb0911::simd::simd81905 simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
1906 // Get one of the bits and make a bitmask out of it.
1907 // e.g. value.get_bit<7>() gets the high bit
1908 template<int N>
get_bitsimdutf::haswell::__anone55652eb0911::simd::simd81909 simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
1910 };
operator simd8<uint8_t>() const1911 simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
1912
1913
1914 template<typename T>
1915 struct simd8x64 {
1916 static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
1917 static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
1918 simd8<T> chunks[NUM_CHUNKS];
1919
1920 simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
1921 simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
1922 simd8x64() = delete; // no default constructor allowed
1923
simd8x64simdutf::haswell::__anone55652eb0911::simd::simd8x641924 simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
simd8x64simdutf::haswell::__anone55652eb0911::simd::simd8x641925 simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
1926
storesimdutf::haswell::__anone55652eb0911::simd::simd8x641927 simdutf_really_inline void store(T* ptr) const {
1928 this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
1929 this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
1930 }
1931
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd8x641932 simdutf_really_inline uint64_t to_bitmask() const {
1933 uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
1934 uint64_t r_hi = this->chunks[1].to_bitmask();
1935 return r_lo | (r_hi << 32);
1936 }
1937
operator |=simdutf::haswell::__anone55652eb0911::simd::simd8x641938 simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T> &other) {
1939 this->chunks[0] |= other.chunks[0];
1940 this->chunks[1] |= other.chunks[1];
1941 return *this;
1942 }
1943
reduce_orsimdutf::haswell::__anone55652eb0911::simd::simd8x641944 simdutf_really_inline simd8<T> reduce_or() const {
1945 return this->chunks[0] | this->chunks[1];
1946 }
1947
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd8x641948 simdutf_really_inline bool is_ascii() const {
1949 return this->reduce_or().is_ascii();
1950 }
1951
1952 template <endianness endian>
store_ascii_as_utf16simdutf::haswell::__anone55652eb0911::simd::simd8x641953 simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
1954 this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
1955 this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
1956 }
1957
store_ascii_as_utf32simdutf::haswell::__anone55652eb0911::simd::simd8x641958 simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
1959 this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
1960 this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
1961 }
1962
bit_orsimdutf::haswell::__anone55652eb0911::simd::simd8x641963 simdutf_really_inline simd8x64<T> bit_or(const T m) const {
1964 const simd8<T> mask = simd8<T>::splat(m);
1965 return simd8x64<T>(
1966 this->chunks[0] | mask,
1967 this->chunks[1] | mask
1968 );
1969 }
1970
eqsimdutf::haswell::__anone55652eb0911::simd::simd8x641971 simdutf_really_inline uint64_t eq(const T m) const {
1972 const simd8<T> mask = simd8<T>::splat(m);
1973 return simd8x64<bool>(
1974 this->chunks[0] == mask,
1975 this->chunks[1] == mask
1976 ).to_bitmask();
1977 }
1978
eqsimdutf::haswell::__anone55652eb0911::simd::simd8x641979 simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
1980 return simd8x64<bool>(
1981 this->chunks[0] == other.chunks[0],
1982 this->chunks[1] == other.chunks[1]
1983 ).to_bitmask();
1984 }
1985
lteqsimdutf::haswell::__anone55652eb0911::simd::simd8x641986 simdutf_really_inline uint64_t lteq(const T m) const {
1987 const simd8<T> mask = simd8<T>::splat(m);
1988 return simd8x64<bool>(
1989 this->chunks[0] <= mask,
1990 this->chunks[1] <= mask
1991 ).to_bitmask();
1992 }
1993
in_rangesimdutf::haswell::__anone55652eb0911::simd::simd8x641994 simdutf_really_inline uint64_t in_range(const T low, const T high) const {
1995 const simd8<T> mask_low = simd8<T>::splat(low);
1996 const simd8<T> mask_high = simd8<T>::splat(high);
1997
1998 return simd8x64<bool>(
1999 (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2000 (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
2001 (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
2002 (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
2003 ).to_bitmask();
2004 }
not_in_rangesimdutf::haswell::__anone55652eb0911::simd::simd8x642005 simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2006 const simd8<T> mask_low = simd8<T>::splat(low);
2007 const simd8<T> mask_high = simd8<T>::splat(high);
2008 return simd8x64<bool>(
2009 (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
2010 (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
2011 ).to_bitmask();
2012 }
ltsimdutf::haswell::__anone55652eb0911::simd::simd8x642013 simdutf_really_inline uint64_t lt(const T m) const {
2014 const simd8<T> mask = simd8<T>::splat(m);
2015 return simd8x64<bool>(
2016 this->chunks[0] < mask,
2017 this->chunks[1] < mask
2018 ).to_bitmask();
2019 }
2020
gtsimdutf::haswell::__anone55652eb0911::simd::simd8x642021 simdutf_really_inline uint64_t gt(const T m) const {
2022 const simd8<T> mask = simd8<T>::splat(m);
2023 return simd8x64<bool>(
2024 this->chunks[0] > mask,
2025 this->chunks[1] > mask
2026 ).to_bitmask();
2027 }
gteqsimdutf::haswell::__anone55652eb0911::simd::simd8x642028 simdutf_really_inline uint64_t gteq(const T m) const {
2029 const simd8<T> mask = simd8<T>::splat(m);
2030 return simd8x64<bool>(
2031 this->chunks[0] >= mask,
2032 this->chunks[1] >= mask
2033 ).to_bitmask();
2034 }
gteq_unsignedsimdutf::haswell::__anone55652eb0911::simd::simd8x642035 simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
2036 const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
2037 return simd8x64<bool>(
2038 (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
2039 (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
2040 ).to_bitmask();
2041 }
2042 }; // struct simd8x64<T>
2043
2044 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h
2045 /* begin file src/simdutf/haswell/simd16-inl.h */
// Fallback definitions used when the compiler reports __GNUC__ < 8.
// Both build a 256-bit register from two 128-bit halves: with control value
// 2, the permute takes the result's low 128 bits from the second operand and
// its high 128 bits from the first operand.
#if defined(__GNUC__) && __GNUC__ < 8
// _mm256_set_m128i(hi, lo): the first argument becomes the high half.
#define _mm256_set_m128i(hi, lo) _mm256_permute2f128_si256(_mm256_castsi128_si256(hi), _mm256_castsi128_si256(lo), 2)
// _mm256_setr_m128i(lo, hi): the first argument becomes the low half.
#define _mm256_setr_m128i(lo, hi) _mm256_permute2f128_si256(_mm256_castsi128_si256(hi), _mm256_castsi128_si256(lo), 2)
#endif
2052
2053 template<typename T>
2054 struct simd16;
2055
2056 template<typename T, typename Mask=simd16<bool>>
2057 struct base16: base<simd16<T>> {
2058 using bitmask_type = uint32_t;
2059
base16simdutf::haswell::__anone55652eb0911::simd::base162060 simdutf_really_inline base16() : base<simd16<T>>() {}
base16simdutf::haswell::__anone55652eb0911::simd::base162061 simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
2062 template <typename Pointer>
base16simdutf::haswell::__anone55652eb0911::simd::base162063 simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
operator ==(const simd16<T> lhs,const simd16<T> rhs)2064 friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); }
2065
2066 /// the size of vector in bytes
2067 static const int SIZE = sizeof(base<simd16<T>>::value);
2068
2069 /// the number of elements of type T a vector can hold
2070 static const int ELEMENTS = SIZE / sizeof(T);
2071
2072 template<int N=1>
prevsimdutf::haswell::__anone55652eb0911::simd::base162073 simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
2074 return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
2075 }
2076 };
2077
2078 // SIMD byte mask type (returned by things like eq and gt)
2079 template<>
2080 struct simd16<bool>: base16<bool> {
splatsimdutf::haswell::__anone55652eb0911::simd::simd162081 static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
2082
simd16simdutf::haswell::__anone55652eb0911::simd::simd162083 simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162084 simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
2085 // Splat constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162086 simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
2087
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd162088 simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
anysimdutf::haswell::__anone55652eb0911::simd::simd162089 simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
operator ~simdutf::haswell::__anone55652eb0911::simd::simd162090 simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
2091 };
2092
2093 template<typename T>
2094 struct base16_numeric: base16<T> {
splatsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2095 static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
zerosimdutf::haswell::__anone55652eb0911::simd::base16_numeric2096 static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
loadsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2097 static simdutf_really_inline simd16<T> load(const T values[8]) {
2098 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
2099 }
2100
base16_numericsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2101 simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2102 simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
2103
2104 // Store to array
storesimdutf::haswell::__anone55652eb0911::simd::base16_numeric2105 simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
2106
2107 // Override to distinguish from bool version
operator ~simdutf::haswell::__anone55652eb0911::simd::base16_numeric2108 simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
2109
2110 // Addition/subtraction are the same for signed and unsigned
operator +simdutf::haswell::__anone55652eb0911::simd::base16_numeric2111 simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
operator -simdutf::haswell::__anone55652eb0911::simd::base16_numeric2112 simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
operator +=simdutf::haswell::__anone55652eb0911::simd::base16_numeric2113 simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::haswell::__anone55652eb0911::simd::base16_numeric2114 simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
2115 };
2116
2117 // Signed words
2118 template<>
2119 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::haswell::__anone55652eb0911::simd::simd162120 simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162121 simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
2122 // Splat constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162123 simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
2124 // Array constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162125 simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162126 simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
2127 // Order-sensitive comparisons
max_valsimdutf::haswell::__anone55652eb0911::simd::simd162128 simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd162129 simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
operator >simdutf::haswell::__anone55652eb0911::simd::simd162130 simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd162131 simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
2132 };
2133
2134 // Unsigned words
2135 template<>
2136 struct simd16<uint16_t>: base16_numeric<uint16_t> {
simd16simdutf::haswell::__anone55652eb0911::simd::simd162137 simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162138 simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
2139
2140 // Splat constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162141 simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
2142 // Array constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162143 simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162144 simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
2145
2146 // Saturated math
saturating_addsimdutf::haswell::__anone55652eb0911::simd::simd162147 simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
saturating_subsimdutf::haswell::__anone55652eb0911::simd::simd162148 simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
2149
2150 // Order-specific operations
max_valsimdutf::haswell::__anone55652eb0911::simd::simd162151 simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd162152 simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
2153 // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::haswell::__anone55652eb0911::simd::simd162154 simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
2155 // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::haswell::__anone55652eb0911::simd::simd162156 simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::haswell::__anone55652eb0911::simd::simd162157 simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::haswell::__anone55652eb0911::simd::simd162158 simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::haswell::__anone55652eb0911::simd::simd162159 simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd162160 simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
2161
2162 // Bit-specific operations
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd162163 simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd162164 simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd162165 simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd162166 simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
2167
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162168 simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162169 simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162170 simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162171 simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
2172 template<int N>
shrsimdutf::haswell::__anone55652eb0911::simd::simd162173 simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
2174 template<int N>
shlsimdutf::haswell::__anone55652eb0911::simd::simd162175 simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
2176 // Get one of the bits and make a bitmask out of it.
2177 // e.g. value.get_bit<7>() gets the high bit
2178 template<int N>
get_bitsimdutf::haswell::__anone55652eb0911::simd::simd162179 simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); }
2180
2181 // Change the endianness
swap_bytessimdutf::haswell::__anone55652eb0911::simd::simd162182 simdutf_really_inline simd16<uint16_t> swap_bytes() const {
2183 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
2184 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
2185 return _mm256_shuffle_epi8(*this, swap);
2186 }
2187
2188 // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
packsimdutf::haswell::__anone55652eb0911::simd::simd162189 static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
2190 // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
2191 // we have to shuffle lanes in order to produce bytes in the
2192 // correct order.
2193
2194 // get the 0th lanes
2195 const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
2196 const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
2197
2198 // get the 1st lanes
2199 const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
2200 const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
2201
2202 // build new vectors (shuffle lanes)
2203 const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
2204 const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
2205
2206 // pack words in linear order from v0 and v1
2207 return _mm256_packus_epi16(t0, t1);
2208 }
2209 };
2210
2211
2212 template<typename T>
2213 struct simd16x32 {
2214 static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
2215 static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
2216 simd16<T> chunks[NUM_CHUNKS];
2217
2218 simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
2219 simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
2220 simd16x32() = delete; // no default constructor allowed
2221
simd16x32simdutf::haswell::__anone55652eb0911::simd::simd16x322222 simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) : chunks{chunk0, chunk1} {}
simd16x32simdutf::haswell::__anone55652eb0911::simd::simd16x322223 simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T))} {}
2224
storesimdutf::haswell::__anone55652eb0911::simd::simd16x322225 simdutf_really_inline void store(T* ptr) const {
2226 this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
2227 this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
2228 }
2229
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd16x322230 simdutf_really_inline uint64_t to_bitmask() const {
2231 uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
2232 uint64_t r_hi = this->chunks[1].to_bitmask();
2233 return r_lo | (r_hi << 32);
2234 }
2235
reduce_orsimdutf::haswell::__anone55652eb0911::simd::simd16x322236 simdutf_really_inline simd16<T> reduce_or() const {
2237 return this->chunks[0] | this->chunks[1];
2238 }
2239
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd16x322240 simdutf_really_inline bool is_ascii() const {
2241 return this->reduce_or().is_ascii();
2242 }
2243
store_ascii_as_utf16simdutf::haswell::__anone55652eb0911::simd::simd16x322244 simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
2245 this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
2246 this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>));
2247 }
2248
bit_orsimdutf::haswell::__anone55652eb0911::simd::simd16x322249 simdutf_really_inline simd16x32<T> bit_or(const T m) const {
2250 const simd16<T> mask = simd16<T>::splat(m);
2251 return simd16x32<T>(
2252 this->chunks[0] | mask,
2253 this->chunks[1] | mask
2254 );
2255 }
2256
swap_bytessimdutf::haswell::__anone55652eb0911::simd::simd16x322257 simdutf_really_inline void swap_bytes() {
2258 this->chunks[0] = this->chunks[0].swap_bytes();
2259 this->chunks[1] = this->chunks[1].swap_bytes();
2260 }
2261
eqsimdutf::haswell::__anone55652eb0911::simd::simd16x322262 simdutf_really_inline uint64_t eq(const T m) const {
2263 const simd16<T> mask = simd16<T>::splat(m);
2264 return simd16x32<bool>(
2265 this->chunks[0] == mask,
2266 this->chunks[1] == mask
2267 ).to_bitmask();
2268 }
2269
eqsimdutf::haswell::__anone55652eb0911::simd::simd16x322270 simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
2271 return simd16x32<bool>(
2272 this->chunks[0] == other.chunks[0],
2273 this->chunks[1] == other.chunks[1]
2274 ).to_bitmask();
2275 }
2276
lteqsimdutf::haswell::__anone55652eb0911::simd::simd16x322277 simdutf_really_inline uint64_t lteq(const T m) const {
2278 const simd16<T> mask = simd16<T>::splat(m);
2279 return simd16x32<bool>(
2280 this->chunks[0] <= mask,
2281 this->chunks[1] <= mask
2282 ).to_bitmask();
2283 }
2284
in_rangesimdutf::haswell::__anone55652eb0911::simd::simd16x322285 simdutf_really_inline uint64_t in_range(const T low, const T high) const {
2286 const simd16<T> mask_low = simd16<T>::splat(low);
2287 const simd16<T> mask_high = simd16<T>::splat(high);
2288
2289 return simd16x32<bool>(
2290 (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2291 (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
2292 (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
2293 (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
2294 ).to_bitmask();
2295 }
not_in_rangesimdutf::haswell::__anone55652eb0911::simd::simd16x322296 simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2297 const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
2298 const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
2299 return simd16x32<bool>(
2300 (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
2301 (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
2302 ).to_bitmask();
2303 }
ltsimdutf::haswell::__anone55652eb0911::simd::simd16x322304 simdutf_really_inline uint64_t lt(const T m) const {
2305 const simd16<T> mask = simd16<T>::splat(m);
2306 return simd16x32<bool>(
2307 this->chunks[0] < mask,
2308 this->chunks[1] < mask
2309 ).to_bitmask();
2310 }
2311 }; // struct simd16x32<T>
2312 /* end file src/simdutf/haswell/simd16-inl.h */
2313
2314 } // namespace simd
2315
2316 } // unnamed namespace
2317 } // namespace haswell
2318 } // namespace simdutf
2319
2320 #endif // SIMDUTF_HASWELL_SIMD_H
2321 /* end file src/simdutf/haswell/simd.h */
2322
2323 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
2324 /* begin file src/simdutf/haswell/end.h */
2325 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
2326 // nothing needed.
2327 #else
2328 SIMDUTF_UNTARGET_REGION
2329 #endif
2330
2331
2332 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
2333 SIMDUTF_POP_DISABLE_WARNINGS
2334 #endif // end of workaround
2335 /* end file src/simdutf/haswell/end.h */
2336
2337 #endif // SIMDUTF_IMPLEMENTATION_HASWELL
2338 #endif // SIMDUTF_HASWELL_COMMON_H
2339 /* end file src/simdutf/haswell.h */
2340 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere.h
2341 /* begin file src/simdutf/westmere.h */
2342 #ifndef SIMDUTF_WESTMERE_H
2343 #define SIMDUTF_WESTMERE_H
2344
2345 #ifdef SIMDUTF_FALLBACK_H
2346 #error "westmere.h must be included before fallback.h"
2347 #endif
2348
2349
2350 // Default Westmere to on if this is x86-64, unless we'll always select Icelake or Haswell.
2351 #ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
2352 //
2353 // You do not want to set it to (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL)
2354 // because you want to rely on runtime dispatch!
2355 //
2356 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL
2357 #define SIMDUTF_IMPLEMENTATION_WESTMERE 0
2358 #else
2359 #define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
2360 #endif
2361
2362 #endif
2363
2364 #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
2365
2366 #if SIMDUTF_IMPLEMENTATION_WESTMERE
2367
2368 #define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")
2369
2370 namespace simdutf {
2371 /**
2372 * Implementation for Westmere (Intel SSE4.2).
2373 */
2374 namespace westmere {
2375 } // namespace westmere
2376 } // namespace simdutf
2377
2378 //
2379 // These two need to be included outside SIMDUTF_TARGET_REGION
2380 //
2381 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h
2382 /* begin file src/simdutf/westmere/implementation.h */
2383 #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
2384 #define SIMDUTF_WESTMERE_IMPLEMENTATION_H
2385
2386
2387 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
2388 namespace simdutf {
2389 namespace westmere {
2390
2391 namespace {
2392 using namespace simdutf;
2393 }
2394
2395 class implementation final : public simdutf::implementation {
2396 public:
implementation()2397 simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42) {}
2398 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
2399 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
2400 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
2401 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
2402 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
2403 simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
2404 simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
2405 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
2406 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
2407 simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
2408 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
2409 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2410 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2411 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2412 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2413 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2414 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2415 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
2416 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
2417 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2418 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2419 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2420 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2421 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2422 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2423 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2424 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2425 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2426 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2427 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2428 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2429 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2430 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2431 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2432 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2433 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2434 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2435 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2436 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2437 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2438 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2439 void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
2440 simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
2441 simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
2442 simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
2443 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2444 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2445 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2446 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2447 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
2448 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2449 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2450 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
2451 };
2452
2453 } // namespace westmere
2454 } // namespace simdutf
2455
2456 #endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
2457 /* end file src/simdutf/westmere/implementation.h */
2458 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h
2459 /* begin file src/simdutf/westmere/intrinsics.h */
2460 #ifndef SIMDUTF_WESTMERE_INTRINSICS_H
2461 #define SIMDUTF_WESTMERE_INTRINSICS_H
2462
2463 #ifdef SIMDUTF_VISUAL_STUDIO
2464 // under clang within visual studio, this will include <x86intrin.h>
2465 #include <intrin.h> // visual studio or clang
2466 #else
2467
2468 #if SIMDUTF_GCC11ORMORE
2469 // We should not get warnings while including <x86intrin.h> yet we do
2470 // under some versions of GCC.
2471 // If the x86intrin.h header has uninitialized values that are problematic,
2472 // it is a GCC issue, we want to ignore these warnings.
2473 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
2474 #endif
2475
2476 #include <x86intrin.h> // elsewhere
2477
2478
2479 #if SIMDUTF_GCC11ORMORE
2480 // cancels the suppression of the -Wuninitialized
2481 SIMDUTF_POP_DISABLE_WARNINGS
2482 #endif
2483
2484 #endif // SIMDUTF_VISUAL_STUDIO
2485
2486
2487 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
2488 /**
2489 * You are not supposed, normally, to include these
2490 * headers directly. Instead you should either include intrin.h
2491 * or x86intrin.h. However, when compiling with clang
2492 * under Windows (i.e., when _MSC_VER is set), these headers
2493 * only get included *if* the corresponding features are detected
2494 * from macros:
2495 */
2496 #include <smmintrin.h> // for _mm_alignr_epi8
2497 #endif
2498
2499
2500
2501 #endif // SIMDUTF_WESTMERE_INTRINSICS_H
2502 /* end file src/simdutf/westmere/intrinsics.h */
2503
2504 //
2505 // The rest need to be inside the region
2506 //
2507 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
2508 /* begin file src/simdutf/westmere/begin.h */
2509 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
2510 // #define SIMDUTF_IMPLEMENTATION westmere
2511
2512 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
2513 // nothing needed.
2514 #else
2515 SIMDUTF_TARGET_WESTMERE
2516 #endif
2517 /* end file src/simdutf/westmere/begin.h */
2518
2519 // Declarations
2520 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h
2521 /* begin file src/simdutf/westmere/bitmanipulation.h */
2522 #ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
2523 #define SIMDUTF_WESTMERE_BITMANIPULATION_H
2524
2525 namespace simdutf {
2526 namespace westmere {
2527 namespace {
2528
2529 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)2530 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
2531 // note: we do not support legacy 32-bit Windows
2532 return __popcnt64(input_num);// Visual Studio wants two underscores
2533 }
2534 #else
2535 simdutf_really_inline long long int count_ones(uint64_t input_num) {
2536 return _popcnt64(input_num);
2537 }
2538 #endif
2539
2540 } // unnamed namespace
2541 } // namespace westmere
2542 } // namespace simdutf
2543
2544 #endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
2545 /* end file src/simdutf/westmere/bitmanipulation.h */
2546 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h
2547 /* begin file src/simdutf/westmere/simd.h */
2548 #ifndef SIMDUTF_WESTMERE_SIMD_H
2549 #define SIMDUTF_WESTMERE_SIMD_H
2550
2551 namespace simdutf {
2552 namespace westmere {
2553 namespace {
2554 namespace simd {
2555
2556 template<typename Child>
2557 struct base {
2558 __m128i value;
2559
2560 // Zero constructor
basesimdutf::westmere::__anone55652eb0c11::simd::base2561 simdutf_really_inline base() : value{__m128i()} {}
2562
2563 // Conversion from SIMD register
basesimdutf::westmere::__anone55652eb0c11::simd::base2564 simdutf_really_inline base(const __m128i _value) : value(_value) {}
2565 // Conversion to SIMD register
operator const __m128i&simdutf::westmere::__anone55652eb0c11::simd::base2566 simdutf_really_inline operator const __m128i&() const { return this->value; }
operator __m128i&simdutf::westmere::__anone55652eb0c11::simd::base2567 simdutf_really_inline operator __m128i&() { return this->value; }
2568 template <endianness big_endian>
store_ascii_as_utf16simdutf::westmere::__anone55652eb0c11::simd::base2569 simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
2570 __m128i first = _mm_cvtepu8_epi16(*this);
2571 __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
2572 if (big_endian) {
2573 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
2574 first = _mm_shuffle_epi8(first, swap);
2575 second = _mm_shuffle_epi8(second, swap);
2576 }
2577 _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
2578 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
2579 }
store_ascii_as_utf32simdutf::westmere::__anone55652eb0c11::simd::base2580 simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
2581 _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
2582 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
2583 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
2584 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
2585 }
2586 // Bit operations
operator |simdutf::westmere::__anone55652eb0c11::simd::base2587 simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
operator &simdutf::westmere::__anone55652eb0c11::simd::base2588 simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
operator ^simdutf::westmere::__anone55652eb0c11::simd::base2589 simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
bit_andnotsimdutf::westmere::__anone55652eb0c11::simd::base2590 simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
operator |=simdutf::westmere::__anone55652eb0c11::simd::base2591 simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::westmere::__anone55652eb0c11::simd::base2592 simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::westmere::__anone55652eb0c11::simd::base2593 simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
2594 };
2595
2596 // Forward-declared so they can be used by splat and friends.
2597 template<typename T>
2598 struct simd8;
2599
2600 template<typename T, typename Mask=simd8<bool>>
2601 struct base8: base<simd8<T>> {
2602 typedef uint16_t bitmask_t;
2603 typedef uint32_t bitmask2_t;
2604
firstsimdutf::westmere::__anone55652eb0c11::simd::base82605 simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); }
lastsimdutf::westmere::__anone55652eb0c11::simd::base82606 simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); }
base8simdutf::westmere::__anone55652eb0c11::simd::base82607 simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::westmere::__anone55652eb0c11::simd::base82608 simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
2609
operator ==(const simd8<T> lhs,const simd8<T> rhs)2610 friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
2611
2612 static const int SIZE = sizeof(base<simd8<T>>::value);
2613
2614 template<int N=1>
prevsimdutf::westmere::__anone55652eb0c11::simd::base82615 simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
2616 return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
2617 }
2618 };
2619
2620 // SIMD byte mask type (returned by things like eq and gt)
2621 template<>
2622 struct simd8<bool>: base8<bool> {
splatsimdutf::westmere::__anone55652eb0c11::simd::simd82623 static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
2624
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82625 simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82626 simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
2627 // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82628 simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
2629
to_bitmasksimdutf::westmere::__anone55652eb0c11::simd::simd82630 simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
anysimdutf::westmere::__anone55652eb0c11::simd::simd82631 simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
nonesimdutf::westmere::__anone55652eb0c11::simd::simd82632 simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); }
allsimdutf::westmere::__anone55652eb0c11::simd::simd82633 simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; }
operator ~simdutf::westmere::__anone55652eb0c11::simd::simd82634 simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
2635 };
2636
2637 template<typename T>
2638 struct base8_numeric: base8<T> {
splatsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2639 static simdutf_really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
zerosimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2640 static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
loadsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2641 static simdutf_really_inline simd8<T> load(const T values[16]) {
2642 return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2643 }
2644 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2645 static simdutf_really_inline simd8<T> repeat_16(
2646 T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
2647 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
2648 ) {
2649 return simd8<T>(
2650 v0, v1, v2, v3, v4, v5, v6, v7,
2651 v8, v9, v10,v11,v12,v13,v14,v15
2652 );
2653 }
2654
base8_numericsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2655 simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2656 simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
2657
2658 // Store to array
storesimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2659 simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
2660
2661 // Override to distinguish from bool version
operator ~simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2662 simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
2663
2664 // Addition/subtraction are the same for signed and unsigned
operator +simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2665 simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
operator -simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2666 simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
operator +=simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2667 simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
operator -=simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2668 simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
2669
2670 // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
2671 template<typename L>
lookup_16simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2672 simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
2673 return _mm_shuffle_epi8(lookup_table, *this);
2674 }
2675
2676 template<typename L>
lookup_16simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2677 simdutf_really_inline simd8<L> lookup_16(
2678 L replace0, L replace1, L replace2, L replace3,
2679 L replace4, L replace5, L replace6, L replace7,
2680 L replace8, L replace9, L replace10, L replace11,
2681 L replace12, L replace13, L replace14, L replace15) const {
2682 return lookup_16(simd8<L>::repeat_16(
2683 replace0, replace1, replace2, replace3,
2684 replace4, replace5, replace6, replace7,
2685 replace8, replace9, replace10, replace11,
2686 replace12, replace13, replace14, replace15
2687 ));
2688 }
2689 };
2690
2691 // Signed bytes
2692 template<>
2693 struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82694 simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82695 simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
2696 // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82697 simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
2698 // Array constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82699 simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
2700 // Member-by-member initialization
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82701 simdutf_really_inline simd8(
2702 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
2703 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
2704 ) : simd8(_mm_setr_epi8(
2705 v0, v1, v2, v3, v4, v5, v6, v7,
2706 v8, v9, v10,v11,v12,v13,v14,v15
2707 )) {}
2708 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::simd82709 simdutf_really_inline static simd8<int8_t> repeat_16(
2710 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
2711 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
2712 ) {
2713 return simd8<int8_t>(
2714 v0, v1, v2, v3, v4, v5, v6, v7,
2715 v8, v9, v10,v11,v12,v13,v14,v15
2716 );
2717 }
2718 simdutf_really_inline operator simd8<uint8_t>() const;
is_asciisimdutf::westmere::__anone55652eb0c11::simd::simd82719 simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
2720
2721 // Order-sensitive comparisons
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd82722 simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd82723 simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
operator >simdutf::westmere::__anone55652eb0c11::simd::simd82724 simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
operator <simdutf::westmere::__anone55652eb0c11::simd::simd82725 simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
2726 };
2727
2728 // Unsigned bytes
2729 template<>
2730 struct simd8<uint8_t>: base8_numeric<uint8_t> {
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82731 simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82732 simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
2733
2734 // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82735 simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
2736 // Array constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82737 simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
2738 // Member-by-member initialization
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82739 simdutf_really_inline simd8(
2740 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
2741 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
2742 ) : simd8(_mm_setr_epi8(
2743 v0, v1, v2, v3, v4, v5, v6, v7,
2744 v8, v9, v10,v11,v12,v13,v14,v15
2745 )) {}
2746 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::simd82747 simdutf_really_inline static simd8<uint8_t> repeat_16(
2748 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
2749 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
2750 ) {
2751 return simd8<uint8_t>(
2752 v0, v1, v2, v3, v4, v5, v6, v7,
2753 v8, v9, v10,v11,v12,v13,v14,v15
2754 );
2755 }
2756
2757 // Saturated math
saturating_addsimdutf::westmere::__anone55652eb0c11::simd::simd82758 simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
saturating_subsimdutf::westmere::__anone55652eb0c11::simd::simd82759 simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
2760
2761 // Order-specific operations
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd82762 simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd82763 simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
2764 // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82765 simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
2766 // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82767 simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anone55652eb0c11::simd::simd82768 simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anone55652eb0c11::simd::simd82769 simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::westmere::__anone55652eb0c11::simd::simd82770 simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::westmere::__anone55652eb0c11::simd::simd82771 simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
2772
2773 // Bit-specific operations
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd82774 simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd82775 simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd82776 simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd82777 simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
is_asciisimdutf::westmere::__anone55652eb0c11::simd::simd82778 simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
2779
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82780 simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82781 simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82782 simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82783 simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
2784 template<int N>
shrsimdutf::westmere::__anone55652eb0c11::simd::simd82785 simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
2786 template<int N>
shlsimdutf::westmere::__anone55652eb0c11::simd::simd82787 simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
2788 // Get one of the bits and make a bitmask out of it.
2789 // e.g. value.get_bit<7>() gets the high bit
2790 template<int N>
get_bitsimdutf::westmere::__anone55652eb0c11::simd::simd82791 simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
2792 };
operator simd8<uint8_t>() const2793 simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
2794
2795 // Unsigned bytes
2796 template<>
2797 struct simd8<uint16_t>: base<uint16_t> {
splatsimdutf::westmere::__anone55652eb0c11::simd::simd82798 static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { return _mm_set1_epi16(_value); }
loadsimdutf::westmere::__anone55652eb0c11::simd::simd82799 static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) {
2800 return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2801 }
2802
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82803 simdutf_really_inline simd8() : base<uint16_t>() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82804 simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
2805 // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82806 simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
2807 // Array constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82808 simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {}
2809 // Member-by-member initialization
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82810 simdutf_really_inline simd8(
2811 uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
2812 ) : simd8(_mm_setr_epi16(
2813 v0, v1, v2, v3, v4, v5, v6, v7
2814 )) {}
2815
2816 // Saturated math
saturating_addsimdutf::westmere::__anone55652eb0c11::simd::simd82817 simdutf_really_inline simd8<uint16_t> saturating_add(const simd8<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
saturating_subsimdutf::westmere::__anone55652eb0c11::simd::simd82818 simdutf_really_inline simd8<uint16_t> saturating_sub(const simd8<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
2819
2820 // Order-specific operations
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd82821 simdutf_really_inline simd8<uint16_t> max_val(const simd8<uint16_t> other) const { return _mm_max_epu16(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd82822 simdutf_really_inline simd8<uint16_t> min_val(const simd8<uint16_t> other) const { return _mm_min_epu16(*this, other); }
2823 // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82824 simdutf_really_inline simd8<uint16_t> gt_bits(const simd8<uint16_t> other) const { return this->saturating_sub(other); }
2825 // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82826 simdutf_really_inline simd8<uint16_t> lt_bits(const simd8<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anone55652eb0c11::simd::simd82827 simdutf_really_inline simd8<bool> operator<=(const simd8<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anone55652eb0c11::simd::simd82828 simdutf_really_inline simd8<bool> operator>=(const simd8<uint16_t> other) const { return other.min_val(*this) == other; }
operator ==simdutf::westmere::__anone55652eb0c11::simd::simd82829 simdutf_really_inline simd8<bool> operator==(const simd8<uint16_t> other) const { return _mm_cmpeq_epi16(*this, other); }
operator &simdutf::westmere::__anone55652eb0c11::simd::simd82830 simdutf_really_inline simd8<bool> operator&(const simd8<uint16_t> other) const { return _mm_and_si128(*this, other); }
operator |simdutf::westmere::__anone55652eb0c11::simd::simd82831 simdutf_really_inline simd8<bool> operator|(const simd8<uint16_t> other) const { return _mm_or_si128(*this, other); }
2832
2833 // Bit-specific operations
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd82834 simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint16_t(0); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd82835 simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
2836
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82837 simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82838 simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82839 simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82840 simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
2841 };
2842 template<typename T>
2843 struct simd8x64 {
2844 static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
2845 static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
2846 simd8<T> chunks[NUM_CHUNKS];
2847
2848 simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
2849 simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
2850 simd8x64() = delete; // no default constructor allowed
2851
simd8x64simdutf::westmere::__anone55652eb0c11::simd::simd8x642852 simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd8x64simdutf::westmere::__anone55652eb0c11::simd::simd8x642853 simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
2854
storesimdutf::westmere::__anone55652eb0c11::simd::simd8x642855 simdutf_really_inline void store(T* ptr) const {
2856 this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
2857 this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
2858 this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
2859 this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
2860 }
2861
operator |=simdutf::westmere::__anone55652eb0c11::simd::simd8x642862 simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
2863 this->chunks[0] |= other.chunks[0];
2864 this->chunks[1] |= other.chunks[1];
2865 this->chunks[2] |= other.chunks[2];
2866 this->chunks[3] |= other.chunks[3];
2867 return *this;
2868 }
2869
reduce_orsimdutf::westmere::__anone55652eb0c11::simd::simd8x642870 simdutf_really_inline simd8<T> reduce_or() const {
2871 return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
2872 }
2873
is_asciisimdutf::westmere::__anone55652eb0c11::simd::simd8x642874 simdutf_really_inline bool is_ascii() const {
2875 return this->reduce_or().is_ascii();
2876 }
2877
2878 template <endianness endian>
store_ascii_as_utf16simdutf::westmere::__anone55652eb0c11::simd::simd8x642879 simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
2880 this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
2881 this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
2882 this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
2883 this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
2884 }
2885
store_ascii_as_utf32simdutf::westmere::__anone55652eb0c11::simd::simd8x642886 simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
2887 this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
2888 this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
2889 this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
2890 this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
2891 }
2892
to_bitmasksimdutf::westmere::__anone55652eb0c11::simd::simd8x642893 simdutf_really_inline uint64_t to_bitmask() const {
2894 uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
2895 uint64_t r1 = this->chunks[1].to_bitmask() ;
2896 uint64_t r2 = this->chunks[2].to_bitmask() ;
2897 uint64_t r3 = this->chunks[3].to_bitmask() ;
2898 return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
2899 }
2900
eqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642901 simdutf_really_inline uint64_t eq(const T m) const {
2902 const simd8<T> mask = simd8<T>::splat(m);
2903 return simd8x64<bool>(
2904 this->chunks[0] == mask,
2905 this->chunks[1] == mask,
2906 this->chunks[2] == mask,
2907 this->chunks[3] == mask
2908 ).to_bitmask();
2909 }
2910
eqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642911 simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
2912 return simd8x64<bool>(
2913 this->chunks[0] == other.chunks[0],
2914 this->chunks[1] == other.chunks[1],
2915 this->chunks[2] == other.chunks[2],
2916 this->chunks[3] == other.chunks[3]
2917 ).to_bitmask();
2918 }
2919
lteqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642920 simdutf_really_inline uint64_t lteq(const T m) const {
2921 const simd8<T> mask = simd8<T>::splat(m);
2922 return simd8x64<bool>(
2923 this->chunks[0] <= mask,
2924 this->chunks[1] <= mask,
2925 this->chunks[2] <= mask,
2926 this->chunks[3] <= mask
2927 ).to_bitmask();
2928 }
2929
in_rangesimdutf::westmere::__anone55652eb0c11::simd::simd8x642930 simdutf_really_inline uint64_t in_range(const T low, const T high) const {
2931 const simd8<T> mask_low = simd8<T>::splat(low);
2932 const simd8<T> mask_high = simd8<T>::splat(high);
2933
2934 return simd8x64<bool>(
2935 (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2936 (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
2937 (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
2938 (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
2939 ).to_bitmask();
2940 }
not_in_rangesimdutf::westmere::__anone55652eb0c11::simd::simd8x642941 simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2942 const simd8<T> mask_low = simd8<T>::splat(low-1);
2943 const simd8<T> mask_high = simd8<T>::splat(high+1);
2944 return simd8x64<bool>(
2945 (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
2946 (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
2947 (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
2948 (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
2949 ).to_bitmask();
2950 }
ltsimdutf::westmere::__anone55652eb0c11::simd::simd8x642951 simdutf_really_inline uint64_t lt(const T m) const {
2952 const simd8<T> mask = simd8<T>::splat(m);
2953 return simd8x64<bool>(
2954 this->chunks[0] < mask,
2955 this->chunks[1] < mask,
2956 this->chunks[2] < mask,
2957 this->chunks[3] < mask
2958 ).to_bitmask();
2959 }
2960
gtsimdutf::westmere::__anone55652eb0c11::simd::simd8x642961 simdutf_really_inline uint64_t gt(const T m) const {
2962 const simd8<T> mask = simd8<T>::splat(m);
2963 return simd8x64<bool>(
2964 this->chunks[0] > mask,
2965 this->chunks[1] > mask,
2966 this->chunks[2] > mask,
2967 this->chunks[3] > mask
2968 ).to_bitmask();
2969 }
gteqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642970 simdutf_really_inline uint64_t gteq(const T m) const {
2971 const simd8<T> mask = simd8<T>::splat(m);
2972 return simd8x64<bool>(
2973 this->chunks[0] >= mask,
2974 this->chunks[1] >= mask,
2975 this->chunks[2] >= mask,
2976 this->chunks[3] >= mask
2977 ).to_bitmask();
2978 }
gteq_unsignedsimdutf::westmere::__anone55652eb0c11::simd::simd8x642979 simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
2980 const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
2981 return simd8x64<bool>(
2982 simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
2983 simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
2984 simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
2985 simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
2986 ).to_bitmask();
2987 }
2988 }; // struct simd8x64<T>
2989
2990 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h
2991 /* begin file src/simdutf/westmere/simd16-inl.h */
2992 template<typename T>
2993 struct simd16;
2994
2995 template<typename T, typename Mask=simd16<bool>>
2996 struct base16: base<simd16<T>> {
2997 typedef uint16_t bitmask_t;
2998 typedef uint32_t bitmask2_t;
2999
base16simdutf::westmere::__anone55652eb0c11::simd::base163000 simdutf_really_inline base16() : base<simd16<T>>() {}
base16simdutf::westmere::__anone55652eb0c11::simd::base163001 simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
3002 template <typename Pointer>
base16simdutf::westmere::__anone55652eb0c11::simd::base163003 simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
3004
operator ==(const simd16<T> lhs,const simd16<T> rhs)3005 friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
3006
3007 static const int SIZE = sizeof(base<simd16<T>>::value);
3008
3009 template<int N=1>
prevsimdutf::westmere::__anone55652eb0c11::simd::base163010 simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
3011 return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
3012 }
3013 };
3014
3015 // SIMD byte mask type (returned by things like eq and gt)
3016 template<>
3017 struct simd16<bool>: base16<bool> {
splatsimdutf::westmere::__anone55652eb0c11::simd::simd163018 static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
3019
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163020 simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163021 simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
3022 // Splat constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163023 simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
3024
to_bitmasksimdutf::westmere::__anone55652eb0c11::simd::simd163025 simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
anysimdutf::westmere::__anone55652eb0c11::simd::simd163026 simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
operator ~simdutf::westmere::__anone55652eb0c11::simd::simd163027 simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
3028 };
3029
3030 template<typename T>
3031 struct base16_numeric: base16<T> {
splatsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3032 static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
zerosimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3033 static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
loadsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3034 static simdutf_really_inline simd16<T> load(const T values[8]) {
3035 return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
3036 }
3037
base16_numericsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3038 simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3039 simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
3040
3041 // Store to array
storesimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3042 simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
3043
3044 // Override to distinguish from bool version
operator ~simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3045 simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
3046
3047 // Addition/subtraction are the same for signed and unsigned
operator +simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3048 simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
operator -simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3049 simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
operator +=simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3050 simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3051 simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
3052 };
3053
3054 // Signed words
3055 template<>
3056 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163057 simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163058 simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
3059 // Splat constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163060 simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
3061 // Array constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163062 simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163063 simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
3064 // Member-by-member initialization
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163065 simdutf_really_inline simd16(
3066 int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
3067 : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
3068 simdutf_really_inline operator simd16<uint16_t>() const;
3069
3070 // Order-sensitive comparisons
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd163071 simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd163072 simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
operator >simdutf::westmere::__anone55652eb0c11::simd::simd163073 simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
operator <simdutf::westmere::__anone55652eb0c11::simd::simd163074 simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
3075 };
3076
3077 // Unsigned words
3078 template<>
3079 struct simd16<uint16_t>: base16_numeric<uint16_t> {
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163080 simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163081 simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
3082
3083 // Splat constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163084 simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
3085 // Array constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163086 simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163087 simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
3088 // Member-by-member initialization
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163089 simdutf_really_inline simd16(
3090 uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
3091 : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
3092 // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::simd163093 simdutf_really_inline static simd16<uint16_t> repeat_16(
3094 uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
3095 ) {
3096 return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
3097 }
3098
3099 // Saturated math
saturating_addsimdutf::westmere::__anone55652eb0c11::simd::simd163100 simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
saturating_subsimdutf::westmere::__anone55652eb0c11::simd::simd163101 simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
3102
3103 // Order-specific operations
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd163104 simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd163105