• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* auto-generated on 2023-06-05 08:58:28 -0400. Do not edit! */
2 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf.cpp
3 /* begin file src/simdutf.cpp */
4 #include "simdutf.h"
5 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=implementation.cpp
6 /* begin file src/implementation.cpp */
7 #include <initializer_list>
8 #include <climits>
9 
10 // Useful for debugging purposes
11 namespace simdutf {
12 namespace {
13 
/**
 * Debugging helper: renders the bits of `b` as a string of '0' and '1'
 * characters, most significant bit first.
 *
 * The returned string always has sizeof(T) * CHAR_BIT characters.
 *
 * Bits are extracted by index instead of with a walking mask of type T:
 * for a signed T a mask that starts at the sign bit is negative, so a
 * `mask > 0` loop would never run and the result would be empty.
 */
template <typename T>
std::string toBinaryString(T b) {
  const int bit_count = int(sizeof(T) * CHAR_BIT);
  std::string binary;
  binary.reserve(sizeof(T) * CHAR_BIT);
  for (int bit = bit_count - 1; bit >= 0; --bit) {
    // Only the lowest bit of the shifted value is inspected, so the result
    // does not depend on how the high bits are filled by the shift.
    binary += (((b >> bit) & T(1)) != 0) ? '1' : '0';
  }
  return binary;
}
24 }
25 }
26 
27 // Implementations
28 // The best choice should always come first!
29 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64.h
30 /* begin file src/simdutf/arm64.h */
31 #ifndef SIMDUTF_ARM64_H
32 #define SIMDUTF_ARM64_H
33 
34 #ifdef SIMDUTF_FALLBACK_H
35 #error "arm64.h must be included before fallback.h"
36 #endif
37 
38 
39 #ifndef SIMDUTF_IMPLEMENTATION_ARM64
40 #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
41 #endif
42 #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
43 
44 
45 
46 #if SIMDUTF_IMPLEMENTATION_ARM64
47 
48 namespace simdutf {
49 /**
50  * Implementation for NEON (ARMv8).
51  */
52 namespace arm64 {
53 } // namespace arm64
54 } // namespace simdutf
55 
56 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h
57 /* begin file src/simdutf/arm64/implementation.h */
58 #ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
59 #define SIMDUTF_ARM64_IMPLEMENTATION_H
60 
61 
62 namespace simdutf {
63 namespace arm64 {
64 
65 namespace {
66 using namespace simdutf;
67 }
68 
69 class implementation final : public simdutf::implementation {
70 public:
implementation()71   simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
72   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
73   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
74   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
75   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
76   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
77   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
78   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
79   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
80   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
81   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
82   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
83   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
84   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
85   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
86   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
87   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
88   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
89   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
90   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
91   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
92   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
93   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
94   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
95   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
96   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
97   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
98   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
99   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
100   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
101   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
102   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
103   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
104   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
105   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
106   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
107   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
108   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
109   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
110   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
111   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
112   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
113   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
114   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
115   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
116   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
117   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
118   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
119   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
120   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
121   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
122   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
123   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
124   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
125 };
126 
127 } // namespace arm64
128 } // namespace simdutf
129 
130 #endif // SIMDUTF_ARM64_IMPLEMENTATION_H
131 /* end file src/simdutf/arm64/implementation.h */
132 
133 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
134 /* begin file src/simdutf/arm64/begin.h */
135 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
136 // #define SIMDUTF_IMPLEMENTATION arm64
137 /* end file src/simdutf/arm64/begin.h */
138 
139 // Declarations
140 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h
141 /* begin file src/simdutf/arm64/intrinsics.h */
142 #ifndef SIMDUTF_ARM64_INTRINSICS_H
143 #define SIMDUTF_ARM64_INTRINSICS_H
144 
145 
146 // This should be the correct header whether
147 // you use visual studio or other compilers.
148 #include <arm_neon.h>
149 
150 #endif //  SIMDUTF_ARM64_INTRINSICS_H
151 /* end file src/simdutf/arm64/intrinsics.h */
152 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h
153 /* begin file src/simdutf/arm64/bitmanipulation.h */
154 #ifndef SIMDUTF_ARM64_BITMANIPULATION_H
155 #define SIMDUTF_ARM64_BITMANIPULATION_H
156 
157 namespace simdutf {
158 namespace arm64 {
159 namespace {
160 
161 /* result might be undefined when input_num is zero */
count_ones(uint64_t input_num)162 simdutf_really_inline int count_ones(uint64_t input_num) {
163    return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
164 }
165 
166 } // unnamed namespace
167 } // namespace arm64
168 } // namespace simdutf
169 
170 #endif // SIMDUTF_ARM64_BITMANIPULATION_H
171 /* end file src/simdutf/arm64/bitmanipulation.h */
172 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h
173 /* begin file src/simdutf/arm64/simd.h */
174 #ifndef SIMDUTF_ARM64_SIMD_H
175 #define SIMDUTF_ARM64_SIMD_H
176 
177 #include <type_traits>
178 
179 
180 namespace simdutf {
181 namespace arm64 {
182 namespace {
183 namespace simd {
184 
185 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
186 namespace {
187 // Start of private section with Visual Studio workaround
188 
189 
190 /**
191  * make_uint8x16_t initializes a SIMD register (uint8x16_t).
192  * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
193  * is not recognized under Visual Studio! This is a workaround.
194  * Using a std::initializer_list<uint8_t>  as a parameter resulted in
195  * inefficient code. With the current approach, if the parameters are
196  * compile-time constants,
197  * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
198  * You should not use this function except for compile-time constants:
199  * it is not efficient.
200  */
make_uint8x16_t(uint8_t x1,uint8_t x2,uint8_t x3,uint8_t x4,uint8_t x5,uint8_t x6,uint8_t x7,uint8_t x8,uint8_t x9,uint8_t x10,uint8_t x11,uint8_t x12,uint8_t x13,uint8_t x14,uint8_t x15,uint8_t x16)201 simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
202                                          uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
203                                          uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
204                                          uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
205   // Doing a load like so end ups generating worse code.
206   // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
207   //                     x9, x10,x11,x12,x13,x14,x15,x16};
208   // return vld1q_u8(array);
209   uint8x16_t x{};
210   // incredibly, Visual Studio does not allow x[0] = x1
211   x = vsetq_lane_u8(x1, x, 0);
212   x = vsetq_lane_u8(x2, x, 1);
213   x = vsetq_lane_u8(x3, x, 2);
214   x = vsetq_lane_u8(x4, x, 3);
215   x = vsetq_lane_u8(x5, x, 4);
216   x = vsetq_lane_u8(x6, x, 5);
217   x = vsetq_lane_u8(x7, x, 6);
218   x = vsetq_lane_u8(x8, x, 7);
219   x = vsetq_lane_u8(x9, x, 8);
220   x = vsetq_lane_u8(x10, x, 9);
221   x = vsetq_lane_u8(x11, x, 10);
222   x = vsetq_lane_u8(x12, x, 11);
223   x = vsetq_lane_u8(x13, x, 12);
224   x = vsetq_lane_u8(x14, x, 13);
225   x = vsetq_lane_u8(x15, x, 14);
226   x = vsetq_lane_u8(x16, x, 15);
227   return x;
228 }
229 
230 // We have to do the same work for make_int8x16_t
make_int8x16_t(int8_t x1,int8_t x2,int8_t x3,int8_t x4,int8_t x5,int8_t x6,int8_t x7,int8_t x8,int8_t x9,int8_t x10,int8_t x11,int8_t x12,int8_t x13,int8_t x14,int8_t x15,int8_t x16)231 simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
232                                        int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
233                                        int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
234                                        int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
235   // Doing a load like so end ups generating worse code.
236   // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
237   //                     x9, x10,x11,x12,x13,x14,x15,x16};
238   // return vld1q_s8(array);
239   int8x16_t x{};
240   // incredibly, Visual Studio does not allow x[0] = x1
241   x = vsetq_lane_s8(x1, x, 0);
242   x = vsetq_lane_s8(x2, x, 1);
243   x = vsetq_lane_s8(x3, x, 2);
244   x = vsetq_lane_s8(x4, x, 3);
245   x = vsetq_lane_s8(x5, x, 4);
246   x = vsetq_lane_s8(x6, x, 5);
247   x = vsetq_lane_s8(x7, x, 6);
248   x = vsetq_lane_s8(x8, x, 7);
249   x = vsetq_lane_s8(x9, x, 8);
250   x = vsetq_lane_s8(x10, x, 9);
251   x = vsetq_lane_s8(x11, x, 10);
252   x = vsetq_lane_s8(x12, x, 11);
253   x = vsetq_lane_s8(x13, x, 12);
254   x = vsetq_lane_s8(x14, x, 13);
255   x = vsetq_lane_s8(x15, x, 14);
256   x = vsetq_lane_s8(x16, x, 15);
257   return x;
258 }
259 
make_uint8x8_t(uint8_t x1,uint8_t x2,uint8_t x3,uint8_t x4,uint8_t x5,uint8_t x6,uint8_t x7,uint8_t x8)260 simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
261                                          uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8) {
262   uint8x8_t x{};
263   x = vset_lane_u8(x1, x, 0);
264   x = vset_lane_u8(x2, x, 1);
265   x = vset_lane_u8(x3, x, 2);
266   x = vset_lane_u8(x4, x, 3);
267   x = vset_lane_u8(x5, x, 4);
268   x = vset_lane_u8(x6, x, 5);
269   x = vset_lane_u8(x7, x, 6);
270   x = vset_lane_u8(x8, x, 7);
271   return x;
272 }
273 
make_uint16x8_t(uint16_t x1,uint16_t x2,uint16_t x3,uint16_t x4,uint16_t x5,uint16_t x6,uint16_t x7,uint16_t x8)274 simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1,  uint16_t x2,  uint16_t x3,  uint16_t x4,
275                                        uint16_t x5,  uint16_t x6,  uint16_t x7,  uint16_t x8) {
276   uint16x8_t x{};
277   x = vsetq_lane_u16(x1, x, 0);
278   x = vsetq_lane_u16(x2, x, 1);
279   x = vsetq_lane_u16(x3, x, 2);
280   x = vsetq_lane_u16(x4, x, 3);
281   x = vsetq_lane_u16(x5, x, 4);
282   x = vsetq_lane_u16(x6, x, 5);
283   x = vsetq_lane_u16(x7, x, 6);
284   x = vsetq_lane_u16(x8, x, 7);;
285   return x;
286 }
287 
make_int16x8_t(int16_t x1,int16_t x2,int16_t x3,int16_t x4,int16_t x5,int16_t x6,int16_t x7,int16_t x8)288 simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t x3,  int16_t x4,
289                                        int16_t x5,  int16_t x6,  int16_t x7,  int16_t x8) {
290   uint16x8_t x{};
291   x = vsetq_lane_s16(x1, x, 0);
292   x = vsetq_lane_s16(x2, x, 1);
293   x = vsetq_lane_s16(x3, x, 2);
294   x = vsetq_lane_s16(x4, x, 3);
295   x = vsetq_lane_s16(x5, x, 4);
296   x = vsetq_lane_s16(x6, x, 5);
297   x = vsetq_lane_s16(x7, x, 6);
298   x = vsetq_lane_s16(x8, x, 7);;
299   return x;
300 }
301 
302 
303 // End of private section with Visual Studio workaround
304 } // namespace
305 #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
306 
307 
308   template<typename T>
309   struct simd8;
310 
311   //
312   // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
313   //
314   template<typename T, typename Mask=simd8<bool>>
315   struct base_u8 {
316     uint8x16_t value;
317     static const int SIZE = sizeof(value);
318 
319     // Conversion from/to SIMD register
base_u8simdutf::arm64::__anone55652eb0411::simd::base_u8320     simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
operator const uint8x16_t&simdutf::arm64::__anone55652eb0411::simd::base_u8321     simdutf_really_inline operator const uint8x16_t&() const { return this->value; }
operator uint8x16_t&simdutf::arm64::__anone55652eb0411::simd::base_u8322     simdutf_really_inline operator uint8x16_t&() { return this->value; }
firstsimdutf::arm64::__anone55652eb0411::simd::base_u8323     simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); }
lastsimdutf::arm64::__anone55652eb0411::simd::base_u8324     simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); }
325 
326     // Bit operations
operator |simdutf::arm64::__anone55652eb0411::simd::base_u8327     simdutf_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
operator &simdutf::arm64::__anone55652eb0411::simd::base_u8328     simdutf_really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
operator ^simdutf::arm64::__anone55652eb0411::simd::base_u8329     simdutf_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
bit_andnotsimdutf::arm64::__anone55652eb0411::simd::base_u8330     simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
operator ~simdutf::arm64::__anone55652eb0411::simd::base_u8331     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
operator |=simdutf::arm64::__anone55652eb0411::simd::base_u8332     simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::arm64::__anone55652eb0411::simd::base_u8333     simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::arm64::__anone55652eb0411::simd::base_u8334     simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
335 
operator ==(const simd8<T> lhs,const simd8<T> rhs)336     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
337 
338     template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::base_u8339     simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
340       return vextq_u8(prev_chunk, *this, 16 - N);
341     }
342   };
343 
344   // SIMD byte mask type (returned by things like eq and gt)
345   template<>
346   struct simd8<bool>: base_u8<bool> {
347     typedef uint16_t bitmask_t;
348     typedef uint32_t bitmask2_t;
349 
splatsimdutf::arm64::__anone55652eb0411::simd::simd8350     static simdutf_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
351 
simd8simdutf::arm64::__anone55652eb0411::simd::simd8352     simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
353     // False constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8354     simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
355     // Splat constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8356     simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
storesimdutf::arm64::__anone55652eb0411::simd::simd8357     simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
358 
359     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
360     // purposes (cutting it down to uint16_t costs performance in some compilers).
to_bitmasksimdutf::arm64::__anone55652eb0411::simd::simd8361     simdutf_really_inline uint32_t to_bitmask() const {
362 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
363       const uint8x16_t bit_mask =  make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
364                                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
365 #else
366       const uint8x16_t bit_mask =  {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
367                                     0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
368 #endif
369       auto minput = *this & bit_mask;
370       uint8x16_t tmp = vpaddq_u8(minput, minput);
371       tmp = vpaddq_u8(tmp, tmp);
372       tmp = vpaddq_u8(tmp, tmp);
373       return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
374     }
375 
376     // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits
377     // result it is 64 bit.
378     // This method is expected to be faster than none() and is equivalent
379     // when the vector register is the result of a comparison, with byte
380     // values 0xff and 0x00.
to_bitmask64simdutf::arm64::__anone55652eb0411::simd::simd8381     simdutf_really_inline uint64_t to_bitmask64() const {
382       return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
383     }
384 
anysimdutf::arm64::__anone55652eb0411::simd::simd8385     simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
nonesimdutf::arm64::__anone55652eb0411::simd::simd8386     simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; }
allsimdutf::arm64::__anone55652eb0411::simd::simd8387     simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; }
388 
389 
390   };
391 
392   // Unsigned bytes
393   template<>
394   struct simd8<uint8_t>: base_u8<uint8_t> {
splatsimdutf::arm64::__anone55652eb0411::simd::simd8395     static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) { return vmovq_n_u8(_value); }
zerosimdutf::arm64::__anone55652eb0411::simd::simd8396     static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
loadsimdutf::arm64::__anone55652eb0411::simd::simd8397     static simdutf_really_inline simd8<uint8_t> load(const uint8_t* values) { return vld1q_u8(values); }
simd8simdutf::arm64::__anone55652eb0411::simd::simd8398     simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
399     // Zero constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8400     simdutf_really_inline simd8() : simd8(zero()) {}
401     // Array constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8402     simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
403     // Splat constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8404     simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
405     // Member-by-member initialization
406 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anone55652eb0411::simd::simd8407     simdutf_really_inline simd8(
408       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
409       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
410     ) : simd8(make_uint8x16_t(
411       v0, v1, v2, v3, v4, v5, v6, v7,
412       v8, v9, v10,v11,v12,v13,v14,v15
413     )) {}
414 #else
simd8simdutf::arm64::__anone55652eb0411::simd::simd8415     simdutf_really_inline simd8(
416       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
417       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
418     ) : simd8(uint8x16_t{
419       v0, v1, v2, v3, v4, v5, v6, v7,
420       v8, v9, v10,v11,v12,v13,v14,v15
421     }) {}
422 #endif
423 
424     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::arm64::__anone55652eb0411::simd::simd8425     simdutf_really_inline static simd8<uint8_t> repeat_16(
426       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
427       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
428     ) {
429       return simd8<uint8_t>(
430         v0, v1, v2, v3, v4, v5, v6, v7,
431         v8, v9, v10,v11,v12,v13,v14,v15
432       );
433     }
434 
435     // Store to array
storesimdutf::arm64::__anone55652eb0411::simd::simd8436     simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
437 
438     // Saturated math
saturating_addsimdutf::arm64::__anone55652eb0411::simd::simd8439     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
saturating_subsimdutf::arm64::__anone55652eb0411::simd::simd8440     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
441 
442     // Addition/subtraction are the same for signed and unsigned
operator +simdutf::arm64::__anone55652eb0411::simd::simd8443     simdutf_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
operator -simdutf::arm64::__anone55652eb0411::simd::simd8444     simdutf_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
operator +=simdutf::arm64::__anone55652eb0411::simd::simd8445     simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
operator -=simdutf::arm64::__anone55652eb0411::simd::simd8446     simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
447 
448     // Order-specific operations
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8449     simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8450     simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); }
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8451     simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8452     simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
operator <=simdutf::arm64::__anone55652eb0411::simd::simd8453     simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
operator >=simdutf::arm64::__anone55652eb0411::simd::simd8454     simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd8455     simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd8456     simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
457     // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
gt_bitssimdutf::arm64::__anone55652eb0411::simd::simd8458     simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
459     // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
lt_bitssimdutf::arm64::__anone55652eb0411::simd::simd8460     simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }
461 
462     // Bit-specific operations
any_bits_setsimdutf::arm64::__anone55652eb0411::simd::simd8463     simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd8464     simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; }
465 
any_bits_set_anywheresimdutf::arm64::__anone55652eb0411::simd::simd8466     simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
any_bits_set_anywheresimdutf::arm64::__anone55652eb0411::simd::simd8467     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
468     template<int N>
shrsimdutf::arm64::__anone55652eb0411::simd::simd8469     simdutf_really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
470     template<int N>
shlsimdutf::arm64::__anone55652eb0411::simd::simd8471     simdutf_really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
472 
473     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
474     template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8475     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
476       return lookup_table.apply_lookup_16_to(*this);
477     }
478 
479 
480     template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8481     simdutf_really_inline simd8<L> lookup_16(
482         L replace0,  L replace1,  L replace2,  L replace3,
483         L replace4,  L replace5,  L replace6,  L replace7,
484         L replace8,  L replace9,  L replace10, L replace11,
485         L replace12, L replace13, L replace14, L replace15) const {
486       return lookup_16(simd8<L>::repeat_16(
487         replace0,  replace1,  replace2,  replace3,
488         replace4,  replace5,  replace6,  replace7,
489         replace8,  replace9,  replace10, replace11,
490         replace12, replace13, replace14, replace15
491       ));
492     }
493 
494     template<typename T>
apply_lookup_16_tosimdutf::arm64::__anone55652eb0411::simd::simd8495     simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const {
496       return vqtbl1q_u8(*this, simd8<uint8_t>(original));
497     }
498   };
499 
500   // Signed bytes
501   template<>
502   struct simd8<int8_t> {
503     int8x16_t value;
504 
splatsimdutf::arm64::__anone55652eb0411::simd::simd8505     static simdutf_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
zerosimdutf::arm64::__anone55652eb0411::simd::simd8506     static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
loadsimdutf::arm64::__anone55652eb0411::simd::simd8507     static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
508     template <endianness big_endian>
store_ascii_as_utf16simdutf::arm64::__anone55652eb0411::simd::simd8509     simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
510       uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)));
511       uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value));
512       if (!match_system(big_endian)) {
513         #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
514         const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
515         #else
516         const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
517         #endif
518         first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
519         second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
520       }
521       vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
522       vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second);
523     }
store_ascii_as_utf32simdutf::arm64::__anone55652eb0411::simd::simd8524     simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
525       vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))))));
526       vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))));
527       vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))));
528       vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))));
529     }
530     // Conversion from/to SIMD register
simd8simdutf::arm64::__anone55652eb0411::simd::simd8531     simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
operator const int8x16_t&simdutf::arm64::__anone55652eb0411::simd::simd8532     simdutf_really_inline operator const int8x16_t&() const { return this->value; }
operator const uint8x16_tsimdutf::arm64::__anone55652eb0411::simd::simd8533     simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); }
operator int8x16_t&simdutf::arm64::__anone55652eb0411::simd::simd8534     simdutf_really_inline operator int8x16_t&() { return this->value; }
535 
536     // Zero constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8537     simdutf_really_inline simd8() : simd8(zero()) {}
538     // Splat constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8539     simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
540     // Array constructor
simd8simdutf::arm64::__anone55652eb0411::simd::simd8541     simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
542     // Member-by-member initialization
543 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anone55652eb0411::simd::simd8544     simdutf_really_inline simd8(
545       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
546       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
547     ) : simd8(make_int8x16_t(
548       v0, v1, v2, v3, v4, v5, v6, v7,
549       v8, v9, v10,v11,v12,v13,v14,v15
550     )) {}
551 #else
simd8simdutf::arm64::__anone55652eb0411::simd::simd8552     simdutf_really_inline simd8(
553       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
554       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
555     ) : simd8(int8x16_t{
556       v0, v1, v2, v3, v4, v5, v6, v7,
557       v8, v9, v10,v11,v12,v13,v14,v15
558     }) {}
559 #endif
560     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::arm64::__anone55652eb0411::simd::simd8561     simdutf_really_inline static simd8<int8_t> repeat_16(
562       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
563       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
564     ) {
565       return simd8<int8_t>(
566         v0, v1, v2, v3, v4, v5, v6, v7,
567         v8, v9, v10,v11,v12,v13,v14,v15
568       );
569     }
570 
571     // Store to array
storesimdutf::arm64::__anone55652eb0411::simd::simd8572     simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); }
573     // Explicit conversion to/from unsigned
574     //
575     // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
576     // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
577     // and relatively ugly and hard to read.
578 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anone55652eb0411::simd::simd8579     simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
580 #endif
operator simd8<uint8_t>simdutf::arm64::__anone55652eb0411::simd::simd8581     simdutf_really_inline operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
582 
operator |simdutf::arm64::__anone55652eb0411::simd::simd8583     simdutf_really_inline simd8<int8_t> operator|(const simd8<int8_t> other) const { return vorrq_s8(value, other.value); }
operator &simdutf::arm64::__anone55652eb0411::simd::simd8584     simdutf_really_inline simd8<int8_t> operator&(const simd8<int8_t> other) const { return vandq_s8(value, other.value); }
operator ^simdutf::arm64::__anone55652eb0411::simd::simd8585     simdutf_really_inline simd8<int8_t> operator^(const simd8<int8_t> other) const { return veorq_s8(value, other.value); }
bit_andnotsimdutf::arm64::__anone55652eb0411::simd::simd8586     simdutf_really_inline simd8<int8_t> bit_andnot(const simd8<int8_t> other) const { return vbicq_s8(value, other.value); }
587 
588     // Math
operator +simdutf::arm64::__anone55652eb0411::simd::simd8589     simdutf_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(value, other.value); }
operator -simdutf::arm64::__anone55652eb0411::simd::simd8590     simdutf_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(value, other.value); }
operator +=simdutf::arm64::__anone55652eb0411::simd::simd8591     simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
operator -=simdutf::arm64::__anone55652eb0411::simd::simd8592     simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
593 
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8594     simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8595     simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd8596     simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; }
597 
598     // Order-sensitive comparisons
max_valsimdutf::arm64::__anone55652eb0411::simd::simd8599     simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(value, other.value); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd8600     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(value, other.value); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd8601     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(value, other.value); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd8602     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(value, other.value); }
operator ==simdutf::arm64::__anone55652eb0411::simd::simd8603     simdutf_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(value, other.value); }
604 
605     template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::simd8606     simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
607       return vextq_s8(prev_chunk, *this, 16 - N);
608     }
609 
610     // Perform a lookup assuming no value is larger than 16
611     template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8612     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
613       return lookup_table.apply_lookup_16_to(*this);
614     }
615     template<typename L>
lookup_16simdutf::arm64::__anone55652eb0411::simd::simd8616     simdutf_really_inline simd8<L> lookup_16(
617         L replace0,  L replace1,  L replace2,  L replace3,
618         L replace4,  L replace5,  L replace6,  L replace7,
619         L replace8,  L replace9,  L replace10, L replace11,
620         L replace12, L replace13, L replace14, L replace15) const {
621       return lookup_16(simd8<L>::repeat_16(
622         replace0,  replace1,  replace2,  replace3,
623         replace4,  replace5,  replace6,  replace7,
624         replace8,  replace9,  replace10, replace11,
625         replace12, replace13, replace14, replace15
626       ));
627     }
628 
629     template<typename T>
apply_lookup_16_tosimdutf::arm64::__anone55652eb0411::simd::simd8630     simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
631       return vqtbl1q_s8(*this, simd8<uint8_t>(original));
632     }
633   };
634 
635   template<typename T>
636   struct simd8x64 {
637     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
638     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
639     simd8<T> chunks[NUM_CHUNKS];
640 
641     simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
642     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
643     simd8x64() = delete; // no default constructor allowed
644 
simd8x64simdutf::arm64::__anone55652eb0411::simd::simd8x64645     simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd8x64simdutf::arm64::__anone55652eb0411::simd::simd8x64646     simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
647 
storesimdutf::arm64::__anone55652eb0411::simd::simd8x64648     simdutf_really_inline void store(T* ptr) const {
649       this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
650       this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
651       this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
652       this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
653     }
654 
655 
operator |=simdutf::arm64::__anone55652eb0411::simd::simd8x64656     simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
657       this->chunks[0] |= other.chunks[0];
658       this->chunks[1] |= other.chunks[1];
659       this->chunks[2] |= other.chunks[2];
660       this->chunks[3] |= other.chunks[3];
661       return *this;
662     }
663 
reduce_orsimdutf::arm64::__anone55652eb0411::simd::simd8x64664     simdutf_really_inline simd8<T> reduce_or() const {
665       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
666     }
667 
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd8x64668     simdutf_really_inline bool is_ascii() const {
669       return reduce_or().is_ascii();
670     }
671 
672     template <endianness endian>
store_ascii_as_utf16simdutf::arm64::__anone55652eb0411::simd::simd8x64673     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
674       this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
675       this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
676       this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
677       this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
678     }
679 
store_ascii_as_utf32simdutf::arm64::__anone55652eb0411::simd::simd8x64680     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
681       this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
682       this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
683       this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
684       this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
685     }
686 
to_bitmasksimdutf::arm64::__anone55652eb0411::simd::simd8x64687     simdutf_really_inline uint64_t to_bitmask() const {
688 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
689       const uint8x16_t bit_mask = make_uint8x16_t(
690         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
691         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
692       );
693 #else
694       const uint8x16_t bit_mask = {
695         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
696         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
697       };
698 #endif
699       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
700       uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
701       uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
702       sum0 = vpaddq_u8(sum0, sum1);
703       sum0 = vpaddq_u8(sum0, sum0);
704       return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
705     }
706 
eqsimdutf::arm64::__anone55652eb0411::simd::simd8x64707     simdutf_really_inline uint64_t eq(const T m) const {
708     const simd8<T> mask = simd8<T>::splat(m);
709     return  simd8x64<bool>(
710       this->chunks[0] == mask,
711       this->chunks[1] == mask,
712       this->chunks[2] == mask,
713       this->chunks[3] == mask
714     ).to_bitmask();
715   }
716 
lteqsimdutf::arm64::__anone55652eb0411::simd::simd8x64717   simdutf_really_inline uint64_t lteq(const T m) const {
718     const simd8<T> mask = simd8<T>::splat(m);
719     return  simd8x64<bool>(
720       this->chunks[0] <= mask,
721       this->chunks[1] <= mask,
722       this->chunks[2] <= mask,
723       this->chunks[3] <= mask
724     ).to_bitmask();
725   }
726 
in_rangesimdutf::arm64::__anone55652eb0411::simd::simd8x64727     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
728       const simd8<T> mask_low = simd8<T>::splat(low);
729       const simd8<T> mask_high = simd8<T>::splat(high);
730 
731       return  simd8x64<bool>(
732         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
733         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
734         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
735         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
736       ).to_bitmask();
737     }
not_in_rangesimdutf::arm64::__anone55652eb0411::simd::simd8x64738     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
739       const simd8<T> mask_low = simd8<T>::splat(low);
740       const simd8<T> mask_high = simd8<T>::splat(high);
741       return  simd8x64<bool>(
742         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
743         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
744         (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
745         (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
746       ).to_bitmask();
747     }
ltsimdutf::arm64::__anone55652eb0411::simd::simd8x64748     simdutf_really_inline uint64_t lt(const T m) const {
749       const simd8<T> mask = simd8<T>::splat(m);
750       return  simd8x64<bool>(
751         this->chunks[0] < mask,
752         this->chunks[1] < mask,
753         this->chunks[2] < mask,
754         this->chunks[3] < mask
755       ).to_bitmask();
756     }
gtsimdutf::arm64::__anone55652eb0411::simd::simd8x64757     simdutf_really_inline uint64_t gt(const T m) const {
758       const simd8<T> mask = simd8<T>::splat(m);
759       return  simd8x64<bool>(
760         this->chunks[0] > mask,
761         this->chunks[1] > mask,
762         this->chunks[2] > mask,
763         this->chunks[3] > mask
764       ).to_bitmask();
765     }
gteqsimdutf::arm64::__anone55652eb0411::simd::simd8x64766     simdutf_really_inline uint64_t gteq(const T m) const {
767       const simd8<T> mask = simd8<T>::splat(m);
768       return  simd8x64<bool>(
769         this->chunks[0] >= mask,
770         this->chunks[1] >= mask,
771         this->chunks[2] >= mask,
772         this->chunks[3] >= mask
773       ).to_bitmask();
774     }
gteq_unsignedsimdutf::arm64::__anone55652eb0411::simd::simd8x64775     simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
776       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
777       return  simd8x64<bool>(
778         simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
779         simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
780         simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
781         simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask
782       ).to_bitmask();
783     }
784   }; // struct simd8x64<T>
785 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h
786 /* begin file src/simdutf/arm64/simd16-inl.h */
787 template<typename T>
788 struct simd16;
789 
790   template<typename T, typename Mask=simd16<bool>>
791   struct base_u16 {
792     uint16x8_t value;
793     static const int SIZE = sizeof(value);
794 
795     // Conversion from/to SIMD register
796     simdutf_really_inline base_u16() = default;
base_u16simdutf::arm64::__anone55652eb0411::simd::base_u16797     simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
operator const uint16x8_t&simdutf::arm64::__anone55652eb0411::simd::base_u16798     simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
operator uint16x8_t&simdutf::arm64::__anone55652eb0411::simd::base_u16799     simdutf_really_inline operator uint16x8_t&() { return this->value; }
800     // Bit operations
operator |simdutf::arm64::__anone55652eb0411::simd::base_u16801     simdutf_really_inline simd16<T> operator|(const simd16<T> other) const { return vorrq_u16(*this, other); }
operator &simdutf::arm64::__anone55652eb0411::simd::base_u16802     simdutf_really_inline simd16<T> operator&(const simd16<T> other) const { return vandq_u16(*this, other); }
operator ^simdutf::arm64::__anone55652eb0411::simd::base_u16803     simdutf_really_inline simd16<T> operator^(const simd16<T> other) const { return veorq_u16(*this, other); }
bit_andnotsimdutf::arm64::__anone55652eb0411::simd::base_u16804     simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const { return vbicq_u16(*this, other); }
operator ~simdutf::arm64::__anone55652eb0411::simd::base_u16805     simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
operator |=simdutf::arm64::__anone55652eb0411::simd::base_u16806     simdutf_really_inline simd16<T>& operator|=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::arm64::__anone55652eb0411::simd::base_u16807     simdutf_really_inline simd16<T>& operator&=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::arm64::__anone55652eb0411::simd::base_u16808     simdutf_really_inline simd16<T>& operator^=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
809 
operator ==(const simd16<T> lhs,const simd16<T> rhs)810     friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return vceqq_u16(lhs, rhs); }
811 
812     template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::base_u16813     simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
814       return vextq_u18(prev_chunk, *this, 8 - N);
815     }
816   };
817 
818 template<typename T, typename Mask=simd16<bool>>
819 struct base16: base_u16<T> {
820   typedef uint16_t bitmask_t;
821   typedef uint32_t bitmask2_t;
822 
base16simdutf::arm64::__anone55652eb0411::simd::base16823   simdutf_really_inline base16() : base_u16<T>() {}
base16simdutf::arm64::__anone55652eb0411::simd::base16824   simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
825   template <typename Pointer>
base16simdutf::arm64::__anone55652eb0411::simd::base16826   simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {}
827 
828   static const int SIZE = sizeof(base_u16<T>::value);
829 
830   template<int N=1>
prevsimdutf::arm64::__anone55652eb0411::simd::base16831   simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
832     return vextq_u18(prev_chunk, *this, 8 - N);
833   }
834 };
835 
836 // SIMD byte mask type (returned by things like eq and gt)
837 template<>
838 struct simd16<bool>: base16<bool> {
splatsimdutf::arm64::__anone55652eb0411::simd::simd16839   static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
840 
simd16simdutf::arm64::__anone55652eb0411::simd::simd16841   simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16842   simdutf_really_inline simd16<bool>(const uint16x8_t _value) : base16<bool>(_value) {}
843   // Splat constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16844   simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
845 
846 };
847 
848 template<typename T>
849 struct base16_numeric: base16<T> {
splatsimdutf::arm64::__anone55652eb0411::simd::base16_numeric850   static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
zerosimdutf::arm64::__anone55652eb0411::simd::base16_numeric851   static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
loadsimdutf::arm64::__anone55652eb0411::simd::base16_numeric852   static simdutf_really_inline simd16<T> load(const T values[8]) {
853     return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
854   }
855 
base16_numericsimdutf::arm64::__anone55652eb0411::simd::base16_numeric856   simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::arm64::__anone55652eb0411::simd::base16_numeric857   simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16<T>(_value) {}
858 
859   // Store to array
storesimdutf::arm64::__anone55652eb0411::simd::base16_numeric860   simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
861 
862   // Override to distinguish from bool version
operator ~simdutf::arm64::__anone55652eb0411::simd::base16_numeric863   simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
864 
865   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::arm64::__anone55652eb0411::simd::base16_numeric866   simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
operator -simdutf::arm64::__anone55652eb0411::simd::base16_numeric867   simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
operator +=simdutf::arm64::__anone55652eb0411::simd::base16_numeric868   simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::arm64::__anone55652eb0411::simd::base16_numeric869   simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
870 };
871 
872 // Signed words
873 template<>
874 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::arm64::__anone55652eb0411::simd::simd16875   simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
876 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
simd16simdutf::arm64::__anone55652eb0411::simd::simd16877   simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<int16_t>(_value) {}
878 #endif
simd16simdutf::arm64::__anone55652eb0411::simd::simd16879   simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
880 
881   // Splat constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16882   simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
883   // Array constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16884   simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16885   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
886   simdutf_really_inline operator simd16<uint16_t>() const;
operator const uint16x8_t&simdutf::arm64::__anone55652eb0411::simd::simd16887   simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
operator const int16x8_tsimdutf::arm64::__anone55652eb0411::simd::simd16888   simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
889 
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16890   simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16891   simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
892   // Order-sensitive comparisons
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16893   simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16894   simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd16895   simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd16896   simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
897 };
898 
899 
900 
901 
902 // Unsigned words
903 template<>
904 struct simd16<uint16_t>: base16_numeric<uint16_t>  {
simd16simdutf::arm64::__anone55652eb0411::simd::simd16905   simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16906   simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<uint16_t>(_value) {}
907 
908   // Splat constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16909   simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
910   // Array constructor
simd16simdutf::arm64::__anone55652eb0411::simd::simd16911   simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::arm64::__anone55652eb0411::simd::simd16912   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
913 
914 
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16915   simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16916   simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
917   // Saturated math
saturating_addsimdutf::arm64::__anone55652eb0411::simd::simd16918   simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
saturating_subsimdutf::arm64::__anone55652eb0411::simd::simd16919   simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
920 
921   // Order-specific operations
max_valsimdutf::arm64::__anone55652eb0411::simd::simd16922   simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
min_valsimdutf::arm64::__anone55652eb0411::simd::simd16923   simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
924   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::arm64::__anone55652eb0411::simd::simd16925   simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
926   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::arm64::__anone55652eb0411::simd::simd16927   simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::arm64::__anone55652eb0411::simd::simd16928   simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
operator >=simdutf::arm64::__anone55652eb0411::simd::simd16929   simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
operator >simdutf::arm64::__anone55652eb0411::simd::simd16930   simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return  vcgtq_u16(*this, other); }
operator <simdutf::arm64::__anone55652eb0411::simd::simd16931   simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
932 
933   // Bit-specific operations
bits_not_setsimdutf::arm64::__anone55652eb0411::simd::simd16934   simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
935   template<int N>
shrsimdutf::arm64::__anone55652eb0411::simd::simd16936   simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
937   template<int N>
shlsimdutf::arm64::__anone55652eb0411::simd::simd16938   simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
939 
940   // logical operations
operator |simdutf::arm64::__anone55652eb0411::simd::simd16941   simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
operator &simdutf::arm64::__anone55652eb0411::simd::simd16942   simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
operator ^simdutf::arm64::__anone55652eb0411::simd::simd16943   simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
944 
945   // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
packsimdutf::arm64::__anone55652eb0411::simd::simd16946   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
947     return vqmovn_high_u16(vqmovn_u16(v0), v1);
948   }
949 
950   // Change the endianness
swap_bytessimdutf::arm64::__anone55652eb0411::simd::simd16951   simdutf_really_inline simd16<uint16_t> swap_bytes() const {
952     #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
953     const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
954     #else
955     const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
956     #endif
957     return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
958   }
959 };
operator simd16<uint16_t>() const960 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
961 
962 
963   template<typename T>
964   struct simd16x32 {
965     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
966     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
967     simd16<T> chunks[NUM_CHUNKS];
968 
969     simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
970     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
971     simd16x32() = delete; // no default constructor allowed
972 
simd16x32simdutf::arm64::__anone55652eb0411::simd::simd16x32973     simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd16x32simdutf::arm64::__anone55652eb0411::simd::simd16x32974     simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
975 
storesimdutf::arm64::__anone55652eb0411::simd::simd16x32976     simdutf_really_inline void store(T* ptr) const {
977       this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
978       this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
979       this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
980       this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
981     }
982 
reduce_orsimdutf::arm64::__anone55652eb0411::simd::simd16x32983     simdutf_really_inline simd16<T> reduce_or() const {
984       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
985     }
986 
is_asciisimdutf::arm64::__anone55652eb0411::simd::simd16x32987     simdutf_really_inline bool is_ascii() const {
988       return reduce_or().is_ascii();
989     }
990 
store_ascii_as_utf16simdutf::arm64::__anone55652eb0411::simd::simd16x32991     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
992       this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
993       this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
994       this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
995       this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
996     }
997 
to_bitmasksimdutf::arm64::__anone55652eb0411::simd::simd16x32998     simdutf_really_inline uint64_t to_bitmask() const {
999 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
1000       const uint8x16_t bit_mask = make_uint8x16_t(
1001         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
1002         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
1003       );
1004 #else
1005       const uint8x16_t bit_mask = {
1006         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
1007         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
1008       };
1009 #endif
1010       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
1011       uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
1012       uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
1013       sum0 = vpaddq_u8(sum0, sum1);
1014       sum0 = vpaddq_u8(sum0, sum0);
1015       return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
1016     }
1017 
swap_bytessimdutf::arm64::__anone55652eb0411::simd::simd16x321018     simdutf_really_inline void swap_bytes() {
1019       this->chunks[0] = this->chunks[0].swap_bytes();
1020       this->chunks[1] = this->chunks[1].swap_bytes();
1021       this->chunks[2] = this->chunks[2].swap_bytes();
1022       this->chunks[3] = this->chunks[3].swap_bytes();
1023     }
1024 
eqsimdutf::arm64::__anone55652eb0411::simd::simd16x321025     simdutf_really_inline uint64_t eq(const T m) const {
1026     const simd16<T> mask = simd16<T>::splat(m);
1027     return  simd16x32<bool>(
1028       this->chunks[0] == mask,
1029       this->chunks[1] == mask,
1030       this->chunks[2] == mask,
1031       this->chunks[3] == mask
1032     ).to_bitmask();
1033   }
1034 
lteqsimdutf::arm64::__anone55652eb0411::simd::simd16x321035   simdutf_really_inline uint64_t lteq(const T m) const {
1036     const simd16<T> mask = simd16<T>::splat(m);
1037     return  simd16x32<bool>(
1038       this->chunks[0] <= mask,
1039       this->chunks[1] <= mask,
1040       this->chunks[2] <= mask,
1041       this->chunks[3] <= mask
1042     ).to_bitmask();
1043   }
1044 
in_rangesimdutf::arm64::__anone55652eb0411::simd::simd16x321045     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
1046       const simd16<T> mask_low = simd16<T>::splat(low);
1047       const simd16<T> mask_high = simd16<T>::splat(high);
1048 
1049       return  simd16x32<bool>(
1050         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
1051         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
1052         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
1053         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
1054       ).to_bitmask();
1055     }
not_in_rangesimdutf::arm64::__anone55652eb0411::simd::simd16x321056     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
1057       const simd16<T> mask_low = simd16<T>::splat(low);
1058       const simd16<T> mask_high = simd16<T>::splat(high);
1059       return  simd16x32<bool>(
1060         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
1061         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
1062         (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
1063         (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
1064       ).to_bitmask();
1065     }
ltsimdutf::arm64::__anone55652eb0411::simd::simd16x321066     simdutf_really_inline uint64_t lt(const T m) const {
1067       const simd16<T> mask = simd16<T>::splat(m);
1068       return  simd16x32<bool>(
1069         this->chunks[0] < mask,
1070         this->chunks[1] < mask,
1071         this->chunks[2] < mask,
1072         this->chunks[3] < mask
1073       ).to_bitmask();
1074     }
1075 
1076   }; // struct simd16x32<T>
1077   template<>
not_in_range(const uint16_t low,const uint16_t high) const1078   simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const {
1079       const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
1080       const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
1081       simd16x32<uint16_t> x(
1082         simd16<uint16_t>((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)),
1083         simd16<uint16_t>((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)),
1084         simd16<uint16_t>((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)),
1085         simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
1086       );
1087       return  x.to_bitmask();
1088     }
1089 /* end file src/simdutf/arm64/simd16-inl.h */
1090 } // namespace simd
1091 } // unnamed namespace
1092 } // namespace arm64
1093 } // namespace simdutf
1094 
1095 #endif // SIMDUTF_ARM64_SIMD_H
1096 /* end file src/simdutf/arm64/simd.h */
1097 
1098 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
1099 /* begin file src/simdutf/arm64/end.h */
1100 /* end file src/simdutf/arm64/end.h */
1101 
1102 #endif // SIMDUTF_IMPLEMENTATION_ARM64
1103 
1104 #endif // SIMDUTF_ARM64_H
1105 /* end file src/simdutf/arm64.h */
1106 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake.h
1107 /* begin file src/simdutf/icelake.h */
1108 #ifndef SIMDUTF_ICELAKE_H
1109 #define SIMDUTF_ICELAKE_H
1110 
1111 
1112 
1113 #ifdef __has_include
1114 // How do we detect that a compiler supports vbmi2?
1115 // For sure if the following header is found, we are ok?
1116 #if __has_include(<avx512vbmi2intrin.h>)
1117 #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
1118 #endif
1119 #endif
1120 
1121 #ifdef _MSC_VER
1122 #if _MSC_VER >= 1920
1123 // Visual Studio 2019 and up support VBMI2 under x64 even if the header
1124 // avx512vbmi2intrin.h is not found.
1125 #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
1126 #endif
1127 #endif
1128 
1129 // We allow icelake on x64 as long as the compiler is known to support VBMI2.
1130 #ifndef SIMDUTF_IMPLEMENTATION_ICELAKE
1131 #define SIMDUTF_IMPLEMENTATION_ICELAKE ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
1132 #endif
1133 
1134 // To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
1135 // https://github.com/simdutf/simdutf/issues/1247
1136 #define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && \
1137                                          SIMDUTF_HAS_AVX512DQ && \
1138                                          SIMDUTF_HAS_AVX512VL && \
1139                                            SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
1140 
1141 #if SIMDUTF_IMPLEMENTATION_ICELAKE
1142 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1143 #define SIMDUTF_TARGET_ICELAKE
1144 #else
1145 #define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt")
1146 #endif
1147 
1148 namespace simdutf {
1149 namespace icelake {
1150 } // namespace icelake
1151 } // namespace simdutf
1152 
1153 
1154 
1155 //
1156 // These two need to be included outside SIMDUTF_TARGET_REGION
1157 //
1158 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/intrinsics.h
1159 /* begin file src/simdutf/icelake/intrinsics.h */
1160 #ifndef SIMDUTF_ICELAKE_INTRINSICS_H
1161 #define SIMDUTF_ICELAKE_INTRINSICS_H
1162 
1163 
1164 #ifdef SIMDUTF_VISUAL_STUDIO
1165 // under clang within visual studio, this will include <x86intrin.h>
1166 #include <intrin.h>  // visual studio or clang
1167 #include <immintrin.h>
1168 #else
1169 
1170 #if SIMDUTF_GCC11ORMORE
1171 // We should not get warnings while including <x86intrin.h> yet we do
1172 // under some versions of GCC.
1173 // If the x86intrin.h header has uninitialized values that are problematic,
1174 // it is a GCC issue, we want to ignore these warnings.
1175 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
1176 #endif
1177 
1178 #include <x86intrin.h> // elsewhere
1179 
1180 
1181 #if SIMDUTF_GCC11ORMORE
1182 // cancels the suppression of the -Wuninitialized
1183 SIMDUTF_POP_DISABLE_WARNINGS
1184 #endif
1185 
1186 #ifndef _tzcnt_u64
1187 #define _tzcnt_u64(x) __tzcnt_u64(x)
1188 #endif // _tzcnt_u64
1189 #endif // SIMDUTF_VISUAL_STUDIO
1190 
1191 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
1192 /**
1193  * You are not supposed, normally, to include these
1194  * headers directly. Instead you should either include intrin.h
1195  * or x86intrin.h. However, when compiling with clang
1196  * under Windows (i.e., when _MSC_VER is set), these headers
1197  * only get included *if* the corresponding features are detected
1198  * from macros:
1199  * e.g., if __AVX2__ is set... in turn,  we normally set these
1200  * macros by compiling against the corresponding architecture
1201  * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
1202  * software with these advanced instructions. In simdutf, we
1203  * want to compile the whole program for a generic target,
1204  * and only target our specific kernels. As a workaround,
1205  * we directly include the needed headers. These headers would
1206  * normally guard against such usage, but we carefully included
1207  * <x86intrin.h>  (or <intrin.h>) before, so the headers
1208  * are fooled.
1209  */
1210 #include <bmiintrin.h>   // for _blsr_u64
1211 #include <bmi2intrin.h>  // for _pext_u64, _pdep_u64
1212 #include <lzcntintrin.h> // for  __lzcnt64
1213 #include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
1214 #include <smmintrin.h>
1215 #include <tmmintrin.h>
1216 #include <avxintrin.h>
1217 #include <avx2intrin.h>
1218 // Important: we need the AVX-512 headers:
1219 #include <avx512fintrin.h>
1220 #include <avx512dqintrin.h>
1221 #include <avx512cdintrin.h>
1222 #include <avx512bwintrin.h>
1223 #include <avx512vlintrin.h>
1224 #include <avx512vlbwintrin.h>
1225 #include <avx512vbmiintrin.h>
1226 #include <avx512vbmi2intrin.h>
// Unfortunately, we may not get _blsr_u64 from the headers above; clang
// normally provides it as a macro. When it is missing we roll our own:
// the expansion clears the lowest set bit of n.
// The argument is fully parenthesized so that compound expressions such as
// _blsr_u64(a | b) expand with the intended precedence. Note that n is
// evaluated twice, so it must not have side effects.
#ifndef _blsr_u64
#define _blsr_u64(n) (((n) - 1) & (n))
#endif //  _blsr_u64
1233 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
1234 
1235 
1236 
1237 #if defined(__GNUC__) && !defined(__clang__)
1238 
1239 #if __GNUC__ == 8
1240 #define SIMDUTF_GCC8 1
1241 #elif __GNUC__ == 9
1242 #define SIMDUTF_GCC9 1
1243 #endif //  __GNUC__ == 8 || __GNUC__ == 9
1244 
1245 #endif // defined(__GNUC__) && !defined(__clang__)
1246 
1247 #if SIMDUTF_GCC8
1248 #pragma GCC push_options
1249 #pragma GCC target("avx512f")
1250 /**
1251  * GCC 8 fails to provide _mm512_set_epi8. We roll our own.
1252  */
_mm512_set_epi8(uint8_t a0,uint8_t a1,uint8_t a2,uint8_t a3,uint8_t a4,uint8_t a5,uint8_t a6,uint8_t a7,uint8_t a8,uint8_t a9,uint8_t a10,uint8_t a11,uint8_t a12,uint8_t a13,uint8_t a14,uint8_t a15,uint8_t a16,uint8_t a17,uint8_t a18,uint8_t a19,uint8_t a20,uint8_t a21,uint8_t a22,uint8_t a23,uint8_t a24,uint8_t a25,uint8_t a26,uint8_t a27,uint8_t a28,uint8_t a29,uint8_t a30,uint8_t a31,uint8_t a32,uint8_t a33,uint8_t a34,uint8_t a35,uint8_t a36,uint8_t a37,uint8_t a38,uint8_t a39,uint8_t a40,uint8_t a41,uint8_t a42,uint8_t a43,uint8_t a44,uint8_t a45,uint8_t a46,uint8_t a47,uint8_t a48,uint8_t a49,uint8_t a50,uint8_t a51,uint8_t a52,uint8_t a53,uint8_t a54,uint8_t a55,uint8_t a56,uint8_t a57,uint8_t a58,uint8_t a59,uint8_t a60,uint8_t a61,uint8_t a62,uint8_t a63)1253 inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
1254   return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
1255                           uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
1256                           uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
1257                           uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
1258                           uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
1259                           uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
1260                           uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
1261                           uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
1262 }
1263 #pragma GCC pop_options
1264 #endif // SIMDUTF_GCC8
1265 
1266 #endif // SIMDUTF_ICELAKE_INTRINSICS_H
1267 /* end file src/simdutf/icelake/intrinsics.h */
1268 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/implementation.h
1269 /* begin file src/simdutf/icelake/implementation.h */
1270 #ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
1271 #define SIMDUTF_ICELAKE_IMPLEMENTATION_H
1272 
1273 
1274 namespace simdutf {
1275 namespace icelake {
1276 
1277 namespace {
1278 using namespace simdutf;
1279 }
1280 
1281 class implementation final : public simdutf::implementation {
1282 public:
implementation()1283   simdutf_really_inline implementation() : simdutf::implementation(
1284       "icelake",
1285       "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
1286       internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 ) {}
1287   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
1288   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
1289   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
1290   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
1291   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
1292   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
1293   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
1294   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
1295   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
1296   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
1297   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
1298   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1299   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1300   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1301   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1302   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1303   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1304   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1305   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1306   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1307   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1308   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1309   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1310   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1311   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1312   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1313   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1314   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1315   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1316   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1317   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1318   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1319   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1320   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1321   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1322   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1323   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1324   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1325   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1326   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1327   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1328   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
1329   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
1330   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
1331   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
1332   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1333   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1334   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1335   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1336   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
1337   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1338   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1339   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
1340 };
1341 
1342 } // namespace icelake
1343 } // namespace simdutf
1344 
1345 #endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H
1346 /* end file src/simdutf/icelake/implementation.h */
1347 
1348 //
1349 // The rest need to be inside the region
1350 //
1351 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
1352 /* begin file src/simdutf/icelake/begin.h */
1353 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
1354 // #define SIMDUTF_IMPLEMENTATION icelake
1355 
1356 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1357 // nothing needed.
1358 #else
1359 SIMDUTF_TARGET_ICELAKE
1360 #endif
1361 
1362 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1363 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
1364 #endif // end of workaround
1365 /* end file src/simdutf/icelake/begin.h */
1366 // Declarations
1367 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/bitmanipulation.h
1368 /* begin file src/simdutf/icelake/bitmanipulation.h */
1369 #ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
1370 #define SIMDUTF_ICELAKE_BITMANIPULATION_H
1371 
1372 namespace simdutf {
1373 namespace icelake {
1374 namespace {
1375 
1376 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)1377 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
1378   // note: we do not support legacy 32-bit Windows
1379   return __popcnt64(input_num);// Visual Studio wants two underscores
1380 }
1381 #else
1382 simdutf_really_inline long long int count_ones(uint64_t input_num) {
1383   return _popcnt64(input_num);
1384 }
1385 #endif
1386 
1387 } // unnamed namespace
1388 } // namespace icelake
1389 } // namespace simdutf
1390 
1391 #endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
1392 /* end file src/simdutf/icelake/bitmanipulation.h */
1393 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
1394 /* begin file src/simdutf/icelake/end.h */
1395 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1396 // nothing needed.
1397 #else
1398 SIMDUTF_UNTARGET_REGION
1399 #endif
1400 
1401 
1402 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1403 SIMDUTF_POP_DISABLE_WARNINGS
1404 #endif // end of workaround
1405 /* end file src/simdutf/icelake/end.h */
1406 
1407 
1408 
1409 #endif // SIMDUTF_IMPLEMENTATION_ICELAKE
1410 #endif // SIMDUTF_ICELAKE_H
1411 /* end file src/simdutf/icelake.h */
1412 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell.h
1413 /* begin file src/simdutf/haswell.h */
1414 #ifndef SIMDUTF_HASWELL_H
1415 #define SIMDUTF_HASWELL_H
1416 
1417 #ifdef SIMDUTF_WESTMERE_H
1418 #error "haswell.h must be included before westmere.h"
1419 #endif
1420 #ifdef SIMDUTF_FALLBACK_H
1421 #error "haswell.h must be included before fallback.h"
1422 #endif
1423 
1424 
1425 // Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
1426 // at runtime.
1427 #ifndef SIMDUTF_IMPLEMENTATION_HASWELL
1428 //
1429 // You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
1430 // because we want to rely on *runtime dispatch*.
1431 //
1432 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1433 #define SIMDUTF_IMPLEMENTATION_HASWELL 0
1434 #else
1435 #define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
1436 #endif
1437 
1438 #endif
1439 // To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
1440 // https://github.com/simdutf/simdutf/issues/1247
1441 #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
1442 
1443 #if SIMDUTF_IMPLEMENTATION_HASWELL
1444 
1445 #define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")
1446 
1447 namespace simdutf {
1448 /**
1449  * Implementation for Haswell (Intel AVX2).
1450  */
1451 namespace haswell {
1452 } // namespace haswell
1453 } // namespace simdutf
1454 
1455 //
1456 // These two need to be included outside SIMDUTF_TARGET_REGION
1457 //
1458 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h
1459 /* begin file src/simdutf/haswell/implementation.h */
1460 #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
1461 #define SIMDUTF_HASWELL_IMPLEMENTATION_H
1462 
1463 
1464 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
1465 namespace simdutf {
1466 namespace haswell {
1467 
1468 using namespace simdutf;
1469 
1470 class implementation final : public simdutf::implementation {
1471 public:
implementation()1472   simdutf_really_inline implementation() : simdutf::implementation(
1473       "haswell",
1474       "Intel/AMD AVX2",
1475       internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
1476   ) {}
1477   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
1478   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
1479   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
1480   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
1481   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
1482   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
1483   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
1484   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
1485   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
1486   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
1487   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
1488   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1489   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1490   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1491   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1492   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1493   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1494   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1495   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1496   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1497   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1498   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1499   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1500   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1501   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1502   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1503   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1504   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1505   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1506   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1507   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1508   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1509   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1510   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1511   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1512   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1513   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1514   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1515   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1516   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1517   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1518   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
1519   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
1520   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
1521   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
1522   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1523   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1524   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1525   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1526   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
1527   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1528   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1529   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
1530 };
1531 
1532 } // namespace haswell
1533 } // namespace simdutf
1534 
1535 #endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
1536 /* end file src/simdutf/haswell/implementation.h */
1537 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h
1538 /* begin file src/simdutf/haswell/intrinsics.h */
1539 #ifndef SIMDUTF_HASWELL_INTRINSICS_H
1540 #define SIMDUTF_HASWELL_INTRINSICS_H
1541 
1542 
1543 #ifdef SIMDUTF_VISUAL_STUDIO
1544 // under clang within visual studio, this will include <x86intrin.h>
1545 #include <intrin.h>  // visual studio or clang
1546 #else
1547 
1548 #if SIMDUTF_GCC11ORMORE
1549 // We should not get warnings while including <x86intrin.h> yet we do
1550 // under some versions of GCC.
1551 // If the x86intrin.h header has uninitialized values that are problematic,
1552 // it is a GCC issue, we want to ignore these warnings.
1553 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
1554 #endif
1555 
1556 #include <x86intrin.h> // elsewhere
1557 
1558 
1559 #if SIMDUTF_GCC11ORMORE
1560 // cancels the suppression of the -Wuninitialized
1561 SIMDUTF_POP_DISABLE_WARNINGS
1562 #endif
1563 
1564 #endif // SIMDUTF_VISUAL_STUDIO
1565 
1566 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
1567 /**
1568  * You are not supposed, normally, to include these
1569  * headers directly. Instead you should either include intrin.h
1570  * or x86intrin.h. However, when compiling with clang
1571  * under Windows (i.e., when _MSC_VER is set), these headers
1572  * only get included *if* the corresponding features are detected
1573  * from macros:
1574  * e.g., if __AVX2__ is set... in turn,  we normally set these
1575  * macros by compiling against the corresponding architecture
1576  * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
1577  * software with these advanced instructions. In simdutf, we
1578  * want to compile the whole program for a generic target,
1579  * and only target our specific kernels. As a workaround,
1580  * we directly include the needed headers. These headers would
1581  * normally guard against such usage, but we carefully included
1582  * <x86intrin.h>  (or <intrin.h>) before, so the headers
1583  * are fooled.
1584  */
1585 #include <bmiintrin.h>   // for _blsr_u64
1586 #include <lzcntintrin.h> // for  __lzcnt64
1587 #include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
1588 #include <smmintrin.h>
1589 #include <tmmintrin.h>
1590 #include <avxintrin.h>
1591 #include <avx2intrin.h>
1592 // unfortunately, we may not get _blsr_u64, but, thankfully, clang
1593 // has it as a macro.
#ifndef _blsr_u64
// We roll our own: clear the lowest set bit of n, i.e. n & (n - 1).
// The argument is fully parenthesized so that compound expressions such as
// _blsr_u64(a | b) expand with the intended precedence. Note that n is
// evaluated twice, so it must not have side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif //  _blsr_u64
1598 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
1599 
1600 #endif // SIMDUTF_HASWELL_INTRINSICS_H
1601 /* end file src/simdutf/haswell/intrinsics.h */
1602 
1603 //
1604 // The rest need to be inside the region
1605 //
1606 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
1607 /* begin file src/simdutf/haswell/begin.h */
1608 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
1609 // #define SIMDUTF_IMPLEMENTATION haswell
1610 
1611 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
1612 // nothing needed.
1613 #else
1614 SIMDUTF_TARGET_HASWELL
1615 #endif
1616 
1617 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1618 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
1619 #endif // end of workaround
1620 /* end file src/simdutf/haswell/begin.h */
1621 // Declarations
1622 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h
1623 /* begin file src/simdutf/haswell/bitmanipulation.h */
1624 #ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
1625 #define SIMDUTF_HASWELL_BITMANIPULATION_H
1626 
1627 namespace simdutf {
1628 namespace haswell {
1629 namespace {
1630 
1631 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)1632 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
1633   // note: we do not support legacy 32-bit Windows
1634   return __popcnt64(input_num);// Visual Studio wants two underscores
1635 }
1636 #else
1637 simdutf_really_inline long long int count_ones(uint64_t input_num) {
1638   return _popcnt64(input_num);
1639 }
1640 #endif
1641 
1642 } // unnamed namespace
1643 } // namespace haswell
1644 } // namespace simdutf
1645 
1646 #endif // SIMDUTF_HASWELL_BITMANIPULATION_H
1647 /* end file src/simdutf/haswell/bitmanipulation.h */
1648 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h
1649 /* begin file src/simdutf/haswell/simd.h */
1650 #ifndef SIMDUTF_HASWELL_SIMD_H
1651 #define SIMDUTF_HASWELL_SIMD_H
1652 
1653 
1654 namespace simdutf {
1655 namespace haswell {
1656 namespace {
1657 namespace simd {
1658 
1659   // Forward-declared so they can be used by splat and friends.
1660   template<typename Child>
1661   struct base {
1662     __m256i value;
1663 
1664     // Zero constructor
basesimdutf::haswell::__anone55652eb0911::simd::base1665     simdutf_really_inline base() : value{__m256i()} {}
1666 
1667     // Conversion from SIMD register
basesimdutf::haswell::__anone55652eb0911::simd::base1668     simdutf_really_inline base(const __m256i _value) : value(_value) {}
1669     // Conversion to SIMD register
operator const __m256i&simdutf::haswell::__anone55652eb0911::simd::base1670     simdutf_really_inline operator const __m256i&() const { return this->value; }
operator __m256i&simdutf::haswell::__anone55652eb0911::simd::base1671     simdutf_really_inline operator __m256i&() { return this->value; }
1672     template <endianness big_endian>
store_ascii_as_utf16simdutf::haswell::__anone55652eb0911::simd::base1673     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
1674       __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
1675       __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
1676       if (big_endian) {
1677         const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
1678                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
1679         first = _mm256_shuffle_epi8(first, swap);
1680         second = _mm256_shuffle_epi8(second, swap);
1681       }
1682       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
1683       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
1684     }
store_ascii_as_utf32simdutf::haswell::__anone55652eb0911::simd::base1685     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
1686       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
1687       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
1688       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
1689       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
1690     }
1691     // Bit operations
operator |simdutf::haswell::__anone55652eb0911::simd::base1692     simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
operator &simdutf::haswell::__anone55652eb0911::simd::base1693     simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
operator ^simdutf::haswell::__anone55652eb0911::simd::base1694     simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
bit_andnotsimdutf::haswell::__anone55652eb0911::simd::base1695     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
operator |=simdutf::haswell::__anone55652eb0911::simd::base1696     simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::haswell::__anone55652eb0911::simd::base1697     simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::haswell::__anone55652eb0911::simd::base1698     simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
1699   };
1700 
1701   // Forward-declared so they can be used by splat and friends.
1702   template<typename T>
1703   struct simd8;
1704 
1705   template<typename T, typename Mask=simd8<bool>>
1706   struct base8: base<simd8<T>> {
1707     typedef uint32_t bitmask_t;
1708     typedef uint64_t bitmask2_t;
1709 
base8simdutf::haswell::__anone55652eb0911::simd::base81710     simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::haswell::__anone55652eb0911::simd::base81711     simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
firstsimdutf::haswell::__anone55652eb0911::simd::base81712     simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); }
lastsimdutf::haswell::__anone55652eb0911::simd::base81713     simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); }
operator ==(const simd8<T> lhs,const simd8<T> rhs)1714     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
1715 
1716     static const int SIZE = sizeof(base<T>::value);
1717 
1718     template<int N=1>
prevsimdutf::haswell::__anone55652eb0911::simd::base81719     simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
1720       return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
1721     }
1722   };
1723 
1724   // SIMD byte mask type (returned by things like eq and gt)
1725   template<>
1726   struct simd8<bool>: base8<bool> {
splatsimdutf::haswell::__anone55652eb0911::simd::simd81727     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
1728 
simd8simdutf::haswell::__anone55652eb0911::simd::simd81729     simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::haswell::__anone55652eb0911::simd::simd81730     simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
1731     // Splat constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81732     simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
1733 
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd81734     simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); }
anysimdutf::haswell::__anone55652eb0911::simd::simd81735     simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
nonesimdutf::haswell::__anone55652eb0911::simd::simd81736     simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); }
allsimdutf::haswell::__anone55652eb0911::simd::simd81737     simdutf_really_inline bool all() const { return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; }
operator ~simdutf::haswell::__anone55652eb0911::simd::simd81738     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
1739   };
1740 
1741   template<typename T>
1742   struct base8_numeric: base8<T> {
splatsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1743     static simdutf_really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
zerosimdutf::haswell::__anone55652eb0911::simd::base8_numeric1744     static simdutf_really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
loadsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1745     static simdutf_really_inline simd8<T> load(const T values[32]) {
1746       return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
1747     }
1748     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anone55652eb0911::simd::base8_numeric1749     static simdutf_really_inline simd8<T> repeat_16(
1750       T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
1751       T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
1752     ) {
1753       return simd8<T>(
1754         v0, v1, v2, v3, v4, v5, v6, v7,
1755         v8, v9, v10,v11,v12,v13,v14,v15,
1756         v0, v1, v2, v3, v4, v5, v6, v7,
1757         v8, v9, v10,v11,v12,v13,v14,v15
1758       );
1759     }
1760 
base8_numericsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1761     simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::haswell::__anone55652eb0911::simd::base8_numeric1762     simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
1763 
1764     // Store to array
storesimdutf::haswell::__anone55652eb0911::simd::base8_numeric1765     simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
1766 
1767     // Addition/subtraction are the same for signed and unsigned
operator +simdutf::haswell::__anone55652eb0911::simd::base8_numeric1768     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
operator -simdutf::haswell::__anone55652eb0911::simd::base8_numeric1769     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
operator +=simdutf::haswell::__anone55652eb0911::simd::base8_numeric1770     simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
operator -=simdutf::haswell::__anone55652eb0911::simd::base8_numeric1771     simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
1772 
1773     // Override to distinguish from bool version
operator ~simdutf::haswell::__anone55652eb0911::simd::base8_numeric1774     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
1775 
1776     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
1777     template<typename L>
lookup_16simdutf::haswell::__anone55652eb0911::simd::base8_numeric1778     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
1779       return _mm256_shuffle_epi8(lookup_table, *this);
1780     }
1781 
1782     template<typename L>
lookup_16simdutf::haswell::__anone55652eb0911::simd::base8_numeric1783     simdutf_really_inline simd8<L> lookup_16(
1784         L replace0,  L replace1,  L replace2,  L replace3,
1785         L replace4,  L replace5,  L replace6,  L replace7,
1786         L replace8,  L replace9,  L replace10, L replace11,
1787         L replace12, L replace13, L replace14, L replace15) const {
1788       return lookup_16(simd8<L>::repeat_16(
1789         replace0,  replace1,  replace2,  replace3,
1790         replace4,  replace5,  replace6,  replace7,
1791         replace8,  replace9,  replace10, replace11,
1792         replace12, replace13, replace14, replace15
1793       ));
1794     }
1795   };
1796 
1797 
1798   // Signed bytes
1799   template<>
1800   struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::haswell::__anone55652eb0911::simd::simd81801     simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::haswell::__anone55652eb0911::simd::simd81802     simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
1803 
1804     // Splat constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81805     simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
1806     // Array constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81807     simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
1808     simdutf_really_inline operator simd8<uint8_t>() const;
1809     // Member-by-member initialization
simd8simdutf::haswell::__anone55652eb0911::simd::simd81810     simdutf_really_inline simd8(
1811       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
1812       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
1813       int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
1814       int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
1815     ) : simd8(_mm256_setr_epi8(
1816       v0, v1, v2, v3, v4, v5, v6, v7,
1817       v8, v9, v10,v11,v12,v13,v14,v15,
1818       v16,v17,v18,v19,v20,v21,v22,v23,
1819       v24,v25,v26,v27,v28,v29,v30,v31
1820     )) {}
1821     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anone55652eb0911::simd::simd81822     simdutf_really_inline static simd8<int8_t> repeat_16(
1823       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
1824       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
1825     ) {
1826       return simd8<int8_t>(
1827         v0, v1, v2, v3, v4, v5, v6, v7,
1828         v8, v9, v10,v11,v12,v13,v14,v15,
1829         v0, v1, v2, v3, v4, v5, v6, v7,
1830         v8, v9, v10,v11,v12,v13,v14,v15
1831       );
1832     }
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd81833     simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
1834     // Order-sensitive comparisons
max_valsimdutf::haswell::__anone55652eb0911::simd::simd81835     simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd81836     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
operator >simdutf::haswell::__anone55652eb0911::simd::simd81837     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd81838     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
1839   };
1840 
1841   // Unsigned bytes
1842   template<>
1843   struct simd8<uint8_t>: base8_numeric<uint8_t> {
simd8simdutf::haswell::__anone55652eb0911::simd::simd81844     simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::haswell::__anone55652eb0911::simd::simd81845     simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
1846     // Splat constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81847     simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
1848     // Array constructor
simd8simdutf::haswell::__anone55652eb0911::simd::simd81849     simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
1850     // Member-by-member initialization
simd8simdutf::haswell::__anone55652eb0911::simd::simd81851     simdutf_really_inline simd8(
1852       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
1853       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
1854       uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
1855       uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
1856     ) : simd8(_mm256_setr_epi8(
1857       v0, v1, v2, v3, v4, v5, v6, v7,
1858       v8, v9, v10,v11,v12,v13,v14,v15,
1859       v16,v17,v18,v19,v20,v21,v22,v23,
1860       v24,v25,v26,v27,v28,v29,v30,v31
1861     )) {}
1862     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anone55652eb0911::simd::simd81863     simdutf_really_inline static simd8<uint8_t> repeat_16(
1864       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
1865       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
1866     ) {
1867       return simd8<uint8_t>(
1868         v0, v1, v2, v3, v4, v5, v6, v7,
1869         v8, v9, v10,v11,v12,v13,v14,v15,
1870         v0, v1, v2, v3, v4, v5, v6, v7,
1871         v8, v9, v10,v11,v12,v13,v14,v15
1872       );
1873     }
1874 
1875 
1876     // Saturated math
saturating_addsimdutf::haswell::__anone55652eb0911::simd::simd81877     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
saturating_subsimdutf::haswell::__anone55652eb0911::simd::simd81878     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
1879 
1880     // Order-specific operations
max_valsimdutf::haswell::__anone55652eb0911::simd::simd81881     simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd81882     simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
1883     // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::haswell::__anone55652eb0911::simd::simd81884     simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
1885     // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::haswell::__anone55652eb0911::simd::simd81886     simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::haswell::__anone55652eb0911::simd::simd81887     simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::haswell::__anone55652eb0911::simd::simd81888     simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::haswell::__anone55652eb0911::simd::simd81889     simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd81890     simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
1891 
1892     // Bit-specific operations
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd81893     simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd81894     simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd81895     simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd81896     simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd81897     simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81898     simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81899     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81900     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd81901     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
1902     template<int N>
shrsimdutf::haswell::__anone55652eb0911::simd::simd81903     simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
1904     template<int N>
shlsimdutf::haswell::__anone55652eb0911::simd::simd81905     simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
1906     // Get one of the bits and make a bitmask out of it.
1907     // e.g. value.get_bit<7>() gets the high bit
1908     template<int N>
get_bitsimdutf::haswell::__anone55652eb0911::simd::simd81909     simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
1910   };
operator simd8<uint8_t>() const1911   simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
1912 
1913 
1914   template<typename T>
1915   struct simd8x64 {
1916     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
1917     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
1918     simd8<T> chunks[NUM_CHUNKS];
1919 
1920     simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
1921     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
1922     simd8x64() = delete; // no default constructor allowed
1923 
simd8x64simdutf::haswell::__anone55652eb0911::simd::simd8x641924     simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
simd8x64simdutf::haswell::__anone55652eb0911::simd::simd8x641925     simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
1926 
storesimdutf::haswell::__anone55652eb0911::simd::simd8x641927     simdutf_really_inline void store(T* ptr) const {
1928       this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
1929       this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
1930     }
1931 
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd8x641932     simdutf_really_inline uint64_t to_bitmask() const {
1933       uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
1934       uint64_t r_hi =                       this->chunks[1].to_bitmask();
1935       return r_lo | (r_hi << 32);
1936     }
1937 
operator |=simdutf::haswell::__anone55652eb0911::simd::simd8x641938     simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T> &other) {
1939       this->chunks[0] |= other.chunks[0];
1940       this->chunks[1] |= other.chunks[1];
1941       return *this;
1942     }
1943 
reduce_orsimdutf::haswell::__anone55652eb0911::simd::simd8x641944     simdutf_really_inline simd8<T> reduce_or() const {
1945       return this->chunks[0] | this->chunks[1];
1946     }
1947 
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd8x641948     simdutf_really_inline bool is_ascii() const {
1949       return this->reduce_or().is_ascii();
1950     }
1951 
1952     template <endianness endian>
store_ascii_as_utf16simdutf::haswell::__anone55652eb0911::simd::simd8x641953     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
1954       this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
1955       this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
1956     }
1957 
store_ascii_as_utf32simdutf::haswell::__anone55652eb0911::simd::simd8x641958     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
1959       this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
1960       this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
1961     }
1962 
bit_orsimdutf::haswell::__anone55652eb0911::simd::simd8x641963     simdutf_really_inline simd8x64<T> bit_or(const T m) const {
1964       const simd8<T> mask = simd8<T>::splat(m);
1965       return simd8x64<T>(
1966         this->chunks[0] | mask,
1967         this->chunks[1] | mask
1968       );
1969     }
1970 
eqsimdutf::haswell::__anone55652eb0911::simd::simd8x641971     simdutf_really_inline uint64_t eq(const T m) const {
1972       const simd8<T> mask = simd8<T>::splat(m);
1973       return  simd8x64<bool>(
1974         this->chunks[0] == mask,
1975         this->chunks[1] == mask
1976       ).to_bitmask();
1977     }
1978 
eqsimdutf::haswell::__anone55652eb0911::simd::simd8x641979     simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
1980       return  simd8x64<bool>(
1981         this->chunks[0] == other.chunks[0],
1982         this->chunks[1] == other.chunks[1]
1983       ).to_bitmask();
1984     }
1985 
lteqsimdutf::haswell::__anone55652eb0911::simd::simd8x641986     simdutf_really_inline uint64_t lteq(const T m) const {
1987       const simd8<T> mask = simd8<T>::splat(m);
1988       return  simd8x64<bool>(
1989         this->chunks[0] <= mask,
1990         this->chunks[1] <= mask
1991       ).to_bitmask();
1992     }
1993 
in_rangesimdutf::haswell::__anone55652eb0911::simd::simd8x641994     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
1995       const simd8<T> mask_low = simd8<T>::splat(low);
1996       const simd8<T> mask_high = simd8<T>::splat(high);
1997 
1998       return  simd8x64<bool>(
1999         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2000         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
2001         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
2002         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
2003       ).to_bitmask();
2004     }
not_in_rangesimdutf::haswell::__anone55652eb0911::simd::simd8x642005     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2006       const simd8<T> mask_low = simd8<T>::splat(low);
2007       const simd8<T> mask_high = simd8<T>::splat(high);
2008       return  simd8x64<bool>(
2009         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
2010         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
2011       ).to_bitmask();
2012     }
ltsimdutf::haswell::__anone55652eb0911::simd::simd8x642013     simdutf_really_inline uint64_t lt(const T m) const {
2014       const simd8<T> mask = simd8<T>::splat(m);
2015       return  simd8x64<bool>(
2016         this->chunks[0] < mask,
2017         this->chunks[1] < mask
2018       ).to_bitmask();
2019     }
2020 
gtsimdutf::haswell::__anone55652eb0911::simd::simd8x642021     simdutf_really_inline uint64_t gt(const T m) const {
2022       const simd8<T> mask = simd8<T>::splat(m);
2023       return  simd8x64<bool>(
2024         this->chunks[0] > mask,
2025         this->chunks[1] > mask
2026       ).to_bitmask();
2027     }
gteqsimdutf::haswell::__anone55652eb0911::simd::simd8x642028     simdutf_really_inline uint64_t gteq(const T m) const {
2029       const simd8<T> mask = simd8<T>::splat(m);
2030       return  simd8x64<bool>(
2031         this->chunks[0] >= mask,
2032         this->chunks[1] >= mask
2033       ).to_bitmask();
2034     }
gteq_unsignedsimdutf::haswell::__anone55652eb0911::simd::simd8x642035     simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
2036       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
2037       return  simd8x64<bool>(
2038         (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
2039         (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
2040       ).to_bitmask();
2041     }
2042   }; // struct simd8x64<T>
2043 
2044 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h
2045 /* begin file src/simdutf/haswell/simd16-inl.h */
// Fallback definitions of _mm256_set_m128i / _mm256_setr_m128i for compilers
// that report __GNUC__ < 8 (presumably because their headers lack them).
// Both expand to the same permute: the 128-bit value named `lo` lands in the
// low half of the result and the one named `hi` in the high half.
#if defined(__GNUC__) && (__GNUC__ < 8)
#define _mm256_set_m128i(hi, lo) \
  _mm256_permute2f128_si256(_mm256_castsi128_si256(hi), _mm256_castsi128_si256(lo), 2)
#define _mm256_setr_m128i(lo, hi) \
  _mm256_permute2f128_si256(_mm256_castsi128_si256(hi), _mm256_castsi128_si256(lo), 2)
#endif
2052 
2053 template<typename T>
2054 struct simd16;
2055 
2056 template<typename T, typename Mask=simd16<bool>>
2057 struct base16: base<simd16<T>> {
2058   using bitmask_type = uint32_t;
2059 
base16simdutf::haswell::__anone55652eb0911::simd::base162060   simdutf_really_inline base16() : base<simd16<T>>() {}
base16simdutf::haswell::__anone55652eb0911::simd::base162061   simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
2062   template <typename Pointer>
base16simdutf::haswell::__anone55652eb0911::simd::base162063   simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
operator ==(const simd16<T> lhs,const simd16<T> rhs)2064   friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); }
2065 
2066   /// the size of vector in bytes
2067   static const int SIZE = sizeof(base<simd16<T>>::value);
2068 
2069   /// the number of elements of type T a vector can hold
2070   static const int ELEMENTS = SIZE / sizeof(T);
2071 
2072   template<int N=1>
prevsimdutf::haswell::__anone55652eb0911::simd::base162073   simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
2074     return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
2075   }
2076 };
2077 
2078 // SIMD byte mask type (returned by things like eq and gt)
2079 template<>
2080 struct simd16<bool>: base16<bool> {
splatsimdutf::haswell::__anone55652eb0911::simd::simd162081   static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
2082 
simd16simdutf::haswell::__anone55652eb0911::simd::simd162083   simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162084   simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
2085   // Splat constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162086   simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
2087 
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd162088   simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
anysimdutf::haswell::__anone55652eb0911::simd::simd162089   simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
operator ~simdutf::haswell::__anone55652eb0911::simd::simd162090   simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
2091 };
2092 
2093 template<typename T>
2094 struct base16_numeric: base16<T> {
splatsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2095   static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
zerosimdutf::haswell::__anone55652eb0911::simd::base16_numeric2096   static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
loadsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2097   static simdutf_really_inline simd16<T> load(const T values[8]) {
2098     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
2099   }
2100 
base16_numericsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2101   simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::haswell::__anone55652eb0911::simd::base16_numeric2102   simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
2103 
2104   // Store to array
storesimdutf::haswell::__anone55652eb0911::simd::base16_numeric2105   simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
2106 
2107   // Override to distinguish from bool version
operator ~simdutf::haswell::__anone55652eb0911::simd::base16_numeric2108   simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
2109 
2110   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::haswell::__anone55652eb0911::simd::base16_numeric2111   simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
operator -simdutf::haswell::__anone55652eb0911::simd::base16_numeric2112   simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
operator +=simdutf::haswell::__anone55652eb0911::simd::base16_numeric2113   simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::haswell::__anone55652eb0911::simd::base16_numeric2114   simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
2115 };
2116 
2117 // Signed words
2118 template<>
2119 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::haswell::__anone55652eb0911::simd::simd162120   simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162121   simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
2122   // Splat constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162123   simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
2124   // Array constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162125   simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162126   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
2127   // Order-sensitive comparisons
max_valsimdutf::haswell::__anone55652eb0911::simd::simd162128   simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd162129   simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
operator >simdutf::haswell::__anone55652eb0911::simd::simd162130   simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd162131   simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
2132 };
2133 
2134 // Unsigned words
2135 template<>
2136 struct simd16<uint16_t>: base16_numeric<uint16_t>  {
simd16simdutf::haswell::__anone55652eb0911::simd::simd162137   simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162138   simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
2139 
2140   // Splat constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162141   simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
2142   // Array constructor
simd16simdutf::haswell::__anone55652eb0911::simd::simd162143   simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::haswell::__anone55652eb0911::simd::simd162144   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
2145 
2146   // Saturated math
saturating_addsimdutf::haswell::__anone55652eb0911::simd::simd162147   simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
saturating_subsimdutf::haswell::__anone55652eb0911::simd::simd162148   simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
2149 
2150   // Order-specific operations
max_valsimdutf::haswell::__anone55652eb0911::simd::simd162151   simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
min_valsimdutf::haswell::__anone55652eb0911::simd::simd162152   simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
2153   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::haswell::__anone55652eb0911::simd::simd162154   simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
2155   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::haswell::__anone55652eb0911::simd::simd162156   simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::haswell::__anone55652eb0911::simd::simd162157   simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::haswell::__anone55652eb0911::simd::simd162158   simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::haswell::__anone55652eb0911::simd::simd162159   simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::haswell::__anone55652eb0911::simd::simd162160   simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
2161 
2162   // Bit-specific operations
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd162163   simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
bits_not_setsimdutf::haswell::__anone55652eb0911::simd::simd162164   simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd162165   simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::haswell::__anone55652eb0911::simd::simd162166   simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
2167 
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162168   simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162169   simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162170   simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
any_bits_set_anywheresimdutf::haswell::__anone55652eb0911::simd::simd162171   simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
2172   template<int N>
shrsimdutf::haswell::__anone55652eb0911::simd::simd162173   simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
2174   template<int N>
shlsimdutf::haswell::__anone55652eb0911::simd::simd162175   simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
2176   // Get one of the bits and make a bitmask out of it.
2177   // e.g. value.get_bit<7>() gets the high bit
2178   template<int N>
get_bitsimdutf::haswell::__anone55652eb0911::simd::simd162179   simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); }
2180 
2181   // Change the endianness
swap_bytessimdutf::haswell::__anone55652eb0911::simd::simd162182   simdutf_really_inline simd16<uint16_t> swap_bytes() const {
2183     const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
2184                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
2185     return _mm256_shuffle_epi8(*this, swap);
2186   }
2187 
2188   // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
packsimdutf::haswell::__anone55652eb0911::simd::simd162189   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
2190     // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
2191     //       we have to shuffle lanes in order to produce bytes in the
2192     //       correct order.
2193 
2194     // get the 0th lanes
2195     const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
2196     const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
2197 
2198     // get the 1st lanes
2199     const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
2200     const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
2201 
2202     // build new vectors (shuffle lanes)
2203     const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
2204     const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
2205 
2206     // pack words in linear order from v0 and v1
2207     return _mm256_packus_epi16(t0, t1);
2208   }
2209 };
2210 
2211 
2212   template<typename T>
2213   struct simd16x32 {
2214     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
2215     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
2216     simd16<T> chunks[NUM_CHUNKS];
2217 
2218     simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
2219     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
2220     simd16x32() = delete; // no default constructor allowed
2221 
simd16x32simdutf::haswell::__anone55652eb0911::simd::simd16x322222     simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) : chunks{chunk0, chunk1} {}
simd16x32simdutf::haswell::__anone55652eb0911::simd::simd16x322223     simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T))} {}
2224 
storesimdutf::haswell::__anone55652eb0911::simd::simd16x322225     simdutf_really_inline void store(T* ptr) const {
2226       this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
2227       this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
2228     }
2229 
to_bitmasksimdutf::haswell::__anone55652eb0911::simd::simd16x322230     simdutf_really_inline uint64_t to_bitmask() const {
2231       uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
2232       uint64_t r_hi =                       this->chunks[1].to_bitmask();
2233       return r_lo | (r_hi << 32);
2234     }
2235 
reduce_orsimdutf::haswell::__anone55652eb0911::simd::simd16x322236     simdutf_really_inline simd16<T> reduce_or() const {
2237       return this->chunks[0] | this->chunks[1];
2238     }
2239 
is_asciisimdutf::haswell::__anone55652eb0911::simd::simd16x322240     simdutf_really_inline bool is_ascii() const {
2241       return this->reduce_or().is_ascii();
2242     }
2243 
store_ascii_as_utf16simdutf::haswell::__anone55652eb0911::simd::simd16x322244     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
2245       this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
2246       this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>));
2247     }
2248 
bit_orsimdutf::haswell::__anone55652eb0911::simd::simd16x322249     simdutf_really_inline simd16x32<T> bit_or(const T m) const {
2250       const simd16<T> mask = simd16<T>::splat(m);
2251       return simd16x32<T>(
2252         this->chunks[0] | mask,
2253         this->chunks[1] | mask
2254       );
2255     }
2256 
swap_bytessimdutf::haswell::__anone55652eb0911::simd::simd16x322257     simdutf_really_inline void swap_bytes() {
2258       this->chunks[0] = this->chunks[0].swap_bytes();
2259       this->chunks[1] = this->chunks[1].swap_bytes();
2260     }
2261 
eqsimdutf::haswell::__anone55652eb0911::simd::simd16x322262     simdutf_really_inline uint64_t eq(const T m) const {
2263       const simd16<T> mask = simd16<T>::splat(m);
2264       return  simd16x32<bool>(
2265         this->chunks[0] == mask,
2266         this->chunks[1] == mask
2267       ).to_bitmask();
2268     }
2269 
eqsimdutf::haswell::__anone55652eb0911::simd::simd16x322270     simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
2271       return  simd16x32<bool>(
2272         this->chunks[0] == other.chunks[0],
2273         this->chunks[1] == other.chunks[1]
2274       ).to_bitmask();
2275     }
2276 
lteqsimdutf::haswell::__anone55652eb0911::simd::simd16x322277     simdutf_really_inline uint64_t lteq(const T m) const {
2278       const simd16<T> mask = simd16<T>::splat(m);
2279       return  simd16x32<bool>(
2280         this->chunks[0] <= mask,
2281         this->chunks[1] <= mask
2282       ).to_bitmask();
2283     }
2284 
in_rangesimdutf::haswell::__anone55652eb0911::simd::simd16x322285     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
2286       const simd16<T> mask_low = simd16<T>::splat(low);
2287       const simd16<T> mask_high = simd16<T>::splat(high);
2288 
2289       return  simd16x32<bool>(
2290         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2291         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
2292         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
2293         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
2294       ).to_bitmask();
2295     }
not_in_rangesimdutf::haswell::__anone55652eb0911::simd::simd16x322296     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2297       const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
2298       const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
2299       return simd16x32<bool>(
2300         (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
2301         (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
2302       ).to_bitmask();
2303     }
ltsimdutf::haswell::__anone55652eb0911::simd::simd16x322304     simdutf_really_inline uint64_t lt(const T m) const {
2305       const simd16<T> mask = simd16<T>::splat(m);
2306       return  simd16x32<bool>(
2307         this->chunks[0] < mask,
2308         this->chunks[1] < mask
2309       ).to_bitmask();
2310     }
2311   }; // struct simd16x32<T>
2312 /* end file src/simdutf/haswell/simd16-inl.h */
2313 
2314 } // namespace simd
2315 
2316 } // unnamed namespace
2317 } // namespace haswell
2318 } // namespace simdutf
2319 
2320 #endif // SIMDUTF_HASWELL_SIMD_H
2321 /* end file src/simdutf/haswell/simd.h */
2322 
2323 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
2324 /* begin file src/simdutf/haswell/end.h */
2325 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
2326 // nothing needed.
2327 #else
2328 SIMDUTF_UNTARGET_REGION
2329 #endif
2330 
2331 
2332 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
2333 SIMDUTF_POP_DISABLE_WARNINGS
2334 #endif // end of workaround
2335 /* end file src/simdutf/haswell/end.h */
2336 
2337 #endif // SIMDUTF_IMPLEMENTATION_HASWELL
2338 #endif // SIMDUTF_HASWELL_COMMON_H
2339 /* end file src/simdutf/haswell.h */
2340 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere.h
2341 /* begin file src/simdutf/westmere.h */
2342 #ifndef SIMDUTF_WESTMERE_H
2343 #define SIMDUTF_WESTMERE_H
2344 
2345 #ifdef SIMDUTF_FALLBACK_H
2346 #error "westmere.h must be included before fallback.h"
2347 #endif
2348 
2349 
// Westmere is enabled by default on x86-64, unless an Icelake or Haswell
// kernel is already known to run on every target machine.
#if !defined(SIMDUTF_IMPLEMENTATION_WESTMERE)
//
// You do not want to set it to (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL)
// because you want to rely on runtime dispatch!
//
#if !(SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL)
#define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
#else
#define SIMDUTF_IMPLEMENTATION_WESTMERE 0
#endif

#endif // !defined(SIMDUTF_IMPLEMENTATION_WESTMERE)

// Nonzero only when the Westmere kernel is compiled in, the target is x86-64,
// and the compiler defines __SSE4_2__ to a nonzero value.
#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
2365 
2366 #if SIMDUTF_IMPLEMENTATION_WESTMERE
2367 
2368 #define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")
2369 
2370 namespace simdutf {
2371 /**
2372  * Implementation for Westmere (Intel SSE4.2).
2373  */
2374 namespace westmere {
2375 } // namespace westmere
2376 } // namespace simdutf
2377 
2378 //
2379 // These two need to be included outside SIMDUTF_TARGET_REGION
2380 //
2381 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h
2382 /* begin file src/simdutf/westmere/implementation.h */
2383 #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
2384 #define SIMDUTF_WESTMERE_IMPLEMENTATION_H
2385 
2386 
2387 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
2388 namespace simdutf {
2389 namespace westmere {
2390 
2391 namespace {
2392 using namespace simdutf;
2393 }
2394 
2395 class implementation final : public simdutf::implementation {
2396 public:
implementation()2397   simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42) {}
2398   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
2399   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
2400   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
2401   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
2402   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
2403   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
2404   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
2405   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
2406   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
2407   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
2408   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
2409   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2410   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2411   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2412   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2413   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2414   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2415   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
2416   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
2417   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2418   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2419   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2420   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2421   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2422   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2423   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2424   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2425   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2426   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2427   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2428   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2429   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2430   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2431   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2432   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2433   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2434   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2435   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2436   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2437   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2438   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2439   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
2440   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
2441   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
2442   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
2443   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2444   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2445   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2446   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2447   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
2448   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2449   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2450   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
2451 };
2452 
2453 } // namespace westmere
2454 } // namespace simdutf
2455 
2456 #endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
2457 /* end file src/simdutf/westmere/implementation.h */
2458 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h
2459 /* begin file src/simdutf/westmere/intrinsics.h */
2460 #ifndef SIMDUTF_WESTMERE_INTRINSICS_H
2461 #define SIMDUTF_WESTMERE_INTRINSICS_H
2462 
2463 #ifdef SIMDUTF_VISUAL_STUDIO
2464 // under clang within visual studio, this will include <x86intrin.h>
2465 #include <intrin.h> // visual studio or clang
2466 #else
2467 
2468 #if SIMDUTF_GCC11ORMORE
2469 // We should not get warnings while including <x86intrin.h> yet we do
2470 // under some versions of GCC.
2471 // If the x86intrin.h header has uninitialized values that are problematic,
2472 // it is a GCC issue, we want to ignore these warnings.
2473 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
2474 #endif
2475 
2476 #include <x86intrin.h> // elsewhere
2477 
2478 
2479 #if SIMDUTF_GCC11ORMORE
2480 // cancels the suppression of the -Wuninitialized
2481 SIMDUTF_POP_DISABLE_WARNINGS
2482 #endif
2483 
2484 #endif // SIMDUTF_VISUAL_STUDIO
2485 
2486 
2487 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
2488 /**
2489  * You are not supposed, normally, to include these
2490  * headers directly. Instead you should either include intrin.h
2491  * or x86intrin.h. However, when compiling with clang
2492  * under Windows (i.e., when _MSC_VER is set), these headers
2493  * only get included *if* the corresponding features are detected
2494  * from macros:
2495  */
2496 #include <smmintrin.h>  // for _mm_alignr_epi8
2497 #endif
2498 
2499 
2500 
2501 #endif // SIMDUTF_WESTMERE_INTRINSICS_H
2502 /* end file src/simdutf/westmere/intrinsics.h */
2503 
2504 //
2505 // The rest need to be inside the region
2506 //
2507 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
2508 /* begin file src/simdutf/westmere/begin.h */
2509 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
2510 // #define SIMDUTF_IMPLEMENTATION westmere
2511 
2512 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
2513 // nothing needed.
2514 #else
2515 SIMDUTF_TARGET_WESTMERE
2516 #endif
2517 /* end file src/simdutf/westmere/begin.h */
2518 
2519 // Declarations
2520 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h
2521 /* begin file src/simdutf/westmere/bitmanipulation.h */
2522 #ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
2523 #define SIMDUTF_WESTMERE_BITMANIPULATION_H
2524 
2525 namespace simdutf {
2526 namespace westmere {
2527 namespace {
2528 
2529 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)2530 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
2531   // note: we do not support legacy 32-bit Windows
2532   return __popcnt64(input_num);// Visual Studio wants two underscores
2533 }
2534 #else
2535 simdutf_really_inline long long int count_ones(uint64_t input_num) {
2536   return _popcnt64(input_num);
2537 }
2538 #endif
2539 
2540 } // unnamed namespace
2541 } // namespace westmere
2542 } // namespace simdutf
2543 
2544 #endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
2545 /* end file src/simdutf/westmere/bitmanipulation.h */
2546 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h
2547 /* begin file src/simdutf/westmere/simd.h */
2548 #ifndef SIMDUTF_WESTMERE_SIMD_H
2549 #define SIMDUTF_WESTMERE_SIMD_H
2550 
2551 namespace simdutf {
2552 namespace westmere {
2553 namespace {
2554 namespace simd {
2555 
2556   template<typename Child>
2557   struct base {
2558     __m128i value;
2559 
2560     // Zero constructor
basesimdutf::westmere::__anone55652eb0c11::simd::base2561     simdutf_really_inline base() : value{__m128i()} {}
2562 
2563     // Conversion from SIMD register
basesimdutf::westmere::__anone55652eb0c11::simd::base2564     simdutf_really_inline base(const __m128i _value) : value(_value) {}
2565     // Conversion to SIMD register
operator const __m128i&simdutf::westmere::__anone55652eb0c11::simd::base2566     simdutf_really_inline operator const __m128i&() const { return this->value; }
operator __m128i&simdutf::westmere::__anone55652eb0c11::simd::base2567     simdutf_really_inline operator __m128i&() { return this->value; }
2568     template <endianness big_endian>
store_ascii_as_utf16simdutf::westmere::__anone55652eb0c11::simd::base2569     simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
2570       __m128i first = _mm_cvtepu8_epi16(*this);
2571       __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
2572       if (big_endian) {
2573         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
2574         first = _mm_shuffle_epi8(first, swap);
2575         second = _mm_shuffle_epi8(second, swap);
2576       }
2577       _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
2578       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
2579     }
store_ascii_as_utf32simdutf::westmere::__anone55652eb0c11::simd::base2580     simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
2581       _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
2582       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
2583       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
2584       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
2585     }
2586     // Bit operations
operator |simdutf::westmere::__anone55652eb0c11::simd::base2587     simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
operator &simdutf::westmere::__anone55652eb0c11::simd::base2588     simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
operator ^simdutf::westmere::__anone55652eb0c11::simd::base2589     simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
bit_andnotsimdutf::westmere::__anone55652eb0c11::simd::base2590     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
operator |=simdutf::westmere::__anone55652eb0c11::simd::base2591     simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::westmere::__anone55652eb0c11::simd::base2592     simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::westmere::__anone55652eb0c11::simd::base2593     simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
2594   };
2595 
2596   // Forward-declared so they can be used by splat and friends.
2597   template<typename T>
2598   struct simd8;
2599 
2600   template<typename T, typename Mask=simd8<bool>>
2601   struct base8: base<simd8<T>> {
2602     typedef uint16_t bitmask_t;
2603     typedef uint32_t bitmask2_t;
2604 
firstsimdutf::westmere::__anone55652eb0c11::simd::base82605     simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); }
lastsimdutf::westmere::__anone55652eb0c11::simd::base82606     simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); }
base8simdutf::westmere::__anone55652eb0c11::simd::base82607     simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::westmere::__anone55652eb0c11::simd::base82608     simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
2609 
operator ==(const simd8<T> lhs,const simd8<T> rhs)2610     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
2611 
2612     static const int SIZE = sizeof(base<simd8<T>>::value);
2613 
2614     template<int N=1>
prevsimdutf::westmere::__anone55652eb0c11::simd::base82615     simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
2616       return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
2617     }
2618   };
2619 
2620   // SIMD byte mask type (returned by things like eq and gt)
2621   template<>
2622   struct simd8<bool>: base8<bool> {
splatsimdutf::westmere::__anone55652eb0c11::simd::simd82623     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
2624 
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82625     simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82626     simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
2627     // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82628     simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
2629 
to_bitmasksimdutf::westmere::__anone55652eb0c11::simd::simd82630     simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
anysimdutf::westmere::__anone55652eb0c11::simd::simd82631     simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
nonesimdutf::westmere::__anone55652eb0c11::simd::simd82632     simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); }
allsimdutf::westmere::__anone55652eb0c11::simd::simd82633     simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; }
operator ~simdutf::westmere::__anone55652eb0c11::simd::simd82634     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
2635   };
2636 
2637   template<typename T>
2638   struct base8_numeric: base8<T> {
splatsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2639     static simdutf_really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
zerosimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2640     static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
loadsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2641     static simdutf_really_inline simd8<T> load(const T values[16]) {
2642       return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2643     }
2644     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2645     static simdutf_really_inline simd8<T> repeat_16(
2646       T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
2647       T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
2648     ) {
2649       return simd8<T>(
2650         v0, v1, v2, v3, v4, v5, v6, v7,
2651         v8, v9, v10,v11,v12,v13,v14,v15
2652       );
2653     }
2654 
base8_numericsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2655     simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2656     simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
2657 
2658     // Store to array
storesimdutf::westmere::__anone55652eb0c11::simd::base8_numeric2659     simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
2660 
2661     // Override to distinguish from bool version
operator ~simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2662     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
2663 
2664     // Addition/subtraction are the same for signed and unsigned
operator +simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2665     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
operator -simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2666     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
operator +=simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2667     simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
operator -=simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2668     simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
2669 
2670     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
2671     template<typename L>
lookup_16simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2672     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
2673       return _mm_shuffle_epi8(lookup_table, *this);
2674     }
2675 
2676     template<typename L>
lookup_16simdutf::westmere::__anone55652eb0c11::simd::base8_numeric2677     simdutf_really_inline simd8<L> lookup_16(
2678         L replace0,  L replace1,  L replace2,  L replace3,
2679         L replace4,  L replace5,  L replace6,  L replace7,
2680         L replace8,  L replace9,  L replace10, L replace11,
2681         L replace12, L replace13, L replace14, L replace15) const {
2682       return lookup_16(simd8<L>::repeat_16(
2683         replace0,  replace1,  replace2,  replace3,
2684         replace4,  replace5,  replace6,  replace7,
2685         replace8,  replace9,  replace10, replace11,
2686         replace12, replace13, replace14, replace15
2687       ));
2688     }
2689   };
2690 
2691   // Signed bytes
2692   template<>
2693   struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82694     simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82695     simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
2696     // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82697     simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
2698     // Array constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82699     simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
2700     // Member-by-member initialization
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82701     simdutf_really_inline simd8(
2702       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
2703       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
2704     ) : simd8(_mm_setr_epi8(
2705       v0, v1, v2, v3, v4, v5, v6, v7,
2706       v8, v9, v10,v11,v12,v13,v14,v15
2707     )) {}
2708     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::simd82709     simdutf_really_inline static simd8<int8_t> repeat_16(
2710       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
2711       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
2712     ) {
2713       return simd8<int8_t>(
2714         v0, v1, v2, v3, v4, v5, v6, v7,
2715         v8, v9, v10,v11,v12,v13,v14,v15
2716       );
2717     }
2718     simdutf_really_inline operator simd8<uint8_t>() const;
is_asciisimdutf::westmere::__anone55652eb0c11::simd::simd82719     simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
2720 
2721     // Order-sensitive comparisons
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd82722     simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd82723     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
operator >simdutf::westmere::__anone55652eb0c11::simd::simd82724     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
operator <simdutf::westmere::__anone55652eb0c11::simd::simd82725     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
2726   };
2727 
2728   // Unsigned bytes
2729   template<>
2730   struct simd8<uint8_t>: base8_numeric<uint8_t>  {
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82731     simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82732     simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
2733 
2734     // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82735     simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
2736     // Array constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82737     simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
2738     // Member-by-member initialization
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82739     simdutf_really_inline simd8(
2740       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
2741       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
2742     ) : simd8(_mm_setr_epi8(
2743       v0, v1, v2, v3, v4, v5, v6, v7,
2744       v8, v9, v10,v11,v12,v13,v14,v15
2745     )) {}
2746     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::simd82747     simdutf_really_inline static simd8<uint8_t> repeat_16(
2748       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
2749       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
2750     ) {
2751       return simd8<uint8_t>(
2752         v0, v1, v2, v3, v4, v5, v6, v7,
2753         v8, v9, v10,v11,v12,v13,v14,v15
2754       );
2755     }
2756 
2757     // Saturated math
saturating_addsimdutf::westmere::__anone55652eb0c11::simd::simd82758     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
saturating_subsimdutf::westmere::__anone55652eb0c11::simd::simd82759     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
2760 
2761     // Order-specific operations
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd82762     simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd82763     simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
2764     // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82765     simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
2766     // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82767     simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anone55652eb0c11::simd::simd82768     simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anone55652eb0c11::simd::simd82769     simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::westmere::__anone55652eb0c11::simd::simd82770     simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::westmere::__anone55652eb0c11::simd::simd82771     simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
2772 
2773     // Bit-specific operations
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd82774     simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd82775     simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd82776     simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd82777     simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
is_asciisimdutf::westmere::__anone55652eb0c11::simd::simd82778     simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
2779 
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82780     simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82781     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82782     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82783     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
2784     template<int N>
shrsimdutf::westmere::__anone55652eb0c11::simd::simd82785     simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
2786     template<int N>
shlsimdutf::westmere::__anone55652eb0c11::simd::simd82787     simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
2788     // Get one of the bits and make a bitmask out of it.
2789     // e.g. value.get_bit<7>() gets the high bit
2790     template<int N>
get_bitsimdutf::westmere::__anone55652eb0c11::simd::simd82791     simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
2792   };
operator simd8<uint8_t>() const2793   simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
2794 
2795   // Unsigned bytes
2796   template<>
2797   struct simd8<uint16_t>: base<uint16_t> {
splatsimdutf::westmere::__anone55652eb0c11::simd::simd82798     static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { return _mm_set1_epi16(_value); }
loadsimdutf::westmere::__anone55652eb0c11::simd::simd82799     static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) {
2800       return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2801     }
2802 
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82803     simdutf_really_inline simd8() : base<uint16_t>() {}
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82804     simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
2805     // Splat constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82806     simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
2807     // Array constructor
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82808     simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {}
2809     // Member-by-member initialization
simd8simdutf::westmere::__anone55652eb0c11::simd::simd82810     simdutf_really_inline simd8(
2811       uint16_t v0,  uint16_t v1,  uint16_t v2,  uint16_t v3,  uint16_t v4,  uint16_t v5,  uint16_t v6,  uint16_t v7
2812     ) : simd8(_mm_setr_epi16(
2813       v0, v1, v2, v3, v4, v5, v6, v7
2814     )) {}
2815 
2816     // Saturated math
saturating_addsimdutf::westmere::__anone55652eb0c11::simd::simd82817     simdutf_really_inline simd8<uint16_t> saturating_add(const simd8<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
saturating_subsimdutf::westmere::__anone55652eb0c11::simd::simd82818     simdutf_really_inline simd8<uint16_t> saturating_sub(const simd8<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
2819 
2820     // Order-specific operations
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd82821     simdutf_really_inline simd8<uint16_t> max_val(const simd8<uint16_t> other) const { return _mm_max_epu16(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd82822     simdutf_really_inline simd8<uint16_t> min_val(const simd8<uint16_t> other) const { return _mm_min_epu16(*this, other); }
2823     // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82824     simdutf_really_inline simd8<uint16_t> gt_bits(const simd8<uint16_t> other) const { return this->saturating_sub(other); }
2825     // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd82826     simdutf_really_inline simd8<uint16_t> lt_bits(const simd8<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anone55652eb0c11::simd::simd82827     simdutf_really_inline simd8<bool> operator<=(const simd8<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anone55652eb0c11::simd::simd82828     simdutf_really_inline simd8<bool> operator>=(const simd8<uint16_t> other) const { return other.min_val(*this) == other; }
operator ==simdutf::westmere::__anone55652eb0c11::simd::simd82829     simdutf_really_inline simd8<bool> operator==(const simd8<uint16_t> other) const { return _mm_cmpeq_epi16(*this, other); }
operator &simdutf::westmere::__anone55652eb0c11::simd::simd82830     simdutf_really_inline simd8<bool> operator&(const simd8<uint16_t> other) const { return _mm_and_si128(*this, other); }
operator |simdutf::westmere::__anone55652eb0c11::simd::simd82831     simdutf_really_inline simd8<bool> operator|(const simd8<uint16_t> other) const { return _mm_or_si128(*this, other); }
2832 
2833     // Bit-specific operations
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd82834     simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint16_t(0); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd82835     simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
2836 
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82837     simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82838     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82839     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd82840     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
2841      };
2842   template<typename T>
2843   struct simd8x64 {
2844     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
2845     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
2846     simd8<T> chunks[NUM_CHUNKS];
2847 
2848     simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
2849     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
2850     simd8x64() = delete; // no default constructor allowed
2851 
simd8x64simdutf::westmere::__anone55652eb0c11::simd::simd8x642852     simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd8x64simdutf::westmere::__anone55652eb0c11::simd::simd8x642853     simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
2854 
storesimdutf::westmere::__anone55652eb0c11::simd::simd8x642855     simdutf_really_inline void store(T* ptr) const {
2856       this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
2857       this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
2858       this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
2859       this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
2860     }
2861 
operator |=simdutf::westmere::__anone55652eb0c11::simd::simd8x642862     simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
2863       this->chunks[0] |= other.chunks[0];
2864       this->chunks[1] |= other.chunks[1];
2865       this->chunks[2] |= other.chunks[2];
2866       this->chunks[3] |= other.chunks[3];
2867       return *this;
2868     }
2869 
reduce_orsimdutf::westmere::__anone55652eb0c11::simd::simd8x642870     simdutf_really_inline simd8<T> reduce_or() const {
2871       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
2872     }
2873 
is_asciisimdutf::westmere::__anone55652eb0c11::simd::simd8x642874     simdutf_really_inline bool is_ascii() const {
2875       return this->reduce_or().is_ascii();
2876     }
2877 
2878     template <endianness endian>
store_ascii_as_utf16simdutf::westmere::__anone55652eb0c11::simd::simd8x642879     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
2880       this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
2881       this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
2882       this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
2883       this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
2884     }
2885 
store_ascii_as_utf32simdutf::westmere::__anone55652eb0c11::simd::simd8x642886     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
2887       this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
2888       this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
2889       this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
2890       this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
2891     }
2892 
to_bitmasksimdutf::westmere::__anone55652eb0c11::simd::simd8x642893     simdutf_really_inline uint64_t to_bitmask() const {
2894       uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
2895       uint64_t r1 =          this->chunks[1].to_bitmask() ;
2896       uint64_t r2 =          this->chunks[2].to_bitmask() ;
2897       uint64_t r3 =          this->chunks[3].to_bitmask() ;
2898       return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
2899     }
2900 
eqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642901     simdutf_really_inline uint64_t eq(const T m) const {
2902       const simd8<T> mask = simd8<T>::splat(m);
2903       return  simd8x64<bool>(
2904         this->chunks[0] == mask,
2905         this->chunks[1] == mask,
2906         this->chunks[2] == mask,
2907         this->chunks[3] == mask
2908       ).to_bitmask();
2909     }
2910 
eqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642911     simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
2912       return  simd8x64<bool>(
2913         this->chunks[0] == other.chunks[0],
2914         this->chunks[1] == other.chunks[1],
2915         this->chunks[2] == other.chunks[2],
2916         this->chunks[3] == other.chunks[3]
2917       ).to_bitmask();
2918     }
2919 
lteqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642920     simdutf_really_inline uint64_t lteq(const T m) const {
2921       const simd8<T> mask = simd8<T>::splat(m);
2922       return  simd8x64<bool>(
2923         this->chunks[0] <= mask,
2924         this->chunks[1] <= mask,
2925         this->chunks[2] <= mask,
2926         this->chunks[3] <= mask
2927       ).to_bitmask();
2928     }
2929 
in_rangesimdutf::westmere::__anone55652eb0c11::simd::simd8x642930     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
2931       const simd8<T> mask_low = simd8<T>::splat(low);
2932       const simd8<T> mask_high = simd8<T>::splat(high);
2933 
2934       return  simd8x64<bool>(
2935         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2936         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
2937         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
2938         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
2939       ).to_bitmask();
2940     }
not_in_rangesimdutf::westmere::__anone55652eb0c11::simd::simd8x642941     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2942       const simd8<T> mask_low = simd8<T>::splat(low-1);
2943       const simd8<T> mask_high = simd8<T>::splat(high+1);
2944       return simd8x64<bool>(
2945         (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
2946         (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
2947         (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
2948         (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
2949       ).to_bitmask();
2950     }
ltsimdutf::westmere::__anone55652eb0c11::simd::simd8x642951     simdutf_really_inline uint64_t lt(const T m) const {
2952       const simd8<T> mask = simd8<T>::splat(m);
2953       return  simd8x64<bool>(
2954         this->chunks[0] < mask,
2955         this->chunks[1] < mask,
2956         this->chunks[2] < mask,
2957         this->chunks[3] < mask
2958       ).to_bitmask();
2959     }
2960 
gtsimdutf::westmere::__anone55652eb0c11::simd::simd8x642961     simdutf_really_inline uint64_t gt(const T m) const {
2962       const simd8<T> mask = simd8<T>::splat(m);
2963       return  simd8x64<bool>(
2964         this->chunks[0] > mask,
2965         this->chunks[1] > mask,
2966         this->chunks[2] > mask,
2967         this->chunks[3] > mask
2968       ).to_bitmask();
2969     }
gteqsimdutf::westmere::__anone55652eb0c11::simd::simd8x642970     simdutf_really_inline uint64_t gteq(const T m) const {
2971       const simd8<T> mask = simd8<T>::splat(m);
2972       return  simd8x64<bool>(
2973         this->chunks[0] >= mask,
2974         this->chunks[1] >= mask,
2975         this->chunks[2] >= mask,
2976         this->chunks[3] >= mask
2977       ).to_bitmask();
2978     }
gteq_unsignedsimdutf::westmere::__anone55652eb0c11::simd::simd8x642979     simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
2980       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
2981       return  simd8x64<bool>(
2982         simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
2983         simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
2984         simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
2985         simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
2986       ).to_bitmask();
2987     }
2988   }; // struct simd8x64<T>
2989 
2990 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h
2991 /* begin file src/simdutf/westmere/simd16-inl.h */
2992 template<typename T>
2993 struct simd16;
2994 
2995 template<typename T, typename Mask=simd16<bool>>
2996 struct base16: base<simd16<T>> {
2997   typedef uint16_t bitmask_t;
2998   typedef uint32_t bitmask2_t;
2999 
base16simdutf::westmere::__anone55652eb0c11::simd::base163000   simdutf_really_inline base16() : base<simd16<T>>() {}
base16simdutf::westmere::__anone55652eb0c11::simd::base163001   simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
3002   template <typename Pointer>
base16simdutf::westmere::__anone55652eb0c11::simd::base163003   simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
3004 
operator ==(const simd16<T> lhs,const simd16<T> rhs)3005   friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
3006 
3007   static const int SIZE = sizeof(base<simd16<T>>::value);
3008 
3009   template<int N=1>
prevsimdutf::westmere::__anone55652eb0c11::simd::base163010   simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
3011     return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
3012   }
3013 };
3014 
3015 // SIMD byte mask type (returned by things like eq and gt)
3016 template<>
3017 struct simd16<bool>: base16<bool> {
splatsimdutf::westmere::__anone55652eb0c11::simd::simd163018   static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
3019 
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163020   simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163021   simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
3022   // Splat constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163023   simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
3024 
to_bitmasksimdutf::westmere::__anone55652eb0c11::simd::simd163025   simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
anysimdutf::westmere::__anone55652eb0c11::simd::simd163026   simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
operator ~simdutf::westmere::__anone55652eb0c11::simd::simd163027   simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
3028 };
3029 
3030 template<typename T>
3031 struct base16_numeric: base16<T> {
splatsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3032   static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
zerosimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3033   static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
loadsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3034   static simdutf_really_inline simd16<T> load(const T values[8]) {
3035     return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
3036   }
3037 
base16_numericsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3038   simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3039   simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
3040 
3041   // Store to array
storesimdutf::westmere::__anone55652eb0c11::simd::base16_numeric3042   simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
3043 
3044   // Override to distinguish from bool version
operator ~simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3045   simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
3046 
3047   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3048   simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
operator -simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3049   simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
operator +=simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3050   simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::westmere::__anone55652eb0c11::simd::base16_numeric3051   simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
3052 };
3053 
3054 // Signed words
3055 template<>
3056 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163057   simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163058   simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
3059   // Splat constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163060   simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
3061   // Array constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163062   simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163063   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
3064   // Member-by-member initialization
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163065   simdutf_really_inline simd16(
3066     int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
3067     : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
3068   simdutf_really_inline operator simd16<uint16_t>() const;
3069 
3070   // Order-sensitive comparisons
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd163071   simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd163072   simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
operator >simdutf::westmere::__anone55652eb0c11::simd::simd163073   simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
operator <simdutf::westmere::__anone55652eb0c11::simd::simd163074   simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
3075 };
3076 
3077 // Unsigned words
3078 template<>
3079 struct simd16<uint16_t>: base16_numeric<uint16_t>  {
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163080   simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163081   simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
3082 
3083   // Splat constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163084   simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
3085   // Array constructor
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163086   simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163087   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
3088   // Member-by-member initialization
simd16simdutf::westmere::__anone55652eb0c11::simd::simd163089   simdutf_really_inline simd16(
3090     uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
3091   : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
3092   // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anone55652eb0c11::simd::simd163093   simdutf_really_inline static simd16<uint16_t> repeat_16(
3094     uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
3095   ) {
3096     return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
3097   }
3098 
3099   // Saturated math
saturating_addsimdutf::westmere::__anone55652eb0c11::simd::simd163100   simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
saturating_subsimdutf::westmere::__anone55652eb0c11::simd::simd163101   simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
3102 
3103   // Order-specific operations
max_valsimdutf::westmere::__anone55652eb0c11::simd::simd163104   simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
min_valsimdutf::westmere::__anone55652eb0c11::simd::simd163105   simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
3106   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd163107   simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
3108   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anone55652eb0c11::simd::simd163109   simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anone55652eb0c11::simd::simd163110   simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anone55652eb0c11::simd::simd163111   simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::westmere::__anone55652eb0c11::simd::simd163112   simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::westmere::__anone55652eb0c11::simd::simd163113   simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
3114 
3115   // Bit-specific operations
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd163116   simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
bits_not_setsimdutf::westmere::__anone55652eb0c11::simd::simd163117   simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd163118   simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::westmere::__anone55652eb0c11::simd::simd163119   simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
3120 
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd163121   simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd163122   simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd163123   simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anone55652eb0c11::simd::simd163124   simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
3125   template<int N>
shrsimdutf::westmere::__anone55652eb0c11::simd::simd163126   simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
3127   template<int N>
shlsimdutf::westmere::__anone55652eb0c11::simd::simd163128   simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
3129   // Get one of the bits and make a bitmask out of it.
3130   // e.g. value.get_bit<7>() gets the high bit
3131   template<int N>
get_bitsimdutf::westmere::__anone55652eb0c11::simd::simd163132   simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
3133 
3134   // Change the endianness
swap_bytessimdutf::westmere::__anone55652eb0c11::simd::simd163135   simdutf_really_inline simd16<uint16_t> swap_bytes() const {
3136     const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
3137     return _mm_shuffle_epi8(*this, swap);
3138   }
3139 
3140   // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
packsimdutf::westmere::__anone55652eb0c11::simd::simd163141   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
3142     return _mm_packus_epi16(v0, v1);
3143   }
3144 };
operator simd16<uint16_t>() const3145 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
3146 
3147 template<typename T>
3148   struct simd16x32 {
3149     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
3150     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
3151     simd16<T> chunks[NUM_CHUNKS];
3152 
3153     simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
3154     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
3155     simd16x32() = delete; // no default constructor allowed
3156 
simd16x32simdutf::westmere::__anone55652eb0c11::simd::simd16x323157     simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd16x32simdutf::westmere::__anone55652eb0c11::simd::simd16x323158     simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
3159 
storesimdutf::westmere::__anone55652eb0c11::simd::simd16x323160     simdutf_really_inline void store(T* ptr) const {
3161       this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
3162       this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
3163       this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
3164       this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
3165     }
3166 
reduce_orsimdutf::westmere::__anone55652eb0c11::simd::simd16x323167     simdutf_really_inline simd16<T> reduce_or() const {
3168       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
3169     }
3170 
is_asciisimdutf::westmere::__anone55652eb0c11::simd::simd16x323171     simdutf_really_inline bool is_ascii() const {
3172       return this->reduce_or().is_ascii();
3173     }
3174 
store_ascii_as_utf16simdutf::westmere::__anone55652eb0c11::simd::simd16x323175     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
3176       this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
3177       this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
3178       this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
3179       this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
3180     }
3181 
to_bitmasksimdutf::westmere::__anone55652eb0c11::simd::simd16x323182     simdutf_really_inline uint64_t to_bitmask() const {
3183       uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
3184       uint64_t r1 =          this->chunks[1].to_bitmask() ;
3185       uint64_t r2 =          this->chunks[2].to_bitmask() ;
3186       uint64_t r3 =          this->chunks[3].to_bitmask() ;
3187       return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
3188     }
3189 
swap_bytessimdutf::westmere::__anone55652eb0c11::simd::simd16x323190     simdutf_really_inline void swap_bytes() {
3191       this->chunks[0] = this->chunks[0].swap_bytes();
3192       this->chunks[1] = this->chunks[1].swap_bytes();
3193       this->chunks[2] = this->chunks[2].swap_bytes();
3194       this->chunks[3] = this->chunks[3].swap_bytes();
3195     }
3196 
eqsimdutf::westmere::__anone55652eb0c11::simd::simd16x323197     simdutf_really_inline uint64_t eq(const T m) const {
3198       const simd16<T> mask = simd16<T>::splat(m);
3199       return  simd16x32<bool>(
3200         this->chunks[0] == mask,
3201         this->chunks[1] == mask,
3202         this->chunks[2] == mask,
3203         this->chunks[3] == mask
3204       ).to_bitmask();
3205     }
3206 
eqsimdutf::westmere::__anone55652eb0c11::simd::simd16x323207     simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
3208       return  simd16x32<bool>(
3209         this->chunks[0] == other.chunks[0],
3210         this->chunks[1] == other.chunks[1],
3211         this->chunks[2] == other.chunks[2],
3212         this->chunks[3] == other.chunks[3]
3213       ).to_bitmask();
3214     }
3215 
lteqsimdutf::westmere::__anone55652eb0c11::simd::simd16x323216     simdutf_really_inline uint64_t lteq(const T m) const {
3217       const simd16<T> mask = simd16<T>::splat(m);
3218       return  simd16x32<bool>(
3219         this->chunks[0] <= mask,
3220         this->chunks[1] <= mask,
3221         this->chunks[2] <= mask,
3222         this->chunks[3] <= mask
3223       ).to_bitmask();
3224     }
3225 
in_rangesimdutf::westmere::__anone55652eb0c11::simd::simd16x323226     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
3227       const simd16<T> mask_low = simd16<T>::splat(low);
3228       const simd16<T> mask_high = simd16<T>::splat(high);
3229 
3230       return  simd16x32<bool>(
3231         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
3232         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
3233         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
3234         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
3235       ).to_bitmask();
3236     }
not_in_rangesimdutf::westmere::__anone55652eb0c11::simd::simd16x323237     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
3238       const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
3239       const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
3240       return simd16x32<bool>(
3241         (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
3242         (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
3243         (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
3244         (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
3245       ).to_bitmask();
3246     }
ltsimdutf::westmere::__anone55652eb0c11::simd::simd16x323247     simdutf_really_inline uint64_t lt(const T m) const {
3248       const simd16<T> mask = simd16<T>::splat(m);
3249       return  simd16x32<bool>(
3250         this->chunks[0] < mask,
3251         this->chunks[1] < mask,
3252         this->chunks[2] < mask,
3253         this->chunks[3] < mask
3254       ).to_bitmask();
3255     }
3256   }; // struct simd16x32<T>
3257 /* end file src/simdutf/westmere/simd16-inl.h */
3258 
3259 } // namespace simd
3260 } // unnamed namespace
3261 } // namespace westmere
3262 } // namespace simdutf
3263 
3264 #endif // SIMDUTF_WESTMERE_SIMD_INPUT_H
3265 /* end file src/simdutf/westmere/simd.h */
3266 
3267 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
3268 /* begin file src/simdutf/westmere/end.h */
// Emits SIMDUTF_UNTARGET_REGION only when SIMDUTF_CAN_ALWAYS_RUN_WESTMERE is
// false; otherwise nothing is emitted.
// NOTE(review): presumably this closes a target-specific code region opened
// before the westmere kernel - confirm against the macro's definition.
3269 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
3270 // nothing needed.
3271 #else
3272 SIMDUTF_UNTARGET_REGION
3273 #endif
3274 
3275 /* end file src/simdutf/westmere/end.h */
3276 
3277 #endif // SIMDUTF_IMPLEMENTATION_WESTMERE
3278 #endif // SIMDUTF_WESTMERE_COMMON_H
3279 /* end file src/simdutf/westmere.h */
3280 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h
3281 /* begin file src/simdutf/ppc64.h */
3282 #ifndef SIMDUTF_PPC64_H
3283 #define SIMDUTF_PPC64_H
3284 
3285 #ifdef SIMDUTF_FALLBACK_H
3286 #error "ppc64.h must be included before fallback.h"
3287 #endif
3288 
3289 
// The ppc64 kernel is compiled by default exactly when the target is PPC64;
// defining SIMDUTF_IMPLEMENTATION_PPC64 beforehand overrides that choice.
#ifndef SIMDUTF_IMPLEMENTATION_PPC64
#define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
#endif
// Parenthesized so the macro behaves as a single operand: without the
// parentheses, `!SIMDUTF_CAN_ALWAYS_RUN_PPC64` would negate only the first
// term of the conjunction.
#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 (SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64)
3294 
3295 
3296 
3297 #if SIMDUTF_IMPLEMENTATION_PPC64
3298 
namespace simdutf {
/**
 * Namespace of the ALTIVEC (PPC64) implementation.
 * It is opened here without any members.
 */
namespace ppc64 {} // namespace ppc64
} // namespace simdutf
3306 
3307 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h
3308 /* begin file src/simdutf/ppc64/implementation.h */
3309 #ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
3310 #define SIMDUTF_PPC64_IMPLEMENTATION_H
3311 
3312 
3313 namespace simdutf {
3314 namespace ppc64 {
3315 
3316 namespace {
3317 using namespace simdutf;
3318 } // namespace
3319 
3320 class implementation final : public simdutf::implementation {
3321 public:
implementation()3322   simdutf_really_inline implementation()
3323       : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
3324                                  internal::instruction_set::ALTIVEC) {}
3325   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
3326   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
3327   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
3328   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
3329   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
3330   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
3331   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
3332   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
3333   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
3334   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
3335   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
3336   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3337   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3338   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3339   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3340   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3341   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3342   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
3343   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
3344   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3345   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3346   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3347   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3348   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3349   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3350   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3351   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3352   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3353   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3354   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3355   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3356   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3357   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3358   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3359   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3360   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3361   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3362   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3363   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3364   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3365   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3366   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
3367   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
3368   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
3369   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
3370   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
3371   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
3372   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
3373   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
3374   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
3375   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
3376   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
3377   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
3378 };
3379 
3380 } // namespace ppc64
3381 } // namespace simdutf
3382 
3383 #endif // SIMDUTF_PPC64_IMPLEMENTATION_H
3384 /* end file src/simdutf/ppc64/implementation.h */
3385 
3386 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
3387 /* begin file src/simdutf/ppc64/begin.h */
3388 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
3389 // #define SIMDUTF_IMPLEMENTATION ppc64
3390 /* end file src/simdutf/ppc64/begin.h */
3391 
3392 // Declarations
3393 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h
3394 /* begin file src/simdutf/ppc64/intrinsics.h */
3395 #ifndef SIMDUTF_PPC64_INTRINSICS_H
3396 #define SIMDUTF_PPC64_INTRINSICS_H
3397 
3398 
3399 // This should be the correct header whether
3400 // you use visual studio or other compilers.
3401 #include <altivec.h>
3402 
// These are defined by altivec.h in the GCC toolchain; it is safe to undef them.
3404 #ifdef bool
3405 #undef bool
3406 #endif
3407 
3408 #ifdef vector
3409 #undef vector
3410 #endif
3411 
3412 #endif //  SIMDUTF_PPC64_INTRINSICS_H
3413 /* end file src/simdutf/ppc64/intrinsics.h */
3414 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h
3415 /* begin file src/simdutf/ppc64/bitmanipulation.h */
3416 #ifndef SIMDUTF_PPC64_BITMANIPULATION_H
3417 #define SIMDUTF_PPC64_BITMANIPULATION_H
3418 
3419 namespace simdutf {
3420 namespace ppc64 {
3421 namespace {
3422 
3423 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)3424 simdutf_really_inline int count_ones(uint64_t input_num) {
3425   // note: we do not support legacy 32-bit Windows
3426   return __popcnt64(input_num); // Visual Studio wants two underscores
3427 }
3428 #else
3429 simdutf_really_inline int count_ones(uint64_t input_num) {
3430   return __builtin_popcountll(input_num);
3431 }
3432 #endif
3433 
3434 } // unnamed namespace
3435 } // namespace ppc64
3436 } // namespace simdutf
3437 
3438 #endif // SIMDUTF_PPC64_BITMANIPULATION_H
3439 /* end file src/simdutf/ppc64/bitmanipulation.h */
3440 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h
3441 /* begin file src/simdutf/ppc64/simd.h */
3442 #ifndef SIMDUTF_PPC64_SIMD_H
3443 #define SIMDUTF_PPC64_SIMD_H
3444 
3445 #include <type_traits>
3446 
3447 namespace simdutf {
3448 namespace ppc64 {
3449 namespace {
3450 namespace simd {
3451 
3452 using __m128i = __vector unsigned char;
3453 
3454 template <typename Child> struct base {
3455   __m128i value;
3456 
3457   // Zero constructor
basesimdutf::ppc64::__anone55652eb0f11::simd::base3458   simdutf_really_inline base() : value{__m128i()} {}
3459 
3460   // Conversion from SIMD register
basesimdutf::ppc64::__anone55652eb0f11::simd::base3461   simdutf_really_inline base(const __m128i _value) : value(_value) {}
3462 
3463   // Conversion to SIMD register
operator const __m128i&simdutf::ppc64::__anone55652eb0f11::simd::base3464   simdutf_really_inline operator const __m128i &() const {
3465     return this->value;
3466   }
operator __m128i&simdutf::ppc64::__anone55652eb0f11::simd::base3467   simdutf_really_inline operator __m128i &() { return this->value; }
3468 
3469   // Bit operations
operator |simdutf::ppc64::__anone55652eb0f11::simd::base3470   simdutf_really_inline Child operator|(const Child other) const {
3471     return vec_or(this->value, (__m128i)other);
3472   }
operator &simdutf::ppc64::__anone55652eb0f11::simd::base3473   simdutf_really_inline Child operator&(const Child other) const {
3474     return vec_and(this->value, (__m128i)other);
3475   }
operator ^simdutf::ppc64::__anone55652eb0f11::simd::base3476   simdutf_really_inline Child operator^(const Child other) const {
3477     return vec_xor(this->value, (__m128i)other);
3478   }
bit_andnotsimdutf::ppc64::__anone55652eb0f11::simd::base3479   simdutf_really_inline Child bit_andnot(const Child other) const {
3480     return vec_andc(this->value, (__m128i)other);
3481   }
operator |=simdutf::ppc64::__anone55652eb0f11::simd::base3482   simdutf_really_inline Child &operator|=(const Child other) {
3483     auto this_cast = static_cast<Child*>(this);
3484     *this_cast = *this_cast | other;
3485     return *this_cast;
3486   }
operator &=simdutf::ppc64::__anone55652eb0f11::simd::base3487   simdutf_really_inline Child &operator&=(const Child other) {
3488     auto this_cast = static_cast<Child*>(this);
3489     *this_cast = *this_cast & other;
3490     return *this_cast;
3491   }
operator ^=simdutf::ppc64::__anone55652eb0f11::simd::base3492   simdutf_really_inline Child &operator^=(const Child other) {
3493     auto this_cast = static_cast<Child*>(this);
3494     *this_cast = *this_cast ^ other;
3495     return *this_cast;
3496   }
3497 };
3498 
3499 // Forward-declared so they can be used by splat and friends.
3500 template <typename T> struct simd8;
3501 
3502 template <typename T, typename Mask = simd8<bool>>
3503 struct base8 : base<simd8<T>> {
3504   typedef uint16_t bitmask_t;
3505   typedef uint32_t bitmask2_t;
3506 
base8simdutf::ppc64::__anone55652eb0f11::simd::base83507   simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::ppc64::__anone55652eb0f11::simd::base83508   simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
3509 
operator ==(const simd8<T> lhs,const simd8<T> rhs)3510   friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) {
3511     return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
3512   }
3513 
3514   static const int SIZE = sizeof(base<simd8<T>>::value);
3515 
3516   template <int N = 1>
prevsimdutf::ppc64::__anone55652eb0f11::simd::base83517   simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const {
3518     __m128i chunk = this->value;
3519 #ifdef __LITTLE_ENDIAN__
3520     chunk = (__m128i)vec_reve(this->value);
3521     prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
3522 #endif
3523     chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
3524 #ifdef __LITTLE_ENDIAN__
3525     chunk = (__m128i)vec_reve((__m128i)chunk);
3526 #endif
3527     return chunk;
3528   }
3529 };
3530 
3531 // SIMD byte mask type (returned by things like eq and gt)
3532 template <> struct simd8<bool> : base8<bool> {
splatsimdutf::ppc64::__anone55652eb0f11::simd::simd83533   static simdutf_really_inline simd8<bool> splat(bool _value) {
3534     return (__m128i)vec_splats((unsigned char)(-(!!_value)));
3535   }
3536 
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83537   simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83538   simdutf_really_inline simd8<bool>(const __m128i _value)
3539       : base8<bool>(_value) {}
3540   // Splat constructor
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83541   simdutf_really_inline simd8<bool>(bool _value)
3542       : base8<bool>(splat(_value)) {}
3543 
to_bitmasksimdutf::ppc64::__anone55652eb0f11::simd::simd83544   simdutf_really_inline int to_bitmask() const {
3545     __vector unsigned long long result;
3546     const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
3547                                0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
3548 
3549     result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
3550                                                        (__m128i)perm_mask));
3551 #ifdef __LITTLE_ENDIAN__
3552     return static_cast<int>(result[1]);
3553 #else
3554     return static_cast<int>(result[0]);
3555 #endif
3556   }
anysimdutf::ppc64::__anone55652eb0f11::simd::simd83557   simdutf_really_inline bool any() const {
3558     return !vec_all_eq(this->value, (__m128i)vec_splats(0));
3559   }
operator ~simdutf::ppc64::__anone55652eb0f11::simd::simd83560   simdutf_really_inline simd8<bool> operator~() const {
3561     return this->value ^ (__m128i)splat(true);
3562   }
3563 };
3564 
3565 template <typename T> struct base8_numeric : base8<T> {
splatsimdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3566   static simdutf_really_inline simd8<T> splat(T value) {
3567     (void)value;
3568     return (__m128i)vec_splats(value);
3569   }
zerosimdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3570   static simdutf_really_inline simd8<T> zero() { return splat(0); }
loadsimdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3571   static simdutf_really_inline simd8<T> load(const T values[16]) {
3572     return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
3573   }
3574   // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3575   static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
3576                                                    T v5, T v6, T v7, T v8, T v9,
3577                                                    T v10, T v11, T v12, T v13,
3578                                                    T v14, T v15) {
3579     return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
3580                     v14, v15);
3581   }
3582 
base8_numericsimdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3583   simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3584   simdutf_really_inline base8_numeric(const __m128i _value)
3585       : base8<T>(_value) {}
3586 
3587   // Store to array
storesimdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3588   simdutf_really_inline void store(T dst[16]) const {
3589     vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
3590   }
3591 
3592   // Override to distinguish from bool version
operator ~simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3593   simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
3594 
3595   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3596   simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
3597     return (__m128i)((__m128i)this->value + (__m128i)other);
3598   }
operator -simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3599   simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
3600     return (__m128i)((__m128i)this->value - (__m128i)other);
3601   }
operator +=simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3602   simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
3603     *this = *this + other;
3604     return *static_cast<simd8<T> *>(this);
3605   }
operator -=simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3606   simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
3607     *this = *this - other;
3608     return *static_cast<simd8<T> *>(this);
3609   }
3610 
3611   // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
3612   // for out of range values)
3613   template <typename L>
lookup_16simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3614   simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
3615     return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
3616   }
3617 
3618   template <typename L>
3619   simdutf_really_inline simd8<L>
lookup_16simdutf::ppc64::__anone55652eb0f11::simd::base8_numeric3620   lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
3621             L replace5, L replace6, L replace7, L replace8, L replace9,
3622             L replace10, L replace11, L replace12, L replace13, L replace14,
3623             L replace15) const {
3624     return lookup_16(simd8<L>::repeat_16(
3625         replace0, replace1, replace2, replace3, replace4, replace5, replace6,
3626         replace7, replace8, replace9, replace10, replace11, replace12,
3627         replace13, replace14, replace15));
3628   }
3629 };
3630 
3631 // Signed bytes
3632 template <> struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83633   simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83634   simdutf_really_inline simd8(const __m128i _value)
3635       : base8_numeric<int8_t>(_value) {}
3636 
3637   // Splat constructor
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83638   simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
3639   // Array constructor
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83640   simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
3641   // Member-by-member initialization
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83642   simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
3643                                int8_t v4, int8_t v5, int8_t v6, int8_t v7,
3644                                int8_t v8, int8_t v9, int8_t v10, int8_t v11,
3645                                int8_t v12, int8_t v13, int8_t v14, int8_t v15)
3646       : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
3647                                               v8, v9, v10, v11, v12, v13, v14,
3648                                               v15}) {}
3649   // Repeat 16 values as many times as necessary (usually for lookup tables)
3650   simdutf_really_inline static simd8<int8_t>
repeat_16simdutf::ppc64::__anone55652eb0f11::simd::simd83651   repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
3652             int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
3653             int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
3654     return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
3655                          v13, v14, v15);
3656   }
3657 
3658   // Order-sensitive comparisons
3659   simdutf_really_inline simd8<int8_t>
max_valsimdutf::ppc64::__anone55652eb0f11::simd::simd83660   max_val(const simd8<int8_t> other) const {
3661     return (__m128i)vec_max((__vector signed char)this->value,
3662                             (__vector signed char)(__m128i)other);
3663   }
3664   simdutf_really_inline simd8<int8_t>
min_valsimdutf::ppc64::__anone55652eb0f11::simd::simd83665   min_val(const simd8<int8_t> other) const {
3666     return (__m128i)vec_min((__vector signed char)this->value,
3667                             (__vector signed char)(__m128i)other);
3668   }
3669   simdutf_really_inline simd8<bool>
operator >simdutf::ppc64::__anone55652eb0f11::simd::simd83670   operator>(const simd8<int8_t> other) const {
3671     return (__m128i)vec_cmpgt((__vector signed char)this->value,
3672                               (__vector signed char)(__m128i)other);
3673   }
3674   simdutf_really_inline simd8<bool>
operator <simdutf::ppc64::__anone55652eb0f11::simd::simd83675   operator<(const simd8<int8_t> other) const {
3676     return (__m128i)vec_cmplt((__vector signed char)this->value,
3677                               (__vector signed char)(__m128i)other);
3678   }
3679 };
3680 
3681 // Unsigned bytes
3682 template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83683   simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83684   simdutf_really_inline simd8(const __m128i _value)
3685       : base8_numeric<uint8_t>(_value) {}
3686   // Splat constructor
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83687   simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
3688   // Array constructor
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83689   simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
3690   // Member-by-member initialization
3691   simdutf_really_inline
simd8simdutf::ppc64::__anone55652eb0f11::simd::simd83692   simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
3693         uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
3694         uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
3695       : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
3696                         v13, v14, v15}) {}
3697   // Repeat 16 values as many times as necessary (usually for lookup tables)
3698   simdutf_really_inline static simd8<uint8_t>
repeat_16simdutf::ppc64::__anone55652eb0f11::simd::simd83699   repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
3700             uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
3701             uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
3702             uint8_t v15) {
3703     return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
3704                           v13, v14, v15);
3705   }
3706 
3707   // Saturated math
3708   simdutf_really_inline simd8<uint8_t>
saturating_addsimdutf::ppc64::__anone55652eb0f11::simd::simd83709   saturating_add(const simd8<uint8_t> other) const {
3710     return (__m128i)vec_adds(this->value, (__m128i)other);
3711   }
3712   simdutf_really_inline simd8<uint8_t>
saturating_subsimdutf::ppc64::__anone55652eb0f11::simd::simd83713   saturating_sub(const simd8<uint8_t> other) const {
3714     return (__m128i)vec_subs(this->value, (__m128i)other);
3715   }
3716 
3717   // Order-specific operations
3718   simdutf_really_inline simd8<uint8_t>
max_valsimdutf::ppc64::__anone55652eb0f11::simd::simd83719   max_val(const simd8<uint8_t> other) const {
3720     return (__m128i)vec_max(this->value, (__m128i)other);
3721   }
3722   simdutf_really_inline simd8<uint8_t>
min_valsimdutf::ppc64::__anone55652eb0f11::simd::simd83723   min_val(const simd8<uint8_t> other) const {
3724     return (__m128i)vec_min(this->value, (__m128i)other);
3725   }
3726   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
3727   simdutf_really_inline simd8<uint8_t>
gt_bitssimdutf::ppc64::__anone55652eb0f11::simd::simd83728   gt_bits(const simd8<uint8_t> other) const {
3729     return this->saturating_sub(other);
3730   }
3731   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
3732   simdutf_really_inline simd8<uint8_t>
lt_bitssimdutf::ppc64::__anone55652eb0f11::simd::simd83733   lt_bits(const simd8<uint8_t> other) const {
3734     return other.saturating_sub(*this);
3735   }
3736   simdutf_really_inline simd8<bool>
operator <=simdutf::ppc64::__anone55652eb0f11::simd::simd83737   operator<=(const simd8<uint8_t> other) const {
3738     return other.max_val(*this) == other;
3739   }
3740   simdutf_really_inline simd8<bool>
operator >=simdutf::ppc64::__anone55652eb0f11::simd::simd83741   operator>=(const simd8<uint8_t> other) const {
3742     return other.min_val(*this) == other;
3743   }
3744   simdutf_really_inline simd8<bool>
operator >simdutf::ppc64::__anone55652eb0f11::simd::simd83745   operator>(const simd8<uint8_t> other) const {
3746     return this->gt_bits(other).any_bits_set();
3747   }
3748   simdutf_really_inline simd8<bool>
operator <simdutf::ppc64::__anone55652eb0f11::simd::simd83749   operator<(const simd8<uint8_t> other) const {
3750     return this->gt_bits(other).any_bits_set();
3751   }
3752 
3753   // Bit-specific operations
bits_not_setsimdutf::ppc64::__anone55652eb0f11::simd::simd83754   simdutf_really_inline simd8<bool> bits_not_set() const {
3755     return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
3756   }
bits_not_setsimdutf::ppc64::__anone55652eb0f11::simd::simd83757   simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
3758     return (*this & bits).bits_not_set();
3759   }
any_bits_setsimdutf::ppc64::__anone55652eb0f11::simd::simd83760   simdutf_really_inline simd8<bool> any_bits_set() const {
3761     return ~this->bits_not_set();
3762   }
any_bits_setsimdutf::ppc64::__anone55652eb0f11::simd::simd83763   simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
3764     return ~this->bits_not_set(bits);
3765   }
3766 
is_asciisimdutf::ppc64::__anone55652eb0f11::simd::simd83767   simdutf_really_inline bool is_ascii() const {
3768       return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
3769   }
3770 
bits_not_set_anywheresimdutf::ppc64::__anone55652eb0f11::simd::simd83771   simdutf_really_inline bool bits_not_set_anywhere() const {
3772     return vec_all_eq(this->value, (__m128i)vec_splats(0));
3773   }
any_bits_set_anywheresimdutf::ppc64::__anone55652eb0f11::simd::simd83774   simdutf_really_inline bool any_bits_set_anywhere() const {
3775     return !bits_not_set_anywhere();
3776   }
bits_not_set_anywheresimdutf::ppc64::__anone55652eb0f11::simd::simd83777   simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
3778     return vec_all_eq(vec_and(this->value, (__m128i)bits),
3779                       (__m128i)vec_splats(0));
3780   }
any_bits_set_anywheresimdutf::ppc64::__anone55652eb0f11::simd::simd83781   simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
3782     return !bits_not_set_anywhere(bits);
3783   }
shrsimdutf::ppc64::__anone55652eb0f11::simd::simd83784   template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
3785     return simd8<uint8_t>(
3786         (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
3787   }
shlsimdutf::ppc64::__anone55652eb0f11::simd::simd83788   template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
3789     return simd8<uint8_t>(
3790         (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
3791   }
3792 };
3793 
3794 template <typename T> struct simd8x64 {
3795   static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
3796   static_assert(NUM_CHUNKS == 4,
3797                 "PPC64 kernel should use four registers per 64-byte block.");
3798   simd8<T> chunks[NUM_CHUNKS];
3799 
3800   simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
3801   simd8x64<T> &
3802   operator=(const simd8<T> other) = delete; // no assignment allowed
3803   simd8x64() = delete;                      // no default constructor allowed
3804 
simd8x64simdutf::ppc64::__anone55652eb0f11::simd::simd8x643805   simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
3806                                   const simd8<T> chunk2, const simd8<T> chunk3)
3807       : chunks{chunk0, chunk1, chunk2, chunk3} {}
3808 
simd8x64simdutf::ppc64::__anone55652eb0f11::simd::simd8x643809   simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
3810 
storesimdutf::ppc64::__anone55652eb0f11::simd::simd8x643811   simdutf_really_inline void store(T* ptr) const {
3812     this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
3813     this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
3814     this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
3815     this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
3816   }
3817 
3818 
operator |=simdutf::ppc64::__anone55652eb0f11::simd::simd8x643819   simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
3820       this->chunks[0] |= other.chunks[0];
3821       this->chunks[1] |= other.chunks[1];
3822       this->chunks[2] |= other.chunks[2];
3823       this->chunks[3] |= other.chunks[3];
3824       return *this;
3825     }
3826 
reduce_orsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643827   simdutf_really_inline simd8<T> reduce_or() const {
3828     return (this->chunks[0] | this->chunks[1]) |
3829            (this->chunks[2] | this->chunks[3]);
3830   }
3831 
3832 
is_asciisimdutf::ppc64::__anone55652eb0f11::simd::simd8x643833   simdutf_really_inline bool is_ascii() const {
3834     return input.reduce_or().is_ascii();
3835   }
3836 
to_bitmasksimdutf::ppc64::__anone55652eb0f11::simd::simd8x643837   simdutf_really_inline uint64_t to_bitmask() const {
3838     uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
3839     uint64_t r1 = this->chunks[1].to_bitmask();
3840     uint64_t r2 = this->chunks[2].to_bitmask();
3841     uint64_t r3 = this->chunks[3].to_bitmask();
3842     return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
3843   }
3844 
eqsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643845   simdutf_really_inline uint64_t eq(const T m) const {
3846     const simd8<T> mask = simd8<T>::splat(m);
3847     return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
3848                           this->chunks[2] == mask, this->chunks[3] == mask)
3849         .to_bitmask();
3850   }
3851 
eqsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643852   simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
3853     return simd8x64<bool>(this->chunks[0] == other.chunks[0],
3854                           this->chunks[1] == other.chunks[1],
3855                           this->chunks[2] == other.chunks[2],
3856                           this->chunks[3] == other.chunks[3])
3857         .to_bitmask();
3858   }
3859 
lteqsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643860   simdutf_really_inline uint64_t lteq(const T m) const {
3861     const simd8<T> mask = simd8<T>::splat(m);
3862     return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
3863                           this->chunks[2] <= mask, this->chunks[3] <= mask)
3864         .to_bitmask();
3865   }
3866 
in_rangesimdutf::ppc64::__anone55652eb0f11::simd::simd8x643867   simdutf_really_inline uint64_t in_range(const T low, const T high) const {
3868       const simd8<T> mask_low = simd8<T>::splat(low);
3869       const simd8<T> mask_high = simd8<T>::splat(high);
3870 
3871       return  simd8x64<bool>(
3872         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
3873         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
3874         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
3875         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
3876       ).to_bitmask();
3877   }
not_in_rangesimdutf::ppc64::__anone55652eb0f11::simd::simd8x643878   simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
3879       const simd8<T> mask_low = simd8<T>::splat(low);
3880       const simd8<T> mask_high = simd8<T>::splat(high);
3881       return  simd8x64<bool>(
3882         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
3883         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
3884         (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
3885         (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
3886       ).to_bitmask();
3887   }
ltsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643888   simdutf_really_inline uint64_t lt(const T m) const {
3889     const simd8<T> mask = simd8<T>::splat(m);
3890     return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
3891                           this->chunks[2] < mask, this->chunks[3] < mask)
3892         .to_bitmask();
3893   }
3894 
gtsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643895   simdutf_really_inline uint64_t gt(const T m) const {
3896       const simd8<T> mask = simd8<T>::splat(m);
3897       return  simd8x64<bool>(
3898         this->chunks[0] > mask,
3899         this->chunks[1] > mask,
3900         this->chunks[2] > mask,
3901         this->chunks[3] > mask
3902       ).to_bitmask();
3903   }
gteqsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643904   simdutf_really_inline uint64_t gteq(const T m) const {
3905       const simd8<T> mask = simd8<T>::splat(m);
3906       return  simd8x64<bool>(
3907         this->chunks[0] >= mask,
3908         this->chunks[1] >= mask,
3909         this->chunks[2] >= mask,
3910         this->chunks[3] >= mask
3911       ).to_bitmask();
3912   }
gteq_unsignedsimdutf::ppc64::__anone55652eb0f11::simd::simd8x643913   simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
3914       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
3915       return  simd8x64<bool>(
3916         simd8<uint8_t>(this->chunks[0]) >= mask,
3917         simd8<uint8_t>(this->chunks[1]) >= mask,
3918         simd8<uint8_t>(this->chunks[2]) >= mask,
3919         simd8<uint8_t>(this->chunks[3]) >= mask
3920       ).to_bitmask();
3921   }
3922 }; // struct simd8x64<T>
3923 
3924 } // namespace simd
3925 } // unnamed namespace
3926 } // namespace ppc64
3927 } // namespace simdutf
3928 
#endif // SIMDUTF_PPC64_SIMD_H
3930 /* end file src/simdutf/ppc64/simd.h */
3931 
3932 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
3933 /* begin file src/simdutf/ppc64/end.h */
3934 /* end file src/simdutf/ppc64/end.h */
3935 
3936 #endif // SIMDUTF_IMPLEMENTATION_PPC64
3937 
3938 #endif // SIMDUTF_PPC64_H
3939 /* end file src/simdutf/ppc64.h */
3940 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback.h
3941 /* begin file src/simdutf/fallback.h */
3942 #ifndef SIMDUTF_FALLBACK_H
3943 #define SIMDUTF_FALLBACK_H
3944 
3945 
3946 // Note that fallback.h is always imported last.
3947 
// Enable the fallback by default unless a built-in implementation that can always run has already been selected.
3949 #ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
3950 #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || SIMDUTF_CAN_ALWAYS_RUN_PPC64
3951 #define SIMDUTF_IMPLEMENTATION_FALLBACK 0
3952 #else
3953 #define SIMDUTF_IMPLEMENTATION_FALLBACK 1
3954 #endif
3955 #endif
3956 
3957 #define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
3958 
3959 #if SIMDUTF_IMPLEMENTATION_FALLBACK
3960 
3961 namespace simdutf {
3962 /**
3963  * Fallback implementation (runs on any machine).
3964  */
3965 namespace fallback {
3966 } // namespace fallback
3967 } // namespace simdutf
3968 
3969 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h
3970 /* begin file src/simdutf/fallback/implementation.h */
3971 #ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
3972 #define SIMDUTF_FALLBACK_IMPLEMENTATION_H
3973 
3974 
3975 namespace simdutf {
3976 namespace fallback {
3977 
3978 namespace {
3979 using namespace simdutf;
3980 }
3981 
3982 class implementation final : public simdutf::implementation {
3983 public:
implementation()3984   simdutf_really_inline implementation() : simdutf::implementation(
3985       "fallback",
3986       "Generic fallback implementation",
3987       0
3988   ) {}
3989   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
3990   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
3991   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
3992   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
3993   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
3994   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
3995   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
3996   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
3997   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
3998   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
3999   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
4000   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
4001   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
4002   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
4003   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
4004   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4005   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4006   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
4007   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
4008   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
4009   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4010   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4011   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4012   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4013   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4014   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4015   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4016   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4017   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
4018   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4019   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4020   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4021   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4022   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4023   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
4024   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
4025   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
4026   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
4027   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
4028   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
4029   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
4030   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
4031   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
4032   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
4033   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
4034   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
4035   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
4036   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
4037   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
4038   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
4039   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
4040   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
4041   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
4042 };
4043 
4044 } // namespace fallback
4045 } // namespace simdutf
4046 
4047 #endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
4048 /* end file src/simdutf/fallback/implementation.h */
4049 
4050 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
4051 /* begin file src/simdutf/fallback/begin.h */
4052 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
4053 // #define SIMDUTF_IMPLEMENTATION fallback
4054 /* end file src/simdutf/fallback/begin.h */
4055 
4056 // Declarations
4057 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h
4058 /* begin file src/simdutf/fallback/bitmanipulation.h */
4059 #ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
4060 #define SIMDUTF_FALLBACK_BITMANIPULATION_H
4061 
4062 #include <limits>
4063 
4064 namespace simdutf {
4065 namespace fallback {
4066 namespace {
4067 
4068 #if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
_BitScanForward64(unsigned long * ret,uint64_t x)4069 static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
4070   unsigned long x0 = (unsigned long)x, top, bottom;
4071   _BitScanForward(&top, (unsigned long)(x >> 32));
4072   _BitScanForward(&bottom, x0);
4073   *ret = x0 ? bottom : 32 + top;
4074   return x != 0;
4075 }
_BitScanReverse64(unsigned long * ret,uint64_t x)4076 static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
4077   unsigned long x1 = (unsigned long)(x >> 32), top, bottom;
4078   _BitScanReverse(&top, x1);
4079   _BitScanReverse(&bottom, (unsigned long)x);
4080   *ret = x1 ? top + 32 : bottom;
4081   return x != 0;
4082 }
4083 #endif
4084 
4085 } // unnamed namespace
4086 } // namespace fallback
4087 } // namespace simdutf
4088 
4089 #endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
4090 /* end file src/simdutf/fallback/bitmanipulation.h */
4091 
4092 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
4093 /* begin file src/simdutf/fallback/end.h */
4094 /* end file src/simdutf/fallback/end.h */
4095 
4096 #endif // SIMDUTF_IMPLEMENTATION_FALLBACK
4097 #endif // SIMDUTF_FALLBACK_H
4098 /* end file src/simdutf/fallback.h */
4099 
4100 namespace simdutf {
supported_by_runtime_system() const4101 bool implementation::supported_by_runtime_system() const {
4102   uint32_t required_instruction_sets = this->required_instruction_sets();
4103   uint32_t supported_instruction_sets = internal::detect_supported_architectures();
4104   return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
4105 }
4106 
autodetect_encoding(const char * input,size_t length) const4107 simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
4108     // If there is a BOM, then we trust it.
4109     auto bom_encoding = simdutf::BOM::check_bom(input, length);
4110     if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
4111     // UTF8 is common, it includes ASCII, and is commonly represented
4112     // without a BOM, so if it fits, go with that. Note that it is still
4113     // possible to get it wrong, we are only 'guessing'. If some has UTF-16
4114     // data without a BOM, it could pass as UTF-8.
4115     //
4116     // An interesting twist might be to check for UTF-16 ASCII first (every
4117     // other byte is zero).
4118     if(validate_utf8(input, length)) { return encoding_type::UTF8; }
4119     // The next most common encoding that might appear without BOM is probably
4120     // UTF-16LE, so try that next.
4121     if((length % 2) == 0) {
4122       // important: we need to divide by two
4123       if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { return encoding_type::UTF16_LE; }
4124     }
4125     if((length % 4) == 0) {
4126       if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { return encoding_type::UTF32_LE; }
4127     }
4128     return encoding_type::unspecified;
4129 }
4130 
4131 namespace internal {
4132 
4133 // Static array of known implementations. We're hoping these get baked into the executable
4134 // without requiring a static initializer.
4135 
4136 
// One const instance per implementation that was compiled in. Each definition
// is guarded by the matching SIMDUTF_IMPLEMENTATION_* switch, so a disabled
// implementation contributes no object at all. The addresses of these objects
// are collected in available_implementation_pointers further down.
4137 #if SIMDUTF_IMPLEMENTATION_ICELAKE
4138 const icelake::implementation icelake_singleton{};
4139 #endif
4140 #if SIMDUTF_IMPLEMENTATION_HASWELL
4141 const haswell::implementation haswell_singleton{};
4142 #endif
4143 #if SIMDUTF_IMPLEMENTATION_WESTMERE
4144 const westmere::implementation westmere_singleton{};
4145 #endif
4146 #if SIMDUTF_IMPLEMENTATION_ARM64
4147 const arm64::implementation arm64_singleton{};
4148 #endif
4149 #if SIMDUTF_IMPLEMENTATION_PPC64
4150 const ppc64::implementation ppc64_singleton{};
4151 #endif
4152 #if SIMDUTF_IMPLEMENTATION_FALLBACK
4153 const fallback::implementation fallback_singleton{};
4154 #endif
4155 
4156 /**
4157  * @private Detects best supported implementation on first use, and sets it
4158  */
4159 class detect_best_supported_implementation_on_first_use final : public implementation {
4160 public:
name() const4161   const std::string &name() const noexcept final { return set_best()->name(); }
description() const4162   const std::string &description() const noexcept final { return set_best()->description(); }
required_instruction_sets() const4163   uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
4164 
detect_encodings(const char * input,size_t length) const4165   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override {
4166     return set_best()->detect_encodings(input, length);
4167   }
4168 
validate_utf8(const char * buf,size_t len) const4169   simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
4170     return set_best()->validate_utf8(buf, len);
4171   }
4172 
validate_utf8_with_errors(const char * buf,size_t len) const4173   simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override {
4174     return set_best()->validate_utf8_with_errors(buf, len);
4175   }
4176 
validate_ascii(const char * buf,size_t len) const4177   simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override {
4178     return set_best()->validate_ascii(buf, len);
4179   }
4180 
validate_ascii_with_errors(const char * buf,size_t len) const4181   simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override {
4182     return set_best()->validate_ascii_with_errors(buf, len);
4183   }
4184 
validate_utf16le(const char16_t * buf,size_t len) const4185   simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override {
4186     return set_best()->validate_utf16le(buf, len);
4187   }
4188 
validate_utf16be(const char16_t * buf,size_t len) const4189   simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override {
4190     return set_best()->validate_utf16be(buf, len);
4191   }
4192 
validate_utf16le_with_errors(const char16_t * buf,size_t len) const4193   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override {
4194     return set_best()->validate_utf16le_with_errors(buf, len);
4195   }
4196 
validate_utf16be_with_errors(const char16_t * buf,size_t len) const4197   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override {
4198     return set_best()->validate_utf16be_with_errors(buf, len);
4199   }
4200 
validate_utf32(const char32_t * buf,size_t len) const4201   simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override {
4202     return set_best()->validate_utf32(buf, len);
4203   }
4204 
validate_utf32_with_errors(const char32_t * buf,size_t len) const4205   simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override {
4206     return set_best()->validate_utf32_with_errors(buf, len);
4207   }
4208 
convert_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const4209   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4210     return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
4211   }
4212 
convert_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const4213   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4214     return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
4215   }
4216 
convert_utf8_to_utf16le_with_errors(const char * buf,size_t len,char16_t * utf16_output) const4217   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4218     return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
4219   }
4220 
convert_utf8_to_utf16be_with_errors(const char * buf,size_t len,char16_t * utf16_output) const4221   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4222     return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
4223   }
4224 
convert_valid_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const4225   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4226     return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
4227   }
4228 
convert_valid_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const4229   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4230     return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
4231   }
4232 
convert_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_output) const4233   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4234     return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
4235   }
4236 
convert_utf8_to_utf32_with_errors(const char * buf,size_t len,char32_t * utf32_output) const4237   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4238     return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
4239   }
4240 
convert_valid_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_output) const4241   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4242     return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
4243   }
4244 
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const4245   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4246     return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
4247   }
4248 
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const4249   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4250     return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
4251   }
4252 
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const4253   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4254     return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
4255   }
4256 
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const4257   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4258     return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
4259   }
4260 
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const4261   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4262     return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
4263   }
4264 
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const4265   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4266     return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
4267   }
4268 
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const4269   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
4270     return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
4271   }
4272 
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output) const4273   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
4274     return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
4275   }
4276 
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const4277   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
4278     return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
4279   }
4280 
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const4281   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4282     return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
4283   }
4284 
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const4285   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4286     return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
4287   }
4288 
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const4289   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4290     return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
4291   }
4292 
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const4293   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4294     return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
4295   }
4296 
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const4297   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4298     return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
4299   }
4300 
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const4301   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4302     return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
4303   }
4304 
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const4305   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4306     return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
4307   }
4308 
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const4309   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4310     return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
4311   }
4312 
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const4313   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4314     return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
4315   }
4316 
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const4317   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4318     return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
4319   }
4320 
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const4321   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4322     return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
4323   }
4324 
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const4325   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4326     return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
4327   }
4328 
change_endianness_utf16(const char16_t * buf,size_t len,char16_t * output) const4329   void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override {
4330     set_best()->change_endianness_utf16(buf, len, output);
4331   }
4332 
count_utf16le(const char16_t * buf,size_t len) const4333   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override {
4334     return set_best()->count_utf16le(buf, len);
4335   }
4336 
count_utf16be(const char16_t * buf,size_t len) const4337   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override {
4338     return set_best()->count_utf16be(buf, len);
4339   }
4340 
count_utf8(const char * buf,size_t len) const4341   simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override {
4342     return set_best()->count_utf8(buf, len);
4343   }
4344 
utf8_length_from_utf16le(const char16_t * buf,size_t len) const4345   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
4346     return set_best()->utf8_length_from_utf16le(buf, len);
4347   }
4348 
utf8_length_from_utf16be(const char16_t * buf,size_t len) const4349   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
4350     return set_best()->utf8_length_from_utf16be(buf, len);
4351   }
4352 
utf32_length_from_utf16le(const char16_t * buf,size_t len) const4353   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
4354     return set_best()->utf32_length_from_utf16le(buf, len);
4355   }
4356 
utf32_length_from_utf16be(const char16_t * buf,size_t len) const4357   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
4358     return set_best()->utf32_length_from_utf16be(buf, len);
4359   }
4360 
utf16_length_from_utf8(const char * buf,size_t len) const4361   simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override {
4362     return set_best()->utf16_length_from_utf8(buf, len);
4363   }
4364 
utf8_length_from_utf32(const char32_t * buf,size_t len) const4365   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
4366     return set_best()->utf8_length_from_utf32(buf, len);
4367   }
4368 
utf16_length_from_utf32(const char32_t * buf,size_t len) const4369   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
4370     return set_best()->utf16_length_from_utf32(buf, len);
4371   }
4372 
utf32_length_from_utf8(const char * buf,size_t len) const4373   simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override {
4374     return set_best()->utf32_length_from_utf8(buf, len);
4375   }
4376 
detect_best_supported_implementation_on_first_use()4377   simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
4378 
4379 private:
4380   const implementation *set_best() const noexcept;
4381 };
4382 
4383 
// Pointers to every implementation singleton compiled into this build, listed
// in the fixed order icelake, haswell, westmere, arm64, ppc64, fallback.
// Entries whose SIMDUTF_IMPLEMENTATION_* switch is off are left out entirely.
// NOTE(review): the order presumably runs from most to least preferred;
// confirm against the code that walks this list.
4384 const std::initializer_list<const implementation *> available_implementation_pointers {
4385 #if SIMDUTF_IMPLEMENTATION_ICELAKE
4386   &icelake_singleton,
4387 #endif
4388 #if SIMDUTF_IMPLEMENTATION_HASWELL
4389   &haswell_singleton,
4390 #endif
4391 #if SIMDUTF_IMPLEMENTATION_WESTMERE
4392   &westmere_singleton,
4393 #endif
4394 #if SIMDUTF_IMPLEMENTATION_ARM64
4395   &arm64_singleton,
4396 #endif
4397 #if SIMDUTF_IMPLEMENTATION_PPC64
4398   &ppc64_singleton,
4399 #endif
4400 #if SIMDUTF_IMPLEMENTATION_FALLBACK
4401   &fallback_singleton,
4402 #endif
4403 }; // available_implementation_pointers
4404 
4405 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
4406 class unsupported_implementation final : public implementation {
4407 public:
detect_encodings(const char *,size_t) const4408   simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override {
4409     return encoding_type::unspecified;
4410   }
4411 
validate_utf8(const char *,size_t) const4412   simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
4413     return false; // Just refuse to validate. Given that we have a fallback implementation
4414     // it seems unlikely that unsupported_implementation will ever be used. If it is used,
4415     // then it will flag all strings as invalid. The alternative is to return an error_code
4416     // from which the user has to figure out whether the string is valid UTF-8... which seems
4417     // like a lot of work just to handle the very unlikely case that we have an unsupported
4418     // implementation. And, when it does happen (that we have an unsupported implementation),
4419     // what are the chances that the programmer has a fallback? Given that *we* provide the
4420     // fallback, it implies that the programmer would need a fallback for our fallback.
4421   }
4422 
validate_utf8_with_errors(const char *,size_t) const4423   simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override {
4424     return result(error_code::OTHER, 0);
4425   }
4426 
validate_ascii(const char *,size_t) const4427   simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override {
4428     return false;
4429   }
4430 
validate_ascii_with_errors(const char *,size_t) const4431   simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override {
4432     return result(error_code::OTHER, 0);
4433   }
4434 
validate_utf16le(const char16_t *,size_t) const4435   simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override {
4436     return false;
4437   }
4438 
validate_utf16be(const char16_t *,size_t) const4439   simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override {
4440     return false;
4441   }
4442 
validate_utf16le_with_errors(const char16_t *,size_t) const4443   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override {
4444     return result(error_code::OTHER, 0);
4445   }
4446 
validate_utf16be_with_errors(const char16_t *,size_t) const4447   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override {
4448     return result(error_code::OTHER, 0);
4449   }
4450 
validate_utf32(const char32_t *,size_t) const4451   simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override {
4452     return false;
4453   }
4454 
validate_utf32_with_errors(const char32_t *,size_t) const4455   simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override {
4456     return result(error_code::OTHER, 0);
4457   }
4458 
convert_utf8_to_utf16le(const char *,size_t,char16_t *) const4459   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
4460     return 0;
4461   }
4462 
convert_utf8_to_utf16be(const char *,size_t,char16_t *) const4463   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
4464     return 0;
4465   }
4466 
convert_utf8_to_utf16le_with_errors(const char *,size_t,char16_t *) const4467   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override {
4468     return result(error_code::OTHER, 0);
4469   }
4470 
convert_utf8_to_utf16be_with_errors(const char *,size_t,char16_t *) const4471   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override {
4472     return result(error_code::OTHER, 0);
4473   }
4474 
convert_valid_utf8_to_utf16le(const char *,size_t,char16_t *) const4475   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
4476     return 0;
4477   }
4478 
convert_valid_utf8_to_utf16be(const char *,size_t,char16_t *) const4479   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
4480     return 0;
4481   }
4482 
convert_utf8_to_utf32(const char *,size_t,char32_t *) const4483   simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
4484     return 0;
4485   }
4486 
convert_utf8_to_utf32_with_errors(const char *,size_t,char32_t *) const4487   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override {
4488     return result(error_code::OTHER, 0);
4489   }
4490 
convert_valid_utf8_to_utf32(const char *,size_t,char32_t *) const4491   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
4492     return 0;
4493   }
4494 
convert_utf16le_to_utf8(const char16_t *,size_t,char *) const4495   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
4496     return 0;
4497   }
4498 
convert_utf16be_to_utf8(const char16_t *,size_t,char *) const4499   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
4500     return 0;
4501   }
4502 
convert_utf16le_to_utf8_with_errors(const char16_t *,size_t,char *) const4503   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
4504     return result(error_code::OTHER, 0);
4505   }
4506 
convert_utf16be_to_utf8_with_errors(const char16_t *,size_t,char *) const4507   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
4508     return result(error_code::OTHER, 0);
4509   }
4510 
convert_valid_utf16le_to_utf8(const char16_t *,size_t,char *) const4511   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
4512     return 0;
4513   }
4514 
convert_valid_utf16be_to_utf8(const char16_t *,size_t,char *) const4515   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
4516     return 0;
4517   }
4518 
convert_utf32_to_utf8(const char32_t *,size_t,char *) const4519   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
4520     return 0;
4521   }
4522 
convert_utf32_to_utf8_with_errors(const char32_t *,size_t,char *) const4523   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override {
4524     return result(error_code::OTHER, 0);
4525   }
4526 
convert_valid_utf32_to_utf8(const char32_t *,size_t,char *) const4527   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
4528     return 0;
4529   }
4530 
convert_utf32_to_utf16le(const char32_t *,size_t,char16_t *) const4531   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
4532     return 0;
4533   }
4534 
convert_utf32_to_utf16be(const char32_t *,size_t,char16_t *) const4535   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
4536     return 0;
4537   }
4538 
convert_utf32_to_utf16le_with_errors(const char32_t *,size_t,char16_t *) const4539   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
4540     return result(error_code::OTHER, 0);
4541   }
4542 
convert_utf32_to_utf16be_with_errors(const char32_t *,size_t,char16_t *) const4543   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
4544     return result(error_code::OTHER, 0);
4545   }
4546 
convert_valid_utf32_to_utf16le(const char32_t *,size_t,char16_t *) const4547   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
4548     return 0;
4549   }
4550 
convert_valid_utf32_to_utf16be(const char32_t *,size_t,char16_t *) const4551   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
4552     return 0;
4553   }
4554 
convert_utf16le_to_utf32(const char16_t *,size_t,char32_t *) const4555   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
4556     return 0;
4557   }
4558 
convert_utf16be_to_utf32(const char16_t *,size_t,char32_t *) const4559   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
4560     return 0;
4561   }
4562 
convert_utf16le_to_utf32_with_errors(const char16_t *,size_t,char32_t *) const4563   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
4564     return result(error_code::OTHER, 0);
4565   }
4566 
convert_utf16be_to_utf32_with_errors(const char16_t *,size_t,char32_t *) const4567   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
4568     return result(error_code::OTHER, 0);
4569   }
4570 
convert_valid_utf16le_to_utf32(const char16_t *,size_t,char32_t *) const4571   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
4572     return 0;
4573   }
4574 
convert_valid_utf16be_to_utf32(const char16_t *,size_t,char32_t *) const4575   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
4576     return 0;
4577   }
4578 
change_endianness_utf16(const char16_t *,size_t,char16_t *) const4579   void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override {
4580 
4581   }
4582 
count_utf16le(const char16_t *,size_t) const4583   simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override {
4584     return 0;
4585   }
4586 
count_utf16be(const char16_t *,size_t) const4587   simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override {
4588     return 0;
4589   }
4590 
count_utf8(const char *,size_t) const4591   simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override {
4592     return 0;
4593   }
4594 
utf8_length_from_utf16le(const char16_t *,size_t) const4595   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
4596     return 0;
4597   }
4598 
utf8_length_from_utf16be(const char16_t *,size_t) const4599   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
4600     return 0;
4601   }
4602 
utf32_length_from_utf16le(const char16_t *,size_t) const4603   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
4604     return 0;
4605   }
4606 
utf32_length_from_utf16be(const char16_t *,size_t) const4607   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
4608     return 0;
4609   }
4610 
utf16_length_from_utf8(const char *,size_t) const4611   simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override {
4612     return 0;
4613   }
4614 
utf8_length_from_utf32(const char32_t *,size_t) const4615   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
4616     return 0;
4617   }
4618 
utf16_length_from_utf32(const char32_t *,size_t) const4619   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
4620     return 0;
4621   }
4622 
utf32_length_from_utf8(const char *,size_t) const4623   simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override {
4624     return 0;
4625   }
4626 
unsupported_implementation()4627   unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
4628 };
4629 
4630 const unsupported_implementation unsupported_singleton{};
4631 
size() const4632 size_t available_implementation_list::size() const noexcept {
4633   return internal::available_implementation_pointers.size();
4634 }
begin() const4635 const implementation * const *available_implementation_list::begin() const noexcept {
4636   return internal::available_implementation_pointers.begin();
4637 }
end() const4638 const implementation * const *available_implementation_list::end() const noexcept {
4639   return internal::available_implementation_pointers.end();
4640 }
detect_best_supported() const4641 const implementation *available_implementation_list::detect_best_supported() const noexcept {
4642   // They are prelisted in priority order, so we just go down the list
4643   uint32_t supported_instruction_sets = internal::detect_supported_architectures();
4644   for (const implementation *impl : internal::available_implementation_pointers) {
4645     uint32_t required_instruction_sets = impl->required_instruction_sets();
4646     if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; }
4647   }
4648   return &unsupported_singleton; // this should never happen?
4649 }
4650 
set_best() const4651 const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
4652   SIMDUTF_PUSH_DISABLE_WARNINGS
4653   SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
4654   char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
4655   SIMDUTF_POP_DISABLE_WARNINGS
4656 
4657   if (force_implementation_name) {
4658     auto force_implementation = get_available_implementations()[force_implementation_name];
4659     if (force_implementation) {
4660       return get_active_implementation() = force_implementation;
4661     } else {
4662       // Note: abort() and stderr usage within the library is forbidden.
4663       return get_active_implementation() = &unsupported_singleton;
4664     }
4665   }
4666   return get_active_implementation() = get_available_implementations().detect_best_supported();
4667 }
4668 
4669 } // namespace internal
4670 
4671 
4672 
4673 /**
4674  * The list of available implementations compiled into simdutf.
4675  */
get_available_implementations()4676 SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() {
4677   static const internal::available_implementation_list available_implementations{};
4678   return available_implementations;
4679 }
4680 
4681 /**
4682   * The active implementation.
4683   */
get_active_implementation()4684 SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
4685     static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
4686     static internal::atomic_ptr<const implementation> active_implementation{&detect_best_supported_implementation_on_first_use_singleton};
4687     return active_implementation;
4688 }
4689 
validate_utf8(const char * buf,size_t len)4690 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
4691   return get_active_implementation()->validate_utf8(buf, len);
4692 }
validate_utf8_with_errors(const char * buf,size_t len)4693 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept {
4694   return get_active_implementation()->validate_utf8_with_errors(buf, len);
4695 }
validate_ascii(const char * buf,size_t len)4696 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
4697   return get_active_implementation()->validate_ascii(buf, len);
4698 }
validate_ascii_with_errors(const char * buf,size_t len)4699 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept {
4700   return get_active_implementation()->validate_ascii_with_errors(buf, len);
4701 }
convert_utf8_to_utf16(const char * input,size_t length,char16_t * utf16_output)4702 simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
4703   #if SIMDUTF_IS_BIG_ENDIAN
4704   return convert_utf8_to_utf16be(input, length, utf16_output);
4705   #else
4706   return convert_utf8_to_utf16le(input, length, utf16_output);
4707   #endif
4708 }
convert_utf8_to_utf16le(const char * input,size_t length,char16_t * utf16_output)4709 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
4710   return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
4711 }
convert_utf8_to_utf16be(const char * input,size_t length,char16_t * utf16_output)4712 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
4713   return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
4714 }
convert_utf8_to_utf16_with_errors(const char * input,size_t length,char16_t * utf16_output)4715 simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
4716   #if SIMDUTF_IS_BIG_ENDIAN
4717   return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
4718   #else
4719   return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
4720   #endif
4721 }
convert_utf8_to_utf16le_with_errors(const char * input,size_t length,char16_t * utf16_output)4722 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
4723   return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
4724 }
convert_utf8_to_utf16be_with_errors(const char * input,size_t length,char16_t * utf16_output)4725 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
4726   return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
4727 }
convert_utf8_to_utf32(const char * input,size_t length,char32_t * utf32_output)4728 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
4729   return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
4730 }
convert_utf8_to_utf32_with_errors(const char * input,size_t length,char32_t * utf32_output)4731 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
4732   return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
4733 }
validate_utf16(const char16_t * buf,size_t len)4734 simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept {
4735   #if SIMDUTF_IS_BIG_ENDIAN
4736   return validate_utf16be(buf, len);
4737   #else
4738   return validate_utf16le(buf, len);
4739   #endif
4740 }
validate_utf16le(const char16_t * buf,size_t len)4741 simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept {
4742   return get_active_implementation()->validate_utf16le(buf, len);
4743 }
validate_utf16be(const char16_t * buf,size_t len)4744 simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept {
4745   return get_active_implementation()->validate_utf16be(buf, len);
4746 }
validate_utf16_with_errors(const char16_t * buf,size_t len)4747 simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept {
4748   #if SIMDUTF_IS_BIG_ENDIAN
4749   return validate_utf16be_with_errors(buf, len);
4750   #else
4751   return validate_utf16le_with_errors(buf, len);
4752   #endif
4753 }
validate_utf16le_with_errors(const char16_t * buf,size_t len)4754 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept {
4755   return get_active_implementation()->validate_utf16le_with_errors(buf, len);
4756 }
validate_utf16be_with_errors(const char16_t * buf,size_t len)4757 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept {
4758   return get_active_implementation()->validate_utf16be_with_errors(buf, len);
4759 }
validate_utf32(const char32_t * buf,size_t len)4760 simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept {
4761   return get_active_implementation()->validate_utf32(buf, len);
4762 }
validate_utf32_with_errors(const char32_t * buf,size_t len)4763 simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept {
4764   return get_active_implementation()->validate_utf32_with_errors(buf, len);
4765 }
convert_valid_utf8_to_utf16(const char * input,size_t length,char16_t * utf16_buffer)4766 simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
4767   #if SIMDUTF_IS_BIG_ENDIAN
4768   return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
4769   #else
4770   return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
4771   #endif
4772 }
convert_valid_utf8_to_utf16le(const char * input,size_t length,char16_t * utf16_buffer)4773 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
4774   return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
4775 }
convert_valid_utf8_to_utf16be(const char * input,size_t length,char16_t * utf16_buffer)4776 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
4777   return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
4778 }
convert_valid_utf8_to_utf32(const char * input,size_t length,char32_t * utf32_buffer)4779 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
4780   return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
4781 }
convert_utf16_to_utf8(const char16_t * buf,size_t len,char * utf8_buffer)4782 simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4783   #if SIMDUTF_IS_BIG_ENDIAN
4784   return convert_utf16be_to_utf8(buf, len, utf8_buffer);
4785   #else
4786   return convert_utf16le_to_utf8(buf, len, utf8_buffer);
4787   #endif
4788 }
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_buffer)4789 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4790   return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
4791 }
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_buffer)4792 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4793   return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
4794 }
convert_utf16_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_buffer)4795 simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4796   #if SIMDUTF_IS_BIG_ENDIAN
4797   return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
4798   #else
4799   return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
4800   #endif
4801 }
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_buffer)4802 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4803   return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
4804 }
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_buffer)4805 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4806   return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
4807 }
convert_valid_utf16_to_utf8(const char16_t * buf,size_t len,char * utf8_buffer)4808 simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4809   #if SIMDUTF_IS_BIG_ENDIAN
4810   return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
4811   #else
4812   return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
4813   #endif
4814 }
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_buffer)4815 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4816   return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
4817 }
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_buffer)4818 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
4819   return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
4820 }
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_buffer)4821 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
4822   return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
4823 }
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_buffer)4824 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
4825   return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
4826 }
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_buffer)4827 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
4828   return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
4829 }
convert_utf32_to_utf16(const char32_t * buf,size_t len,char16_t * utf16_buffer)4830 simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4831   #if SIMDUTF_IS_BIG_ENDIAN
4832   return convert_utf32_to_utf16be(buf, len, utf16_buffer);
4833   #else
4834   return convert_utf32_to_utf16le(buf, len, utf16_buffer);
4835   #endif
4836 }
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_buffer)4837 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4838   return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
4839 }
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_buffer)4840 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4841   return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
4842 }
convert_utf32_to_utf16_with_errors(const char32_t * buf,size_t len,char16_t * utf16_buffer)4843 simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4844   #if SIMDUTF_IS_BIG_ENDIAN
4845   return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
4846   #else
4847   return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
4848   #endif
4849 }
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_buffer)4850 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4851   return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
4852 }
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_buffer)4853 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4854   return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
4855 }
convert_valid_utf32_to_utf16(const char32_t * buf,size_t len,char16_t * utf16_buffer)4856 simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4857   #if SIMDUTF_IS_BIG_ENDIAN
4858   return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
4859   #else
4860   return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
4861   #endif
4862 }
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_buffer)4863 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4864   return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
4865 }
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_buffer)4866 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
4867   return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
4868 }
convert_utf16_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_buffer)4869 simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4870   #if SIMDUTF_IS_BIG_ENDIAN
4871   return convert_utf16be_to_utf32(buf, len, utf32_buffer);
4872   #else
4873   return convert_utf16le_to_utf32(buf, len, utf32_buffer);
4874   #endif
4875 }
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_buffer)4876 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4877   return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
4878 }
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_buffer)4879 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4880   return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
4881 }
convert_utf16_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_buffer)4882 simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4883   #if SIMDUTF_IS_BIG_ENDIAN
4884   return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
4885   #else
4886   return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
4887   #endif
4888 }
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_buffer)4889 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4890   return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
4891 }
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_buffer)4892 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4893   return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
4894 }
convert_valid_utf16_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_buffer)4895 simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4896   #if SIMDUTF_IS_BIG_ENDIAN
4897   return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
4898   #else
4899   return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
4900   #endif
4901 }
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_buffer)4902 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4903   return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
4904 }
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_buffer)4905 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
4906   return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
4907 }
change_endianness_utf16(const char16_t * input,size_t length,char16_t * output)4908 void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
4909   get_active_implementation()->change_endianness_utf16(input, length, output);
4910 }
count_utf16(const char16_t * input,size_t length)4911 simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
4912   #if SIMDUTF_IS_BIG_ENDIAN
4913   return count_utf16be(input, length);
4914   #else
4915   return count_utf16le(input, length);
4916   #endif
4917 }
count_utf16le(const char16_t * input,size_t length)4918 simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
4919   return get_active_implementation()->count_utf16le(input, length);
4920 }
count_utf16be(const char16_t * input,size_t length)4921 simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
4922   return get_active_implementation()->count_utf16be(input, length);
4923 }
count_utf8(const char * input,size_t length)4924 simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
4925   return get_active_implementation()->count_utf8(input, length);
4926 }
utf8_length_from_utf16(const char16_t * input,size_t length)4927 simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
4928   #if SIMDUTF_IS_BIG_ENDIAN
4929   return utf8_length_from_utf16be(input, length);
4930   #else
4931   return utf8_length_from_utf16le(input, length);
4932   #endif
4933 }
utf8_length_from_utf16le(const char16_t * input,size_t length)4934 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
4935   return get_active_implementation()->utf8_length_from_utf16le(input, length);
4936 }
utf8_length_from_utf16be(const char16_t * input,size_t length)4937 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
4938   return get_active_implementation()->utf8_length_from_utf16be(input, length);
4939 }
utf32_length_from_utf16(const char16_t * input,size_t length)4940 simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
4941   #if SIMDUTF_IS_BIG_ENDIAN
4942   return utf32_length_from_utf16be(input, length);
4943   #else
4944   return utf32_length_from_utf16le(input, length);
4945   #endif
4946 }
utf32_length_from_utf16le(const char16_t * input,size_t length)4947 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
4948   return get_active_implementation()->utf32_length_from_utf16le(input, length);
4949 }
utf32_length_from_utf16be(const char16_t * input,size_t length)4950 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
4951   return get_active_implementation()->utf32_length_from_utf16be(input, length);
4952 }
utf16_length_from_utf8(const char * input,size_t length)4953 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
4954   return get_active_implementation()->utf16_length_from_utf8(input, length);
4955 }
utf8_length_from_utf32(const char32_t * input,size_t length)4956 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
4957   return get_active_implementation()->utf8_length_from_utf32(input, length);
4958 }
utf16_length_from_utf32(const char32_t * input,size_t length)4959 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
4960   return get_active_implementation()->utf16_length_from_utf32(input, length);
4961 }
utf32_length_from_utf8(const char * input,size_t length)4962 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
4963   return get_active_implementation()->utf32_length_from_utf8(input, length);
4964 }
autodetect_encoding(const char * buf,size_t length)4965 simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
4966   return get_active_implementation()->autodetect_encoding(buf, length);
4967 }
detect_encodings(const char * buf,size_t length)4968 simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept {
4969   return get_active_implementation()->detect_encodings(buf, length);
4970 }
4971 
builtin_implementation()4972 const implementation * builtin_implementation() {
4973   static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
4974   return builtin_impl;
4975 }
4976 
4977 
4978 } // namespace simdutf
4979 
4980 /* end file src/implementation.cpp */
4981 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=encoding_types.cpp
4982 /* begin file src/encoding_types.cpp */
4983 
4984 namespace simdutf {
match_system(endianness e)4985 bool match_system(endianness e) {
4986 #if SIMDUTF_IS_BIG_ENDIAN
4987     return e == endianness::BIG;
4988 #else
4989     return e == endianness::LITTLE;
4990 #endif
4991 }
4992 
to_string(encoding_type bom)4993 std::string to_string(encoding_type bom) {
4994   switch (bom) {
4995       case UTF16_LE:     return "UTF16 little-endian";
4996       case UTF16_BE:     return "UTF16 big-endian";
4997       case UTF32_LE:     return "UTF32 little-endian";
4998       case UTF32_BE:     return "UTF32 big-endian";
4999       case UTF8:         return "UTF8";
5000       case unspecified:  return "unknown";
5001       default:           return "error";
5002   }
5003 }
5004 
5005 namespace BOM {
5006 // Note that BOM for UTF8 is discouraged.
check_bom(const uint8_t * byte,size_t length)5007 encoding_type check_bom(const uint8_t* byte, size_t length) {
5008         if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
5009             if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
5010                 return encoding_type::UTF32_LE;
5011             } else {
5012                 return encoding_type::UTF16_LE;
5013             }
5014         } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
5015             return encoding_type::UTF16_BE;
5016         } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
5017             return encoding_type::UTF32_BE;
5018         } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
5019             return encoding_type::UTF8;
5020         }
5021         return encoding_type::unspecified;
5022     }
5023 
check_bom(const char * byte,size_t length)5024 encoding_type check_bom(const char* byte, size_t length) {
5025       return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
5026  }
5027 
bom_byte_size(encoding_type bom)5028  size_t bom_byte_size(encoding_type bom) {
5029         switch (bom) {
5030             case UTF16_LE:     return 2;
5031             case UTF16_BE:     return 2;
5032             case UTF32_LE:     return 4;
5033             case UTF32_BE:     return 4;
5034             case UTF8:         return 3;
5035             case unspecified:  return 0;
5036             default:           return 0;
5037         }
5038 }
5039 
5040 }
5041 }
5042 /* end file src/encoding_types.cpp */
5043 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=error.cpp
5044 /* begin file src/error.cpp */
5045 namespace simdutf {
5046 
result()5047   simdutf_really_inline result::result() : error{error_code::SUCCESS}, count{0} {};
5048 
result(error_code _err,size_t _pos)5049   simdutf_really_inline result::result(error_code _err, size_t _pos) : error{_err}, count{_pos} {};
5050 
5051 }
5052 /* end file src/error.cpp */
5053 // The large tables should be included once and they
5054 // should not depend on a kernel.
5055 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h
5056 /* begin file src/tables/utf8_to_utf16_tables.h */
5057 #ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
5058 #define SIMDUTF_UTF8_TO_UTF16_TABLES_H
5059 #include <cstdint>
5060 
5061 namespace simdutf {
5062 namespace {
5063 namespace tables {
5064 namespace utf8_to_utf16 {
5065 /**
5066  * utf8bigindex uses about 8 kB
5067  * shufutf8 uses about 3344 B
5068  *
5069  * So we use a bit over 11 kB. It would be
5070  * easy to save about 4 kB by only
5071  * storing the index in utf8bigindex, and
5072  * deriving the consumed bytes otherwise.
5073  * However, this may come at a significant (10% to 20%)
5074  * performance penalty.
5075  */
5076 
/**
 * shufutf8: 209 rows of 16 byte indices each (209 * 16 = 3344 bytes).
 *
 * Within a row, consecutive entries form fixed-width output slots. Each slot
 * lists the input byte positions that feed it, highest position first, and
 * is padded with 255. Entries past the last slot are 0.
 *
 * NOTE(review): presumably each row is a 16-byte shuffle mask in which 255
 * yields a zero byte, and rows are selected through utf8bigindex - confirm
 * against the kernels that read this table.
 */
const uint8_t shufutf8[209][16] =
// Rows 0..63: six 2-byte slots, each fed by one or two input bytes.
{	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
 	// Rows 64..144: four 4-byte slots, each fed by one to three input bytes.
 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
 	// Rows 145..208: three 4-byte slots, each fed by one to four input bytes.
 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
 	{0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
 	{0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
 	{1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
 	{1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
 	{2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
 	{2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
 	{3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
 	{3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
 	{3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
 	{3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
 	{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
5287 /* number of two bytes : 64 */
5288 /* number of two + three bytes : 145 */
5289 /* number of two + three + four bytes : 209 */
5290 const uint8_t utf8bigindex[4096][2] =
5291 {	{209, 12},
5292  	{209, 12},
5293  	{209, 12},
5294  	{209, 12},
5295  	{209, 12},
5296  	{209, 12},
5297  	{209, 12},
5298  	{145, 3},
5299  	{209, 12},
5300  	{209, 12},
5301  	{209, 12},
5302  	{146, 4},
5303  	{209, 12},
5304  	{149, 4},
5305  	{161, 4},
5306  	{64, 4},
5307  	{209, 12},
5308  	{209, 12},
5309  	{209, 12},
5310  	{147, 5},
5311  	{209, 12},
5312  	{150, 5},
5313  	{162, 5},
5314  	{65, 5},
5315  	{209, 12},
5316  	{153, 5},
5317  	{165, 5},
5318  	{67, 5},
5319  	{177, 5},
5320  	{73, 5},
5321  	{91, 5},
5322  	{64, 4},
5323  	{209, 12},
5324  	{209, 12},
5325  	{209, 12},
5326  	{148, 6},
5327  	{209, 12},
5328  	{151, 6},
5329  	{163, 6},
5330  	{66, 6},
5331  	{209, 12},
5332  	{154, 6},
5333  	{166, 6},
5334  	{68, 6},
5335  	{178, 6},
5336  	{74, 6},
5337  	{92, 6},
5338  	{64, 4},
5339  	{209, 12},
5340  	{157, 6},
5341  	{169, 6},
5342  	{70, 6},
5343  	{181, 6},
5344  	{76, 6},
5345  	{94, 6},
5346  	{65, 5},
5347  	{193, 6},
5348  	{82, 6},
5349  	{100, 6},
5350  	{67, 5},
5351  	{118, 6},
5352  	{73, 5},
5353  	{91, 5},
5354  	{0, 6},
5355  	{209, 12},
5356  	{209, 12},
5357  	{209, 12},
5358  	{209, 12},
5359  	{209, 12},
5360  	{152, 7},
5361  	{164, 7},
5362  	{145, 3},
5363  	{209, 12},
5364  	{155, 7},
5365  	{167, 7},
5366  	{69, 7},
5367  	{179, 7},
5368  	{75, 7},
5369  	{93, 7},
5370  	{64, 4},
5371  	{209, 12},
5372  	{158, 7},
5373  	{170, 7},
5374  	{71, 7},
5375  	{182, 7},
5376  	{77, 7},
5377  	{95, 7},
5378  	{65, 5},
5379  	{194, 7},
5380  	{83, 7},
5381  	{101, 7},
5382  	{67, 5},
5383  	{119, 7},
5384  	{73, 5},
5385  	{91, 5},
5386  	{1, 7},
5387  	{209, 12},
5388  	{209, 12},
5389  	{173, 7},
5390  	{148, 6},
5391  	{185, 7},
5392  	{79, 7},
5393  	{97, 7},
5394  	{66, 6},
5395  	{197, 7},
5396  	{85, 7},
5397  	{103, 7},
5398  	{68, 6},
5399  	{121, 7},
5400  	{74, 6},
5401  	{92, 6},
5402  	{2, 7},
5403  	{209, 12},
5404  	{157, 6},
5405  	{109, 7},
5406  	{70, 6},
5407  	{127, 7},
5408  	{76, 6},
5409  	{94, 6},
5410  	{4, 7},
5411  	{193, 6},
5412  	{82, 6},
5413  	{100, 6},
5414  	{8, 7},
5415  	{118, 6},
5416  	{16, 7},
5417  	{32, 7},
5418  	{0, 6},
5419  	{209, 12},
5420  	{209, 12},
5421  	{209, 12},
5422  	{209, 12},
5423  	{209, 12},
5424  	{209, 12},
5425  	{209, 12},
5426  	{145, 3},
5427  	{209, 12},
5428  	{156, 8},
5429  	{168, 8},
5430  	{146, 4},
5431  	{180, 8},
5432  	{149, 4},
5433  	{161, 4},
5434  	{64, 4},
5435  	{209, 12},
5436  	{159, 8},
5437  	{171, 8},
5438  	{72, 8},
5439  	{183, 8},
5440  	{78, 8},
5441  	{96, 8},
5442  	{65, 5},
5443  	{195, 8},
5444  	{84, 8},
5445  	{102, 8},
5446  	{67, 5},
5447  	{120, 8},
5448  	{73, 5},
5449  	{91, 5},
5450  	{64, 4},
5451  	{209, 12},
5452  	{209, 12},
5453  	{174, 8},
5454  	{148, 6},
5455  	{186, 8},
5456  	{80, 8},
5457  	{98, 8},
5458  	{66, 6},
5459  	{198, 8},
5460  	{86, 8},
5461  	{104, 8},
5462  	{68, 6},
5463  	{122, 8},
5464  	{74, 6},
5465  	{92, 6},
5466  	{3, 8},
5467  	{209, 12},
5468  	{157, 6},
5469  	{110, 8},
5470  	{70, 6},
5471  	{128, 8},
5472  	{76, 6},
5473  	{94, 6},
5474  	{5, 8},
5475  	{193, 6},
5476  	{82, 6},
5477  	{100, 6},
5478  	{9, 8},
5479  	{118, 6},
5480  	{17, 8},
5481  	{33, 8},
5482  	{0, 6},
5483  	{209, 12},
5484  	{209, 12},
5485  	{209, 12},
5486  	{209, 12},
5487  	{189, 8},
5488  	{152, 7},
5489  	{164, 7},
5490  	{145, 3},
5491  	{201, 8},
5492  	{88, 8},
5493  	{106, 8},
5494  	{69, 7},
5495  	{124, 8},
5496  	{75, 7},
5497  	{93, 7},
5498  	{64, 4},
5499  	{209, 12},
5500  	{158, 7},
5501  	{112, 8},
5502  	{71, 7},
5503  	{130, 8},
5504  	{77, 7},
5505  	{95, 7},
5506  	{6, 8},
5507  	{194, 7},
5508  	{83, 7},
5509  	{101, 7},
5510  	{10, 8},
5511  	{119, 7},
5512  	{18, 8},
5513  	{34, 8},
5514  	{1, 7},
5515  	{209, 12},
5516  	{209, 12},
5517  	{173, 7},
5518  	{148, 6},
5519  	{136, 8},
5520  	{79, 7},
5521  	{97, 7},
5522  	{66, 6},
5523  	{197, 7},
5524  	{85, 7},
5525  	{103, 7},
5526  	{12, 8},
5527  	{121, 7},
5528  	{20, 8},
5529  	{36, 8},
5530  	{2, 7},
5531  	{209, 12},
5532  	{157, 6},
5533  	{109, 7},
5534  	{70, 6},
5535  	{127, 7},
5536  	{24, 8},
5537  	{40, 8},
5538  	{4, 7},
5539  	{193, 6},
5540  	{82, 6},
5541  	{48, 8},
5542  	{8, 7},
5543  	{118, 6},
5544  	{16, 7},
5545  	{32, 7},
5546  	{0, 6},
5547  	{209, 12},
5548  	{209, 12},
5549  	{209, 12},
5550  	{209, 12},
5551  	{209, 12},
5552  	{209, 12},
5553  	{209, 12},
5554  	{145, 3},
5555  	{209, 12},
5556  	{209, 12},
5557  	{209, 12},
5558  	{146, 4},
5559  	{209, 12},
5560  	{149, 4},
5561  	{161, 4},
5562  	{64, 4},
5563  	{209, 12},
5564  	{160, 9},
5565  	{172, 9},
5566  	{147, 5},
5567  	{184, 9},
5568  	{150, 5},
5569  	{162, 5},
5570  	{65, 5},
5571  	{196, 9},
5572  	{153, 5},
5573  	{165, 5},
5574  	{67, 5},
5575  	{177, 5},
5576  	{73, 5},
5577  	{91, 5},
5578  	{64, 4},
5579  	{209, 12},
5580  	{209, 12},
5581  	{175, 9},
5582  	{148, 6},
5583  	{187, 9},
5584  	{81, 9},
5585  	{99, 9},
5586  	{66, 6},
5587  	{199, 9},
5588  	{87, 9},
5589  	{105, 9},
5590  	{68, 6},
5591  	{123, 9},
5592  	{74, 6},
5593  	{92, 6},
5594  	{64, 4},
5595  	{209, 12},
5596  	{157, 6},
5597  	{111, 9},
5598  	{70, 6},
5599  	{129, 9},
5600  	{76, 6},
5601  	{94, 6},
5602  	{65, 5},
5603  	{193, 6},
5604  	{82, 6},
5605  	{100, 6},
5606  	{67, 5},
5607  	{118, 6},
5608  	{73, 5},
5609  	{91, 5},
5610  	{0, 6},
5611  	{209, 12},
5612  	{209, 12},
5613  	{209, 12},
5614  	{209, 12},
5615  	{190, 9},
5616  	{152, 7},
5617  	{164, 7},
5618  	{145, 3},
5619  	{202, 9},
5620  	{89, 9},
5621  	{107, 9},
5622  	{69, 7},
5623  	{125, 9},
5624  	{75, 7},
5625  	{93, 7},
5626  	{64, 4},
5627  	{209, 12},
5628  	{158, 7},
5629  	{113, 9},
5630  	{71, 7},
5631  	{131, 9},
5632  	{77, 7},
5633  	{95, 7},
5634  	{7, 9},
5635  	{194, 7},
5636  	{83, 7},
5637  	{101, 7},
5638  	{11, 9},
5639  	{119, 7},
5640  	{19, 9},
5641  	{35, 9},
5642  	{1, 7},
5643  	{209, 12},
5644  	{209, 12},
5645  	{173, 7},
5646  	{148, 6},
5647  	{137, 9},
5648  	{79, 7},
5649  	{97, 7},
5650  	{66, 6},
5651  	{197, 7},
5652  	{85, 7},
5653  	{103, 7},
5654  	{13, 9},
5655  	{121, 7},
5656  	{21, 9},
5657  	{37, 9},
5658  	{2, 7},
5659  	{209, 12},
5660  	{157, 6},
5661  	{109, 7},
5662  	{70, 6},
5663  	{127, 7},
5664  	{25, 9},
5665  	{41, 9},
5666  	{4, 7},
5667  	{193, 6},
5668  	{82, 6},
5669  	{49, 9},
5670  	{8, 7},
5671  	{118, 6},
5672  	{16, 7},
5673  	{32, 7},
5674  	{0, 6},
5675  	{209, 12},
5676  	{209, 12},
5677  	{209, 12},
5678  	{209, 12},
5679  	{209, 12},
5680  	{209, 12},
5681  	{209, 12},
5682  	{145, 3},
5683  	{205, 9},
5684  	{156, 8},
5685  	{168, 8},
5686  	{146, 4},
5687  	{180, 8},
5688  	{149, 4},
5689  	{161, 4},
5690  	{64, 4},
5691  	{209, 12},
5692  	{159, 8},
5693  	{115, 9},
5694  	{72, 8},
5695  	{133, 9},
5696  	{78, 8},
5697  	{96, 8},
5698  	{65, 5},
5699  	{195, 8},
5700  	{84, 8},
5701  	{102, 8},
5702  	{67, 5},
5703  	{120, 8},
5704  	{73, 5},
5705  	{91, 5},
5706  	{64, 4},
5707  	{209, 12},
5708  	{209, 12},
5709  	{174, 8},
5710  	{148, 6},
5711  	{139, 9},
5712  	{80, 8},
5713  	{98, 8},
5714  	{66, 6},
5715  	{198, 8},
5716  	{86, 8},
5717  	{104, 8},
5718  	{14, 9},
5719  	{122, 8},
5720  	{22, 9},
5721  	{38, 9},
5722  	{3, 8},
5723  	{209, 12},
5724  	{157, 6},
5725  	{110, 8},
5726  	{70, 6},
5727  	{128, 8},
5728  	{26, 9},
5729  	{42, 9},
5730  	{5, 8},
5731  	{193, 6},
5732  	{82, 6},
5733  	{50, 9},
5734  	{9, 8},
5735  	{118, 6},
5736  	{17, 8},
5737  	{33, 8},
5738  	{0, 6},
5739  	{209, 12},
5740  	{209, 12},
5741  	{209, 12},
5742  	{209, 12},
5743  	{189, 8},
5744  	{152, 7},
5745  	{164, 7},
5746  	{145, 3},
5747  	{201, 8},
5748  	{88, 8},
5749  	{106, 8},
5750  	{69, 7},
5751  	{124, 8},
5752  	{75, 7},
5753  	{93, 7},
5754  	{64, 4},
5755  	{209, 12},
5756  	{158, 7},
5757  	{112, 8},
5758  	{71, 7},
5759  	{130, 8},
5760  	{28, 9},
5761  	{44, 9},
5762  	{6, 8},
5763  	{194, 7},
5764  	{83, 7},
5765  	{52, 9},
5766  	{10, 8},
5767  	{119, 7},
5768  	{18, 8},
5769  	{34, 8},
5770  	{1, 7},
5771  	{209, 12},
5772  	{209, 12},
5773  	{173, 7},
5774  	{148, 6},
5775  	{136, 8},
5776  	{79, 7},
5777  	{97, 7},
5778  	{66, 6},
5779  	{197, 7},
5780  	{85, 7},
5781  	{56, 9},
5782  	{12, 8},
5783  	{121, 7},
5784  	{20, 8},
5785  	{36, 8},
5786  	{2, 7},
5787  	{209, 12},
5788  	{157, 6},
5789  	{109, 7},
5790  	{70, 6},
5791  	{127, 7},
5792  	{24, 8},
5793  	{40, 8},
5794  	{4, 7},
5795  	{193, 6},
5796  	{82, 6},
5797  	{48, 8},
5798  	{8, 7},
5799  	{118, 6},
5800  	{16, 7},
5801  	{32, 7},
5802  	{0, 6},
5803  	{209, 12},
5804  	{209, 12},
5805  	{209, 12},
5806  	{209, 12},
5807  	{209, 12},
5808  	{209, 12},
5809  	{209, 12},
5810  	{145, 3},
5811  	{209, 12},
5812  	{209, 12},
5813  	{209, 12},
5814  	{146, 4},
5815  	{209, 12},
5816  	{149, 4},
5817  	{161, 4},
5818  	{64, 4},
5819  	{209, 12},
5820  	{209, 12},
5821  	{209, 12},
5822  	{147, 5},
5823  	{209, 12},
5824  	{150, 5},
5825  	{162, 5},
5826  	{65, 5},
5827  	{209, 12},
5828  	{153, 5},
5829  	{165, 5},
5830  	{67, 5},
5831  	{177, 5},
5832  	{73, 5},
5833  	{91, 5},
5834  	{64, 4},
5835  	{209, 12},
5836  	{209, 12},
5837  	{176, 10},
5838  	{148, 6},
5839  	{188, 10},
5840  	{151, 6},
5841  	{163, 6},
5842  	{66, 6},
5843  	{200, 10},
5844  	{154, 6},
5845  	{166, 6},
5846  	{68, 6},
5847  	{178, 6},
5848  	{74, 6},
5849  	{92, 6},
5850  	{64, 4},
5851  	{209, 12},
5852  	{157, 6},
5853  	{169, 6},
5854  	{70, 6},
5855  	{181, 6},
5856  	{76, 6},
5857  	{94, 6},
5858  	{65, 5},
5859  	{193, 6},
5860  	{82, 6},
5861  	{100, 6},
5862  	{67, 5},
5863  	{118, 6},
5864  	{73, 5},
5865  	{91, 5},
5866  	{0, 6},
5867  	{209, 12},
5868  	{209, 12},
5869  	{209, 12},
5870  	{209, 12},
5871  	{191, 10},
5872  	{152, 7},
5873  	{164, 7},
5874  	{145, 3},
5875  	{203, 10},
5876  	{90, 10},
5877  	{108, 10},
5878  	{69, 7},
5879  	{126, 10},
5880  	{75, 7},
5881  	{93, 7},
5882  	{64, 4},
5883  	{209, 12},
5884  	{158, 7},
5885  	{114, 10},
5886  	{71, 7},
5887  	{132, 10},
5888  	{77, 7},
5889  	{95, 7},
5890  	{65, 5},
5891  	{194, 7},
5892  	{83, 7},
5893  	{101, 7},
5894  	{67, 5},
5895  	{119, 7},
5896  	{73, 5},
5897  	{91, 5},
5898  	{1, 7},
5899  	{209, 12},
5900  	{209, 12},
5901  	{173, 7},
5902  	{148, 6},
5903  	{138, 10},
5904  	{79, 7},
5905  	{97, 7},
5906  	{66, 6},
5907  	{197, 7},
5908  	{85, 7},
5909  	{103, 7},
5910  	{68, 6},
5911  	{121, 7},
5912  	{74, 6},
5913  	{92, 6},
5914  	{2, 7},
5915  	{209, 12},
5916  	{157, 6},
5917  	{109, 7},
5918  	{70, 6},
5919  	{127, 7},
5920  	{76, 6},
5921  	{94, 6},
5922  	{4, 7},
5923  	{193, 6},
5924  	{82, 6},
5925  	{100, 6},
5926  	{8, 7},
5927  	{118, 6},
5928  	{16, 7},
5929  	{32, 7},
5930  	{0, 6},
5931  	{209, 12},
5932  	{209, 12},
5933  	{209, 12},
5934  	{209, 12},
5935  	{209, 12},
5936  	{209, 12},
5937  	{209, 12},
5938  	{145, 3},
5939  	{206, 10},
5940  	{156, 8},
5941  	{168, 8},
5942  	{146, 4},
5943  	{180, 8},
5944  	{149, 4},
5945  	{161, 4},
5946  	{64, 4},
5947  	{209, 12},
5948  	{159, 8},
5949  	{116, 10},
5950  	{72, 8},
5951  	{134, 10},
5952  	{78, 8},
5953  	{96, 8},
5954  	{65, 5},
5955  	{195, 8},
5956  	{84, 8},
5957  	{102, 8},
5958  	{67, 5},
5959  	{120, 8},
5960  	{73, 5},
5961  	{91, 5},
5962  	{64, 4},
5963  	{209, 12},
5964  	{209, 12},
5965  	{174, 8},
5966  	{148, 6},
5967  	{140, 10},
5968  	{80, 8},
5969  	{98, 8},
5970  	{66, 6},
5971  	{198, 8},
5972  	{86, 8},
5973  	{104, 8},
5974  	{15, 10},
5975  	{122, 8},
5976  	{23, 10},
5977  	{39, 10},
5978  	{3, 8},
5979  	{209, 12},
5980  	{157, 6},
5981  	{110, 8},
5982  	{70, 6},
5983  	{128, 8},
5984  	{27, 10},
5985  	{43, 10},
5986  	{5, 8},
5987  	{193, 6},
5988  	{82, 6},
5989  	{51, 10},
5990  	{9, 8},
5991  	{118, 6},
5992  	{17, 8},
5993  	{33, 8},
5994  	{0, 6},
5995  	{209, 12},
5996  	{209, 12},
5997  	{209, 12},
5998  	{209, 12},
5999  	{189, 8},
6000  	{152, 7},
6001  	{164, 7},
6002  	{145, 3},
6003  	{201, 8},
6004  	{88, 8},
6005  	{106, 8},
6006  	{69, 7},
6007  	{124, 8},
6008  	{75, 7},
6009  	{93, 7},
6010  	{64, 4},
6011  	{209, 12},
6012  	{158, 7},
6013  	{112, 8},
6014  	{71, 7},
6015  	{130, 8},
6016  	{29, 10},
6017  	{45, 10},
6018  	{6, 8},
6019  	{194, 7},
6020  	{83, 7},
6021  	{53, 10},
6022  	{10, 8},
6023  	{119, 7},
6024  	{18, 8},
6025  	{34, 8},
6026  	{1, 7},
6027  	{209, 12},
6028  	{209, 12},
6029  	{173, 7},
6030  	{148, 6},
6031  	{136, 8},
6032  	{79, 7},
6033  	{97, 7},
6034  	{66, 6},
6035  	{197, 7},
6036  	{85, 7},
6037  	{57, 10},
6038  	{12, 8},
6039  	{121, 7},
6040  	{20, 8},
6041  	{36, 8},
6042  	{2, 7},
6043  	{209, 12},
6044  	{157, 6},
6045  	{109, 7},
6046  	{70, 6},
6047  	{127, 7},
6048  	{24, 8},
6049  	{40, 8},
6050  	{4, 7},
6051  	{193, 6},
6052  	{82, 6},
6053  	{48, 8},
6054  	{8, 7},
6055  	{118, 6},
6056  	{16, 7},
6057  	{32, 7},
6058  	{0, 6},
6059  	{209, 12},
6060  	{209, 12},
6061  	{209, 12},
6062  	{209, 12},
6063  	{209, 12},
6064  	{209, 12},
6065  	{209, 12},
6066  	{145, 3},
6067  	{209, 12},
6068  	{209, 12},
6069  	{209, 12},
6070  	{146, 4},
6071  	{209, 12},
6072  	{149, 4},
6073  	{161, 4},
6074  	{64, 4},
6075  	{209, 12},
6076  	{160, 9},
6077  	{172, 9},
6078  	{147, 5},
6079  	{184, 9},
6080  	{150, 5},
6081  	{162, 5},
6082  	{65, 5},
6083  	{196, 9},
6084  	{153, 5},
6085  	{165, 5},
6086  	{67, 5},
6087  	{177, 5},
6088  	{73, 5},
6089  	{91, 5},
6090  	{64, 4},
6091  	{209, 12},
6092  	{209, 12},
6093  	{175, 9},
6094  	{148, 6},
6095  	{142, 10},
6096  	{81, 9},
6097  	{99, 9},
6098  	{66, 6},
6099  	{199, 9},
6100  	{87, 9},
6101  	{105, 9},
6102  	{68, 6},
6103  	{123, 9},
6104  	{74, 6},
6105  	{92, 6},
6106  	{64, 4},
6107  	{209, 12},
6108  	{157, 6},
6109  	{111, 9},
6110  	{70, 6},
6111  	{129, 9},
6112  	{76, 6},
6113  	{94, 6},
6114  	{65, 5},
6115  	{193, 6},
6116  	{82, 6},
6117  	{100, 6},
6118  	{67, 5},
6119  	{118, 6},
6120  	{73, 5},
6121  	{91, 5},
6122  	{0, 6},
6123  	{209, 12},
6124  	{209, 12},
6125  	{209, 12},
6126  	{209, 12},
6127  	{190, 9},
6128  	{152, 7},
6129  	{164, 7},
6130  	{145, 3},
6131  	{202, 9},
6132  	{89, 9},
6133  	{107, 9},
6134  	{69, 7},
6135  	{125, 9},
6136  	{75, 7},
6137  	{93, 7},
6138  	{64, 4},
6139  	{209, 12},
6140  	{158, 7},
6141  	{113, 9},
6142  	{71, 7},
6143  	{131, 9},
6144  	{30, 10},
6145  	{46, 10},
6146  	{7, 9},
6147  	{194, 7},
6148  	{83, 7},
6149  	{54, 10},
6150  	{11, 9},
6151  	{119, 7},
6152  	{19, 9},
6153  	{35, 9},
6154  	{1, 7},
6155  	{209, 12},
6156  	{209, 12},
6157  	{173, 7},
6158  	{148, 6},
6159  	{137, 9},
6160  	{79, 7},
6161  	{97, 7},
6162  	{66, 6},
6163  	{197, 7},
6164  	{85, 7},
6165  	{58, 10},
6166  	{13, 9},
6167  	{121, 7},
6168  	{21, 9},
6169  	{37, 9},
6170  	{2, 7},
6171  	{209, 12},
6172  	{157, 6},
6173  	{109, 7},
6174  	{70, 6},
6175  	{127, 7},
6176  	{25, 9},
6177  	{41, 9},
6178  	{4, 7},
6179  	{193, 6},
6180  	{82, 6},
6181  	{49, 9},
6182  	{8, 7},
6183  	{118, 6},
6184  	{16, 7},
6185  	{32, 7},
6186  	{0, 6},
6187  	{209, 12},
6188  	{209, 12},
6189  	{209, 12},
6190  	{209, 12},
6191  	{209, 12},
6192  	{209, 12},
6193  	{209, 12},
6194  	{145, 3},
6195  	{205, 9},
6196  	{156, 8},
6197  	{168, 8},
6198  	{146, 4},
6199  	{180, 8},
6200  	{149, 4},
6201  	{161, 4},
6202  	{64, 4},
6203  	{209, 12},
6204  	{159, 8},
6205  	{115, 9},
6206  	{72, 8},
6207  	{133, 9},
6208  	{78, 8},
6209  	{96, 8},
6210  	{65, 5},
6211  	{195, 8},
6212  	{84, 8},
6213  	{102, 8},
6214  	{67, 5},
6215  	{120, 8},
6216  	{73, 5},
6217  	{91, 5},
6218  	{64, 4},
6219  	{209, 12},
6220  	{209, 12},
6221  	{174, 8},
6222  	{148, 6},
6223  	{139, 9},
6224  	{80, 8},
6225  	{98, 8},
6226  	{66, 6},
6227  	{198, 8},
6228  	{86, 8},
6229  	{60, 10},
6230  	{14, 9},
6231  	{122, 8},
6232  	{22, 9},
6233  	{38, 9},
6234  	{3, 8},
6235  	{209, 12},
6236  	{157, 6},
6237  	{110, 8},
6238  	{70, 6},
6239  	{128, 8},
6240  	{26, 9},
6241  	{42, 9},
6242  	{5, 8},
6243  	{193, 6},
6244  	{82, 6},
6245  	{50, 9},
6246  	{9, 8},
6247  	{118, 6},
6248  	{17, 8},
6249  	{33, 8},
6250  	{0, 6},
6251  	{209, 12},
6252  	{209, 12},
6253  	{209, 12},
6254  	{209, 12},
6255  	{189, 8},
6256  	{152, 7},
6257  	{164, 7},
6258  	{145, 3},
6259  	{201, 8},
6260  	{88, 8},
6261  	{106, 8},
6262  	{69, 7},
6263  	{124, 8},
6264  	{75, 7},
6265  	{93, 7},
6266  	{64, 4},
6267  	{209, 12},
6268  	{158, 7},
6269  	{112, 8},
6270  	{71, 7},
6271  	{130, 8},
6272  	{28, 9},
6273  	{44, 9},
6274  	{6, 8},
6275  	{194, 7},
6276  	{83, 7},
6277  	{52, 9},
6278  	{10, 8},
6279  	{119, 7},
6280  	{18, 8},
6281  	{34, 8},
6282  	{1, 7},
6283  	{209, 12},
6284  	{209, 12},
6285  	{173, 7},
6286  	{148, 6},
6287  	{136, 8},
6288  	{79, 7},
6289  	{97, 7},
6290  	{66, 6},
6291  	{197, 7},
6292  	{85, 7},
6293  	{56, 9},
6294  	{12, 8},
6295  	{121, 7},
6296  	{20, 8},
6297  	{36, 8},
6298  	{2, 7},
6299  	{209, 12},
6300  	{157, 6},
6301  	{109, 7},
6302  	{70, 6},
6303  	{127, 7},
6304  	{24, 8},
6305  	{40, 8},
6306  	{4, 7},
6307  	{193, 6},
6308  	{82, 6},
6309  	{48, 8},
6310  	{8, 7},
6311  	{118, 6},
6312  	{16, 7},
6313  	{32, 7},
6314  	{0, 6},
6315  	{209, 12},
6316  	{209, 12},
6317  	{209, 12},
6318  	{209, 12},
6319  	{209, 12},
6320  	{209, 12},
6321  	{209, 12},
6322  	{145, 3},
6323  	{209, 12},
6324  	{209, 12},
6325  	{209, 12},
6326  	{146, 4},
6327  	{209, 12},
6328  	{149, 4},
6329  	{161, 4},
6330  	{64, 4},
6331  	{209, 12},
6332  	{209, 12},
6333  	{209, 12},
6334  	{147, 5},
6335  	{209, 12},
6336  	{150, 5},
6337  	{162, 5},
6338  	{65, 5},
6339  	{209, 12},
6340  	{153, 5},
6341  	{165, 5},
6342  	{67, 5},
6343  	{177, 5},
6344  	{73, 5},
6345  	{91, 5},
6346  	{64, 4},
6347  	{209, 12},
6348  	{209, 12},
6349  	{209, 12},
6350  	{148, 6},
6351  	{209, 12},
6352  	{151, 6},
6353  	{163, 6},
6354  	{66, 6},
6355  	{209, 12},
6356  	{154, 6},
6357  	{166, 6},
6358  	{68, 6},
6359  	{178, 6},
6360  	{74, 6},
6361  	{92, 6},
6362  	{64, 4},
6363  	{209, 12},
6364  	{157, 6},
6365  	{169, 6},
6366  	{70, 6},
6367  	{181, 6},
6368  	{76, 6},
6369  	{94, 6},
6370  	{65, 5},
6371  	{193, 6},
6372  	{82, 6},
6373  	{100, 6},
6374  	{67, 5},
6375  	{118, 6},
6376  	{73, 5},
6377  	{91, 5},
6378  	{0, 6},
6379  	{209, 12},
6380  	{209, 12},
6381  	{209, 12},
6382  	{209, 12},
6383  	{192, 11},
6384  	{152, 7},
6385  	{164, 7},
6386  	{145, 3},
6387  	{204, 11},
6388  	{155, 7},
6389  	{167, 7},
6390  	{69, 7},
6391  	{179, 7},
6392  	{75, 7},
6393  	{93, 7},
6394  	{64, 4},
6395  	{209, 12},
6396  	{158, 7},
6397  	{170, 7},
6398  	{71, 7},
6399  	{182, 7},
6400  	{77, 7},
6401  	{95, 7},
6402  	{65, 5},
6403  	{194, 7},
6404  	{83, 7},
6405  	{101, 7},
6406  	{67, 5},
6407  	{119, 7},
6408  	{73, 5},
6409  	{91, 5},
6410  	{1, 7},
6411  	{209, 12},
6412  	{209, 12},
6413  	{173, 7},
6414  	{148, 6},
6415  	{185, 7},
6416  	{79, 7},
6417  	{97, 7},
6418  	{66, 6},
6419  	{197, 7},
6420  	{85, 7},
6421  	{103, 7},
6422  	{68, 6},
6423  	{121, 7},
6424  	{74, 6},
6425  	{92, 6},
6426  	{2, 7},
6427  	{209, 12},
6428  	{157, 6},
6429  	{109, 7},
6430  	{70, 6},
6431  	{127, 7},
6432  	{76, 6},
6433  	{94, 6},
6434  	{4, 7},
6435  	{193, 6},
6436  	{82, 6},
6437  	{100, 6},
6438  	{8, 7},
6439  	{118, 6},
6440  	{16, 7},
6441  	{32, 7},
6442  	{0, 6},
6443  	{209, 12},
6444  	{209, 12},
6445  	{209, 12},
6446  	{209, 12},
6447  	{209, 12},
6448  	{209, 12},
6449  	{209, 12},
6450  	{145, 3},
6451  	{207, 11},
6452  	{156, 8},
6453  	{168, 8},
6454  	{146, 4},
6455  	{180, 8},
6456  	{149, 4},
6457  	{161, 4},
6458  	{64, 4},
6459  	{209, 12},
6460  	{159, 8},
6461  	{117, 11},
6462  	{72, 8},
6463  	{135, 11},
6464  	{78, 8},
6465  	{96, 8},
6466  	{65, 5},
6467  	{195, 8},
6468  	{84, 8},
6469  	{102, 8},
6470  	{67, 5},
6471  	{120, 8},
6472  	{73, 5},
6473  	{91, 5},
6474  	{64, 4},
6475  	{209, 12},
6476  	{209, 12},
6477  	{174, 8},
6478  	{148, 6},
6479  	{141, 11},
6480  	{80, 8},
6481  	{98, 8},
6482  	{66, 6},
6483  	{198, 8},
6484  	{86, 8},
6485  	{104, 8},
6486  	{68, 6},
6487  	{122, 8},
6488  	{74, 6},
6489  	{92, 6},
6490  	{3, 8},
6491  	{209, 12},
6492  	{157, 6},
6493  	{110, 8},
6494  	{70, 6},
6495  	{128, 8},
6496  	{76, 6},
6497  	{94, 6},
6498  	{5, 8},
6499  	{193, 6},
6500  	{82, 6},
6501  	{100, 6},
6502  	{9, 8},
6503  	{118, 6},
6504  	{17, 8},
6505  	{33, 8},
6506  	{0, 6},
6507  	{209, 12},
6508  	{209, 12},
6509  	{209, 12},
6510  	{209, 12},
6511  	{189, 8},
6512  	{152, 7},
6513  	{164, 7},
6514  	{145, 3},
6515  	{201, 8},
6516  	{88, 8},
6517  	{106, 8},
6518  	{69, 7},
6519  	{124, 8},
6520  	{75, 7},
6521  	{93, 7},
6522  	{64, 4},
6523  	{209, 12},
6524  	{158, 7},
6525  	{112, 8},
6526  	{71, 7},
6527  	{130, 8},
6528  	{77, 7},
6529  	{95, 7},
6530  	{6, 8},
6531  	{194, 7},
6532  	{83, 7},
6533  	{101, 7},
6534  	{10, 8},
6535  	{119, 7},
6536  	{18, 8},
6537  	{34, 8},
6538  	{1, 7},
6539  	{209, 12},
6540  	{209, 12},
6541  	{173, 7},
6542  	{148, 6},
6543  	{136, 8},
6544  	{79, 7},
6545  	{97, 7},
6546  	{66, 6},
6547  	{197, 7},
6548  	{85, 7},
6549  	{103, 7},
6550  	{12, 8},
6551  	{121, 7},
6552  	{20, 8},
6553  	{36, 8},
6554  	{2, 7},
6555  	{209, 12},
6556  	{157, 6},
6557  	{109, 7},
6558  	{70, 6},
6559  	{127, 7},
6560  	{24, 8},
6561  	{40, 8},
6562  	{4, 7},
6563  	{193, 6},
6564  	{82, 6},
6565  	{48, 8},
6566  	{8, 7},
6567  	{118, 6},
6568  	{16, 7},
6569  	{32, 7},
6570  	{0, 6},
6571  	{209, 12},
6572  	{209, 12},
6573  	{209, 12},
6574  	{209, 12},
6575  	{209, 12},
6576  	{209, 12},
6577  	{209, 12},
6578  	{145, 3},
6579  	{209, 12},
6580  	{209, 12},
6581  	{209, 12},
6582  	{146, 4},
6583  	{209, 12},
6584  	{149, 4},
6585  	{161, 4},
6586  	{64, 4},
6587  	{209, 12},
6588  	{160, 9},
6589  	{172, 9},
6590  	{147, 5},
6591  	{184, 9},
6592  	{150, 5},
6593  	{162, 5},
6594  	{65, 5},
6595  	{196, 9},
6596  	{153, 5},
6597  	{165, 5},
6598  	{67, 5},
6599  	{177, 5},
6600  	{73, 5},
6601  	{91, 5},
6602  	{64, 4},
6603  	{209, 12},
6604  	{209, 12},
6605  	{175, 9},
6606  	{148, 6},
6607  	{143, 11},
6608  	{81, 9},
6609  	{99, 9},
6610  	{66, 6},
6611  	{199, 9},
6612  	{87, 9},
6613  	{105, 9},
6614  	{68, 6},
6615  	{123, 9},
6616  	{74, 6},
6617  	{92, 6},
6618  	{64, 4},
6619  	{209, 12},
6620  	{157, 6},
6621  	{111, 9},
6622  	{70, 6},
6623  	{129, 9},
6624  	{76, 6},
6625  	{94, 6},
6626  	{65, 5},
6627  	{193, 6},
6628  	{82, 6},
6629  	{100, 6},
6630  	{67, 5},
6631  	{118, 6},
6632  	{73, 5},
6633  	{91, 5},
6634  	{0, 6},
6635  	{209, 12},
6636  	{209, 12},
6637  	{209, 12},
6638  	{209, 12},
6639  	{190, 9},
6640  	{152, 7},
6641  	{164, 7},
6642  	{145, 3},
6643  	{202, 9},
6644  	{89, 9},
6645  	{107, 9},
6646  	{69, 7},
6647  	{125, 9},
6648  	{75, 7},
6649  	{93, 7},
6650  	{64, 4},
6651  	{209, 12},
6652  	{158, 7},
6653  	{113, 9},
6654  	{71, 7},
6655  	{131, 9},
6656  	{31, 11},
6657  	{47, 11},
6658  	{7, 9},
6659  	{194, 7},
6660  	{83, 7},
6661  	{55, 11},
6662  	{11, 9},
6663  	{119, 7},
6664  	{19, 9},
6665  	{35, 9},
6666  	{1, 7},
6667  	{209, 12},
6668  	{209, 12},
6669  	{173, 7},
6670  	{148, 6},
6671  	{137, 9},
6672  	{79, 7},
6673  	{97, 7},
6674  	{66, 6},
6675  	{197, 7},
6676  	{85, 7},
6677  	{59, 11},
6678  	{13, 9},
6679  	{121, 7},
6680  	{21, 9},
6681  	{37, 9},
6682  	{2, 7},
6683  	{209, 12},
6684  	{157, 6},
6685  	{109, 7},
6686  	{70, 6},
6687  	{127, 7},
6688  	{25, 9},
6689  	{41, 9},
6690  	{4, 7},
6691  	{193, 6},
6692  	{82, 6},
6693  	{49, 9},
6694  	{8, 7},
6695  	{118, 6},
6696  	{16, 7},
6697  	{32, 7},
6698  	{0, 6},
6699  	{209, 12},
6700  	{209, 12},
6701  	{209, 12},
6702  	{209, 12},
6703  	{209, 12},
6704  	{209, 12},
6705  	{209, 12},
6706  	{145, 3},
6707  	{205, 9},
6708  	{156, 8},
6709  	{168, 8},
6710  	{146, 4},
6711  	{180, 8},
6712  	{149, 4},
6713  	{161, 4},
6714  	{64, 4},
6715  	{209, 12},
6716  	{159, 8},
6717  	{115, 9},
6718  	{72, 8},
6719  	{133, 9},
6720  	{78, 8},
6721  	{96, 8},
6722  	{65, 5},
6723  	{195, 8},
6724  	{84, 8},
6725  	{102, 8},
6726  	{67, 5},
6727  	{120, 8},
6728  	{73, 5},
6729  	{91, 5},
6730  	{64, 4},
6731  	{209, 12},
6732  	{209, 12},
6733  	{174, 8},
6734  	{148, 6},
6735  	{139, 9},
6736  	{80, 8},
6737  	{98, 8},
6738  	{66, 6},
6739  	{198, 8},
6740  	{86, 8},
6741  	{61, 11},
6742  	{14, 9},
6743  	{122, 8},
6744  	{22, 9},
6745  	{38, 9},
6746  	{3, 8},
6747  	{209, 12},
6748  	{157, 6},
6749  	{110, 8},
6750  	{70, 6},
6751  	{128, 8},
6752  	{26, 9},
6753  	{42, 9},
6754  	{5, 8},
6755  	{193, 6},
6756  	{82, 6},
6757  	{50, 9},
6758  	{9, 8},
6759  	{118, 6},
6760  	{17, 8},
6761  	{33, 8},
6762  	{0, 6},
6763  	{209, 12},
6764  	{209, 12},
6765  	{209, 12},
6766  	{209, 12},
6767  	{189, 8},
6768  	{152, 7},
6769  	{164, 7},
6770  	{145, 3},
6771  	{201, 8},
6772  	{88, 8},
6773  	{106, 8},
6774  	{69, 7},
6775  	{124, 8},
6776  	{75, 7},
6777  	{93, 7},
6778  	{64, 4},
6779  	{209, 12},
6780  	{158, 7},
6781  	{112, 8},
6782  	{71, 7},
6783  	{130, 8},
6784  	{28, 9},
6785  	{44, 9},
6786  	{6, 8},
6787  	{194, 7},
6788  	{83, 7},
6789  	{52, 9},
6790  	{10, 8},
6791  	{119, 7},
6792  	{18, 8},
6793  	{34, 8},
6794  	{1, 7},
6795  	{209, 12},
6796  	{209, 12},
6797  	{173, 7},
6798  	{148, 6},
6799  	{136, 8},
6800  	{79, 7},
6801  	{97, 7},
6802  	{66, 6},
6803  	{197, 7},
6804  	{85, 7},
6805  	{56, 9},
6806  	{12, 8},
6807  	{121, 7},
6808  	{20, 8},
6809  	{36, 8},
6810  	{2, 7},
6811  	{209, 12},
6812  	{157, 6},
6813  	{109, 7},
6814  	{70, 6},
6815  	{127, 7},
6816  	{24, 8},
6817  	{40, 8},
6818  	{4, 7},
6819  	{193, 6},
6820  	{82, 6},
6821  	{48, 8},
6822  	{8, 7},
6823  	{118, 6},
6824  	{16, 7},
6825  	{32, 7},
6826  	{0, 6},
6827  	{209, 12},
6828  	{209, 12},
6829  	{209, 12},
6830  	{209, 12},
6831  	{209, 12},
6832  	{209, 12},
6833  	{209, 12},
6834  	{145, 3},
6835  	{209, 12},
6836  	{209, 12},
6837  	{209, 12},
6838  	{146, 4},
6839  	{209, 12},
6840  	{149, 4},
6841  	{161, 4},
6842  	{64, 4},
6843  	{209, 12},
6844  	{209, 12},
6845  	{209, 12},
6846  	{147, 5},
6847  	{209, 12},
6848  	{150, 5},
6849  	{162, 5},
6850  	{65, 5},
6851  	{209, 12},
6852  	{153, 5},
6853  	{165, 5},
6854  	{67, 5},
6855  	{177, 5},
6856  	{73, 5},
6857  	{91, 5},
6858  	{64, 4},
6859  	{209, 12},
6860  	{209, 12},
6861  	{176, 10},
6862  	{148, 6},
6863  	{188, 10},
6864  	{151, 6},
6865  	{163, 6},
6866  	{66, 6},
6867  	{200, 10},
6868  	{154, 6},
6869  	{166, 6},
6870  	{68, 6},
6871  	{178, 6},
6872  	{74, 6},
6873  	{92, 6},
6874  	{64, 4},
6875  	{209, 12},
6876  	{157, 6},
6877  	{169, 6},
6878  	{70, 6},
6879  	{181, 6},
6880  	{76, 6},
6881  	{94, 6},
6882  	{65, 5},
6883  	{193, 6},
6884  	{82, 6},
6885  	{100, 6},
6886  	{67, 5},
6887  	{118, 6},
6888  	{73, 5},
6889  	{91, 5},
6890  	{0, 6},
6891  	{209, 12},
6892  	{209, 12},
6893  	{209, 12},
6894  	{209, 12},
6895  	{191, 10},
6896  	{152, 7},
6897  	{164, 7},
6898  	{145, 3},
6899  	{203, 10},
6900  	{90, 10},
6901  	{108, 10},
6902  	{69, 7},
6903  	{126, 10},
6904  	{75, 7},
6905  	{93, 7},
6906  	{64, 4},
6907  	{209, 12},
6908  	{158, 7},
6909  	{114, 10},
6910  	{71, 7},
6911  	{132, 10},
6912  	{77, 7},
6913  	{95, 7},
6914  	{65, 5},
6915  	{194, 7},
6916  	{83, 7},
6917  	{101, 7},
6918  	{67, 5},
6919  	{119, 7},
6920  	{73, 5},
6921  	{91, 5},
6922  	{1, 7},
6923  	{209, 12},
6924  	{209, 12},
6925  	{173, 7},
6926  	{148, 6},
6927  	{138, 10},
6928  	{79, 7},
6929  	{97, 7},
6930  	{66, 6},
6931  	{197, 7},
6932  	{85, 7},
6933  	{103, 7},
6934  	{68, 6},
6935  	{121, 7},
6936  	{74, 6},
6937  	{92, 6},
6938  	{2, 7},
6939  	{209, 12},
6940  	{157, 6},
6941  	{109, 7},
6942  	{70, 6},
6943  	{127, 7},
6944  	{76, 6},
6945  	{94, 6},
6946  	{4, 7},
6947  	{193, 6},
6948  	{82, 6},
6949  	{100, 6},
6950  	{8, 7},
6951  	{118, 6},
6952  	{16, 7},
6953  	{32, 7},
6954  	{0, 6},
6955  	{209, 12},
6956  	{209, 12},
6957  	{209, 12},
6958  	{209, 12},
6959  	{209, 12},
6960  	{209, 12},
6961  	{209, 12},
6962  	{145, 3},
6963  	{206, 10},
6964  	{156, 8},
6965  	{168, 8},
6966  	{146, 4},
6967  	{180, 8},
6968  	{149, 4},
6969  	{161, 4},
6970  	{64, 4},
6971  	{209, 12},
6972  	{159, 8},
6973  	{116, 10},
6974  	{72, 8},
6975  	{134, 10},
6976  	{78, 8},
6977  	{96, 8},
6978  	{65, 5},
6979  	{195, 8},
6980  	{84, 8},
6981  	{102, 8},
6982  	{67, 5},
6983  	{120, 8},
6984  	{73, 5},
6985  	{91, 5},
6986  	{64, 4},
6987  	{209, 12},
6988  	{209, 12},
6989  	{174, 8},
6990  	{148, 6},
6991  	{140, 10},
6992  	{80, 8},
6993  	{98, 8},
6994  	{66, 6},
6995  	{198, 8},
6996  	{86, 8},
6997  	{62, 11},
6998  	{15, 10},
6999  	{122, 8},
7000  	{23, 10},
7001  	{39, 10},
7002  	{3, 8},
7003  	{209, 12},
7004  	{157, 6},
7005  	{110, 8},
7006  	{70, 6},
7007  	{128, 8},
7008  	{27, 10},
7009  	{43, 10},
7010  	{5, 8},
7011  	{193, 6},
7012  	{82, 6},
7013  	{51, 10},
7014  	{9, 8},
7015  	{118, 6},
7016  	{17, 8},
7017  	{33, 8},
7018  	{0, 6},
7019  	{209, 12},
7020  	{209, 12},
7021  	{209, 12},
7022  	{209, 12},
7023  	{189, 8},
7024  	{152, 7},
7025  	{164, 7},
7026  	{145, 3},
7027  	{201, 8},
7028  	{88, 8},
7029  	{106, 8},
7030  	{69, 7},
7031  	{124, 8},
7032  	{75, 7},
7033  	{93, 7},
7034  	{64, 4},
7035  	{209, 12},
7036  	{158, 7},
7037  	{112, 8},
7038  	{71, 7},
7039  	{130, 8},
7040  	{29, 10},
7041  	{45, 10},
7042  	{6, 8},
7043  	{194, 7},
7044  	{83, 7},
7045  	{53, 10},
7046  	{10, 8},
7047  	{119, 7},
7048  	{18, 8},
7049  	{34, 8},
7050  	{1, 7},
7051  	{209, 12},
7052  	{209, 12},
7053  	{173, 7},
7054  	{148, 6},
7055  	{136, 8},
7056  	{79, 7},
7057  	{97, 7},
7058  	{66, 6},
7059  	{197, 7},
7060  	{85, 7},
7061  	{57, 10},
7062  	{12, 8},
7063  	{121, 7},
7064  	{20, 8},
7065  	{36, 8},
7066  	{2, 7},
7067  	{209, 12},
7068  	{157, 6},
7069  	{109, 7},
7070  	{70, 6},
7071  	{127, 7},
7072  	{24, 8},
7073  	{40, 8},
7074  	{4, 7},
7075  	{193, 6},
7076  	{82, 6},
7077  	{48, 8},
7078  	{8, 7},
7079  	{118, 6},
7080  	{16, 7},
7081  	{32, 7},
7082  	{0, 6},
7083  	{209, 12},
7084  	{209, 12},
7085  	{209, 12},
7086  	{209, 12},
7087  	{209, 12},
7088  	{209, 12},
7089  	{209, 12},
7090  	{145, 3},
7091  	{209, 12},
7092  	{209, 12},
7093  	{209, 12},
7094  	{146, 4},
7095  	{209, 12},
7096  	{149, 4},
7097  	{161, 4},
7098  	{64, 4},
7099  	{209, 12},
7100  	{160, 9},
7101  	{172, 9},
7102  	{147, 5},
7103  	{184, 9},
7104  	{150, 5},
7105  	{162, 5},
7106  	{65, 5},
7107  	{196, 9},
7108  	{153, 5},
7109  	{165, 5},
7110  	{67, 5},
7111  	{177, 5},
7112  	{73, 5},
7113  	{91, 5},
7114  	{64, 4},
7115  	{209, 12},
7116  	{209, 12},
7117  	{175, 9},
7118  	{148, 6},
7119  	{142, 10},
7120  	{81, 9},
7121  	{99, 9},
7122  	{66, 6},
7123  	{199, 9},
7124  	{87, 9},
7125  	{105, 9},
7126  	{68, 6},
7127  	{123, 9},
7128  	{74, 6},
7129  	{92, 6},
7130  	{64, 4},
7131  	{209, 12},
7132  	{157, 6},
7133  	{111, 9},
7134  	{70, 6},
7135  	{129, 9},
7136  	{76, 6},
7137  	{94, 6},
7138  	{65, 5},
7139  	{193, 6},
7140  	{82, 6},
7141  	{100, 6},
7142  	{67, 5},
7143  	{118, 6},
7144  	{73, 5},
7145  	{91, 5},
7146  	{0, 6},
7147  	{209, 12},
7148  	{209, 12},
7149  	{209, 12},
7150  	{209, 12},
7151  	{190, 9},
7152  	{152, 7},
7153  	{164, 7},
7154  	{145, 3},
7155  	{202, 9},
7156  	{89, 9},
7157  	{107, 9},
7158  	{69, 7},
7159  	{125, 9},
7160  	{75, 7},
7161  	{93, 7},
7162  	{64, 4},
7163  	{209, 12},
7164  	{158, 7},
7165  	{113, 9},
7166  	{71, 7},
7167  	{131, 9},
7168  	{30, 10},
7169  	{46, 10},
7170  	{7, 9},
7171  	{194, 7},
7172  	{83, 7},
7173  	{54, 10},
7174  	{11, 9},
7175  	{119, 7},
7176  	{19, 9},
7177  	{35, 9},
7178  	{1, 7},
7179  	{209, 12},
7180  	{209, 12},
7181  	{173, 7},
7182  	{148, 6},
7183  	{137, 9},
7184  	{79, 7},
7185  	{97, 7},
7186  	{66, 6},
7187  	{197, 7},
7188  	{85, 7},
7189  	{58, 10},
7190  	{13, 9},
7191  	{121, 7},
7192  	{21, 9},
7193  	{37, 9},
7194  	{2, 7},
7195  	{209, 12},
7196  	{157, 6},
7197  	{109, 7},
7198  	{70, 6},
7199  	{127, 7},
7200  	{25, 9},
7201  	{41, 9},
7202  	{4, 7},
7203  	{193, 6},
7204  	{82, 6},
7205  	{49, 9},
7206  	{8, 7},
7207  	{118, 6},
7208  	{16, 7},
7209  	{32, 7},
7210  	{0, 6},
7211  	{209, 12},
7212  	{209, 12},
7213  	{209, 12},
7214  	{209, 12},
7215  	{209, 12},
7216  	{209, 12},
7217  	{209, 12},
7218  	{145, 3},
7219  	{205, 9},
7220  	{156, 8},
7221  	{168, 8},
7222  	{146, 4},
7223  	{180, 8},
7224  	{149, 4},
7225  	{161, 4},
7226  	{64, 4},
7227  	{209, 12},
7228  	{159, 8},
7229  	{115, 9},
7230  	{72, 8},
7231  	{133, 9},
7232  	{78, 8},
7233  	{96, 8},
7234  	{65, 5},
7235  	{195, 8},
7236  	{84, 8},
7237  	{102, 8},
7238  	{67, 5},
7239  	{120, 8},
7240  	{73, 5},
7241  	{91, 5},
7242  	{64, 4},
7243  	{209, 12},
7244  	{209, 12},
7245  	{174, 8},
7246  	{148, 6},
7247  	{139, 9},
7248  	{80, 8},
7249  	{98, 8},
7250  	{66, 6},
7251  	{198, 8},
7252  	{86, 8},
7253  	{60, 10},
7254  	{14, 9},
7255  	{122, 8},
7256  	{22, 9},
7257  	{38, 9},
7258  	{3, 8},
7259  	{209, 12},
7260  	{157, 6},
7261  	{110, 8},
7262  	{70, 6},
7263  	{128, 8},
7264  	{26, 9},
7265  	{42, 9},
7266  	{5, 8},
7267  	{193, 6},
7268  	{82, 6},
7269  	{50, 9},
7270  	{9, 8},
7271  	{118, 6},
7272  	{17, 8},
7273  	{33, 8},
7274  	{0, 6},
7275  	{209, 12},
7276  	{209, 12},
7277  	{209, 12},
7278  	{209, 12},
7279  	{189, 8},
7280  	{152, 7},
7281  	{164, 7},
7282  	{145, 3},
7283  	{201, 8},
7284  	{88, 8},
7285  	{106, 8},
7286  	{69, 7},
7287  	{124, 8},
7288  	{75, 7},
7289  	{93, 7},
7290  	{64, 4},
7291  	{209, 12},
7292  	{158, 7},
7293  	{112, 8},
7294  	{71, 7},
7295  	{130, 8},
7296  	{28, 9},
7297  	{44, 9},
7298  	{6, 8},
7299  	{194, 7},
7300  	{83, 7},
7301  	{52, 9},
7302  	{10, 8},
7303  	{119, 7},
7304  	{18, 8},
7305  	{34, 8},
7306  	{1, 7},
7307  	{209, 12},
7308  	{209, 12},
7309  	{173, 7},
7310  	{148, 6},
7311  	{136, 8},
7312  	{79, 7},
7313  	{97, 7},
7314  	{66, 6},
7315  	{197, 7},
7316  	{85, 7},
7317  	{56, 9},
7318  	{12, 8},
7319  	{121, 7},
7320  	{20, 8},
7321  	{36, 8},
7322  	{2, 7},
7323  	{209, 12},
7324  	{157, 6},
7325  	{109, 7},
7326  	{70, 6},
7327  	{127, 7},
7328  	{24, 8},
7329  	{40, 8},
7330  	{4, 7},
7331  	{193, 6},
7332  	{82, 6},
7333  	{48, 8},
7334  	{8, 7},
7335  	{118, 6},
7336  	{16, 7},
7337  	{32, 7},
7338  	{0, 6},
7339  	{209, 12},
7340  	{209, 12},
7341  	{209, 12},
7342  	{209, 12},
7343  	{209, 12},
7344  	{209, 12},
7345  	{209, 12},
7346  	{145, 3},
7347  	{209, 12},
7348  	{209, 12},
7349  	{209, 12},
7350  	{146, 4},
7351  	{209, 12},
7352  	{149, 4},
7353  	{161, 4},
7354  	{64, 4},
7355  	{209, 12},
7356  	{209, 12},
7357  	{209, 12},
7358  	{147, 5},
7359  	{209, 12},
7360  	{150, 5},
7361  	{162, 5},
7362  	{65, 5},
7363  	{209, 12},
7364  	{153, 5},
7365  	{165, 5},
7366  	{67, 5},
7367  	{177, 5},
7368  	{73, 5},
7369  	{91, 5},
7370  	{64, 4},
7371  	{209, 12},
7372  	{209, 12},
7373  	{209, 12},
7374  	{148, 6},
7375  	{209, 12},
7376  	{151, 6},
7377  	{163, 6},
7378  	{66, 6},
7379  	{209, 12},
7380  	{154, 6},
7381  	{166, 6},
7382  	{68, 6},
7383  	{178, 6},
7384  	{74, 6},
7385  	{92, 6},
7386  	{64, 4},
7387  	{209, 12},
7388  	{157, 6},
7389  	{169, 6},
7390  	{70, 6},
7391  	{181, 6},
7392  	{76, 6},
7393  	{94, 6},
7394  	{65, 5},
7395  	{193, 6},
7396  	{82, 6},
7397  	{100, 6},
7398  	{67, 5},
7399  	{118, 6},
7400  	{73, 5},
7401  	{91, 5},
7402  	{0, 6},
7403  	{209, 12},
7404  	{209, 12},
7405  	{209, 12},
7406  	{209, 12},
7407  	{209, 12},
7408  	{152, 7},
7409  	{164, 7},
7410  	{145, 3},
7411  	{209, 12},
7412  	{155, 7},
7413  	{167, 7},
7414  	{69, 7},
7415  	{179, 7},
7416  	{75, 7},
7417  	{93, 7},
7418  	{64, 4},
7419  	{209, 12},
7420  	{158, 7},
7421  	{170, 7},
7422  	{71, 7},
7423  	{182, 7},
7424  	{77, 7},
7425  	{95, 7},
7426  	{65, 5},
7427  	{194, 7},
7428  	{83, 7},
7429  	{101, 7},
7430  	{67, 5},
7431  	{119, 7},
7432  	{73, 5},
7433  	{91, 5},
7434  	{1, 7},
7435  	{209, 12},
7436  	{209, 12},
7437  	{173, 7},
7438  	{148, 6},
7439  	{185, 7},
7440  	{79, 7},
7441  	{97, 7},
7442  	{66, 6},
7443  	{197, 7},
7444  	{85, 7},
7445  	{103, 7},
7446  	{68, 6},
7447  	{121, 7},
7448  	{74, 6},
7449  	{92, 6},
7450  	{2, 7},
7451  	{209, 12},
7452  	{157, 6},
7453  	{109, 7},
7454  	{70, 6},
7455  	{127, 7},
7456  	{76, 6},
7457  	{94, 6},
7458  	{4, 7},
7459  	{193, 6},
7460  	{82, 6},
7461  	{100, 6},
7462  	{8, 7},
7463  	{118, 6},
7464  	{16, 7},
7465  	{32, 7},
7466  	{0, 6},
7467  	{209, 12},
7468  	{209, 12},
7469  	{209, 12},
7470  	{209, 12},
7471  	{209, 12},
7472  	{209, 12},
7473  	{209, 12},
7474  	{145, 3},
7475  	{208, 12},
7476  	{156, 8},
7477  	{168, 8},
7478  	{146, 4},
7479  	{180, 8},
7480  	{149, 4},
7481  	{161, 4},
7482  	{64, 4},
7483  	{209, 12},
7484  	{159, 8},
7485  	{171, 8},
7486  	{72, 8},
7487  	{183, 8},
7488  	{78, 8},
7489  	{96, 8},
7490  	{65, 5},
7491  	{195, 8},
7492  	{84, 8},
7493  	{102, 8},
7494  	{67, 5},
7495  	{120, 8},
7496  	{73, 5},
7497  	{91, 5},
7498  	{64, 4},
7499  	{209, 12},
7500  	{209, 12},
7501  	{174, 8},
7502  	{148, 6},
7503  	{186, 8},
7504  	{80, 8},
7505  	{98, 8},
7506  	{66, 6},
7507  	{198, 8},
7508  	{86, 8},
7509  	{104, 8},
7510  	{68, 6},
7511  	{122, 8},
7512  	{74, 6},
7513  	{92, 6},
7514  	{3, 8},
7515  	{209, 12},
7516  	{157, 6},
7517  	{110, 8},
7518  	{70, 6},
7519  	{128, 8},
7520  	{76, 6},
7521  	{94, 6},
7522  	{5, 8},
7523  	{193, 6},
7524  	{82, 6},
7525  	{100, 6},
7526  	{9, 8},
7527  	{118, 6},
7528  	{17, 8},
7529  	{33, 8},
7530  	{0, 6},
7531  	{209, 12},
7532  	{209, 12},
7533  	{209, 12},
7534  	{209, 12},
7535  	{189, 8},
7536  	{152, 7},
7537  	{164, 7},
7538  	{145, 3},
7539  	{201, 8},
7540  	{88, 8},
7541  	{106, 8},
7542  	{69, 7},
7543  	{124, 8},
7544  	{75, 7},
7545  	{93, 7},
7546  	{64, 4},
7547  	{209, 12},
7548  	{158, 7},
7549  	{112, 8},
7550  	{71, 7},
7551  	{130, 8},
7552  	{77, 7},
7553  	{95, 7},
7554  	{6, 8},
7555  	{194, 7},
7556  	{83, 7},
7557  	{101, 7},
7558  	{10, 8},
7559  	{119, 7},
7560  	{18, 8},
7561  	{34, 8},
7562  	{1, 7},
7563  	{209, 12},
7564  	{209, 12},
7565  	{173, 7},
7566  	{148, 6},
7567  	{136, 8},
7568  	{79, 7},
7569  	{97, 7},
7570  	{66, 6},
7571  	{197, 7},
7572  	{85, 7},
7573  	{103, 7},
7574  	{12, 8},
7575  	{121, 7},
7576  	{20, 8},
7577  	{36, 8},
7578  	{2, 7},
7579  	{209, 12},
7580  	{157, 6},
7581  	{109, 7},
7582  	{70, 6},
7583  	{127, 7},
7584  	{24, 8},
7585  	{40, 8},
7586  	{4, 7},
7587  	{193, 6},
7588  	{82, 6},
7589  	{48, 8},
7590  	{8, 7},
7591  	{118, 6},
7592  	{16, 7},
7593  	{32, 7},
7594  	{0, 6},
7595  	{209, 12},
7596  	{209, 12},
7597  	{209, 12},
7598  	{209, 12},
7599  	{209, 12},
7600  	{209, 12},
7601  	{209, 12},
7602  	{145, 3},
7603  	{209, 12},
7604  	{209, 12},
7605  	{209, 12},
7606  	{146, 4},
7607  	{209, 12},
7608  	{149, 4},
7609  	{161, 4},
7610  	{64, 4},
7611  	{209, 12},
7612  	{160, 9},
7613  	{172, 9},
7614  	{147, 5},
7615  	{184, 9},
7616  	{150, 5},
7617  	{162, 5},
7618  	{65, 5},
7619  	{196, 9},
7620  	{153, 5},
7621  	{165, 5},
7622  	{67, 5},
7623  	{177, 5},
7624  	{73, 5},
7625  	{91, 5},
7626  	{64, 4},
7627  	{209, 12},
7628  	{209, 12},
7629  	{175, 9},
7630  	{148, 6},
7631  	{144, 12},
7632  	{81, 9},
7633  	{99, 9},
7634  	{66, 6},
7635  	{199, 9},
7636  	{87, 9},
7637  	{105, 9},
7638  	{68, 6},
7639  	{123, 9},
7640  	{74, 6},
7641  	{92, 6},
7642  	{64, 4},
7643  	{209, 12},
7644  	{157, 6},
7645  	{111, 9},
7646  	{70, 6},
7647  	{129, 9},
7648  	{76, 6},
7649  	{94, 6},
7650  	{65, 5},
7651  	{193, 6},
7652  	{82, 6},
7653  	{100, 6},
7654  	{67, 5},
7655  	{118, 6},
7656  	{73, 5},
7657  	{91, 5},
7658  	{0, 6},
7659  	{209, 12},
7660  	{209, 12},
7661  	{209, 12},
7662  	{209, 12},
7663  	{190, 9},
7664  	{152, 7},
7665  	{164, 7},
7666  	{145, 3},
7667  	{202, 9},
7668  	{89, 9},
7669  	{107, 9},
7670  	{69, 7},
7671  	{125, 9},
7672  	{75, 7},
7673  	{93, 7},
7674  	{64, 4},
7675  	{209, 12},
7676  	{158, 7},
7677  	{113, 9},
7678  	{71, 7},
7679  	{131, 9},
7680  	{77, 7},
7681  	{95, 7},
7682  	{7, 9},
7683  	{194, 7},
7684  	{83, 7},
7685  	{101, 7},
7686  	{11, 9},
7687  	{119, 7},
7688  	{19, 9},
7689  	{35, 9},
7690  	{1, 7},
7691  	{209, 12},
7692  	{209, 12},
7693  	{173, 7},
7694  	{148, 6},
7695  	{137, 9},
7696  	{79, 7},
7697  	{97, 7},
7698  	{66, 6},
7699  	{197, 7},
7700  	{85, 7},
7701  	{103, 7},
7702  	{13, 9},
7703  	{121, 7},
7704  	{21, 9},
7705  	{37, 9},
7706  	{2, 7},
7707  	{209, 12},
7708  	{157, 6},
7709  	{109, 7},
7710  	{70, 6},
7711  	{127, 7},
7712  	{25, 9},
7713  	{41, 9},
7714  	{4, 7},
7715  	{193, 6},
7716  	{82, 6},
7717  	{49, 9},
7718  	{8, 7},
7719  	{118, 6},
7720  	{16, 7},
7721  	{32, 7},
7722  	{0, 6},
7723  	{209, 12},
7724  	{209, 12},
7725  	{209, 12},
7726  	{209, 12},
7727  	{209, 12},
7728  	{209, 12},
7729  	{209, 12},
7730  	{145, 3},
7731  	{205, 9},
7732  	{156, 8},
7733  	{168, 8},
7734  	{146, 4},
7735  	{180, 8},
7736  	{149, 4},
7737  	{161, 4},
7738  	{64, 4},
7739  	{209, 12},
7740  	{159, 8},
7741  	{115, 9},
7742  	{72, 8},
7743  	{133, 9},
7744  	{78, 8},
7745  	{96, 8},
7746  	{65, 5},
7747  	{195, 8},
7748  	{84, 8},
7749  	{102, 8},
7750  	{67, 5},
7751  	{120, 8},
7752  	{73, 5},
7753  	{91, 5},
7754  	{64, 4},
7755  	{209, 12},
7756  	{209, 12},
7757  	{174, 8},
7758  	{148, 6},
7759  	{139, 9},
7760  	{80, 8},
7761  	{98, 8},
7762  	{66, 6},
7763  	{198, 8},
7764  	{86, 8},
7765  	{104, 8},
7766  	{14, 9},
7767  	{122, 8},
7768  	{22, 9},
7769  	{38, 9},
7770  	{3, 8},
7771  	{209, 12},
7772  	{157, 6},
7773  	{110, 8},
7774  	{70, 6},
7775  	{128, 8},
7776  	{26, 9},
7777  	{42, 9},
7778  	{5, 8},
7779  	{193, 6},
7780  	{82, 6},
7781  	{50, 9},
7782  	{9, 8},
7783  	{118, 6},
7784  	{17, 8},
7785  	{33, 8},
7786  	{0, 6},
7787  	{209, 12},
7788  	{209, 12},
7789  	{209, 12},
7790  	{209, 12},
7791  	{189, 8},
7792  	{152, 7},
7793  	{164, 7},
7794  	{145, 3},
7795  	{201, 8},
7796  	{88, 8},
7797  	{106, 8},
7798  	{69, 7},
7799  	{124, 8},
7800  	{75, 7},
7801  	{93, 7},
7802  	{64, 4},
7803  	{209, 12},
7804  	{158, 7},
7805  	{112, 8},
7806  	{71, 7},
7807  	{130, 8},
7808  	{28, 9},
7809  	{44, 9},
7810  	{6, 8},
7811  	{194, 7},
7812  	{83, 7},
7813  	{52, 9},
7814  	{10, 8},
7815  	{119, 7},
7816  	{18, 8},
7817  	{34, 8},
7818  	{1, 7},
7819  	{209, 12},
7820  	{209, 12},
7821  	{173, 7},
7822  	{148, 6},
7823  	{136, 8},
7824  	{79, 7},
7825  	{97, 7},
7826  	{66, 6},
7827  	{197, 7},
7828  	{85, 7},
7829  	{56, 9},
7830  	{12, 8},
7831  	{121, 7},
7832  	{20, 8},
7833  	{36, 8},
7834  	{2, 7},
7835  	{209, 12},
7836  	{157, 6},
7837  	{109, 7},
7838  	{70, 6},
7839  	{127, 7},
7840  	{24, 8},
7841  	{40, 8},
7842  	{4, 7},
7843  	{193, 6},
7844  	{82, 6},
7845  	{48, 8},
7846  	{8, 7},
7847  	{118, 6},
7848  	{16, 7},
7849  	{32, 7},
7850  	{0, 6},
7851  	{209, 12},
7852  	{209, 12},
7853  	{209, 12},
7854  	{209, 12},
7855  	{209, 12},
7856  	{209, 12},
7857  	{209, 12},
7858  	{145, 3},
7859  	{209, 12},
7860  	{209, 12},
7861  	{209, 12},
7862  	{146, 4},
7863  	{209, 12},
7864  	{149, 4},
7865  	{161, 4},
7866  	{64, 4},
7867  	{209, 12},
7868  	{209, 12},
7869  	{209, 12},
7870  	{147, 5},
7871  	{209, 12},
7872  	{150, 5},
7873  	{162, 5},
7874  	{65, 5},
7875  	{209, 12},
7876  	{153, 5},
7877  	{165, 5},
7878  	{67, 5},
7879  	{177, 5},
7880  	{73, 5},
7881  	{91, 5},
7882  	{64, 4},
7883  	{209, 12},
7884  	{209, 12},
7885  	{176, 10},
7886  	{148, 6},
7887  	{188, 10},
7888  	{151, 6},
7889  	{163, 6},
7890  	{66, 6},
7891  	{200, 10},
7892  	{154, 6},
7893  	{166, 6},
7894  	{68, 6},
7895  	{178, 6},
7896  	{74, 6},
7897  	{92, 6},
7898  	{64, 4},
7899  	{209, 12},
7900  	{157, 6},
7901  	{169, 6},
7902  	{70, 6},
7903  	{181, 6},
7904  	{76, 6},
7905  	{94, 6},
7906  	{65, 5},
7907  	{193, 6},
7908  	{82, 6},
7909  	{100, 6},
7910  	{67, 5},
7911  	{118, 6},
7912  	{73, 5},
7913  	{91, 5},
7914  	{0, 6},
7915  	{209, 12},
7916  	{209, 12},
7917  	{209, 12},
7918  	{209, 12},
7919  	{191, 10},
7920  	{152, 7},
7921  	{164, 7},
7922  	{145, 3},
7923  	{203, 10},
7924  	{90, 10},
7925  	{108, 10},
7926  	{69, 7},
7927  	{126, 10},
7928  	{75, 7},
7929  	{93, 7},
7930  	{64, 4},
7931  	{209, 12},
7932  	{158, 7},
7933  	{114, 10},
7934  	{71, 7},
7935  	{132, 10},
7936  	{77, 7},
7937  	{95, 7},
7938  	{65, 5},
7939  	{194, 7},
7940  	{83, 7},
7941  	{101, 7},
7942  	{67, 5},
7943  	{119, 7},
7944  	{73, 5},
7945  	{91, 5},
7946  	{1, 7},
7947  	{209, 12},
7948  	{209, 12},
7949  	{173, 7},
7950  	{148, 6},
7951  	{138, 10},
7952  	{79, 7},
7953  	{97, 7},
7954  	{66, 6},
7955  	{197, 7},
7956  	{85, 7},
7957  	{103, 7},
7958  	{68, 6},
7959  	{121, 7},
7960  	{74, 6},
7961  	{92, 6},
7962  	{2, 7},
7963  	{209, 12},
7964  	{157, 6},
7965  	{109, 7},
7966  	{70, 6},
7967  	{127, 7},
7968  	{76, 6},
7969  	{94, 6},
7970  	{4, 7},
7971  	{193, 6},
7972  	{82, 6},
7973  	{100, 6},
7974  	{8, 7},
7975  	{118, 6},
7976  	{16, 7},
7977  	{32, 7},
7978  	{0, 6},
7979  	{209, 12},
7980  	{209, 12},
7981  	{209, 12},
7982  	{209, 12},
7983  	{209, 12},
7984  	{209, 12},
7985  	{209, 12},
7986  	{145, 3},
7987  	{206, 10},
7988  	{156, 8},
7989  	{168, 8},
7990  	{146, 4},
7991  	{180, 8},
7992  	{149, 4},
7993  	{161, 4},
7994  	{64, 4},
7995  	{209, 12},
7996  	{159, 8},
7997  	{116, 10},
7998  	{72, 8},
7999  	{134, 10},
8000  	{78, 8},
8001  	{96, 8},
8002  	{65, 5},
8003  	{195, 8},
8004  	{84, 8},
8005  	{102, 8},
8006  	{67, 5},
8007  	{120, 8},
8008  	{73, 5},
8009  	{91, 5},
8010  	{64, 4},
8011  	{209, 12},
8012  	{209, 12},
8013  	{174, 8},
8014  	{148, 6},
8015  	{140, 10},
8016  	{80, 8},
8017  	{98, 8},
8018  	{66, 6},
8019  	{198, 8},
8020  	{86, 8},
8021  	{63, 12},
8022  	{15, 10},
8023  	{122, 8},
8024  	{23, 10},
8025  	{39, 10},
8026  	{3, 8},
8027  	{209, 12},
8028  	{157, 6},
8029  	{110, 8},
8030  	{70, 6},
8031  	{128, 8},
8032  	{27, 10},
8033  	{43, 10},
8034  	{5, 8},
8035  	{193, 6},
8036  	{82, 6},
8037  	{51, 10},
8038  	{9, 8},
8039  	{118, 6},
8040  	{17, 8},
8041  	{33, 8},
8042  	{0, 6},
8043  	{209, 12},
8044  	{209, 12},
8045  	{209, 12},
8046  	{209, 12},
8047  	{189, 8},
8048  	{152, 7},
8049  	{164, 7},
8050  	{145, 3},
8051  	{201, 8},
8052  	{88, 8},
8053  	{106, 8},
8054  	{69, 7},
8055  	{124, 8},
8056  	{75, 7},
8057  	{93, 7},
8058  	{64, 4},
8059  	{209, 12},
8060  	{158, 7},
8061  	{112, 8},
8062  	{71, 7},
8063  	{130, 8},
8064  	{29, 10},
8065  	{45, 10},
8066  	{6, 8},
8067  	{194, 7},
8068  	{83, 7},
8069  	{53, 10},
8070  	{10, 8},
8071  	{119, 7},
8072  	{18, 8},
8073  	{34, 8},
8074  	{1, 7},
8075  	{209, 12},
8076  	{209, 12},
8077  	{173, 7},
8078  	{148, 6},
8079  	{136, 8},
8080  	{79, 7},
8081  	{97, 7},
8082  	{66, 6},
8083  	{197, 7},
8084  	{85, 7},
8085  	{57, 10},
8086  	{12, 8},
8087  	{121, 7},
8088  	{20, 8},
8089  	{36, 8},
8090  	{2, 7},
8091  	{209, 12},
8092  	{157, 6},
8093  	{109, 7},
8094  	{70, 6},
8095  	{127, 7},
8096  	{24, 8},
8097  	{40, 8},
8098  	{4, 7},
8099  	{193, 6},
8100  	{82, 6},
8101  	{48, 8},
8102  	{8, 7},
8103  	{118, 6},
8104  	{16, 7},
8105  	{32, 7},
8106  	{0, 6},
8107  	{209, 12},
8108  	{209, 12},
8109  	{209, 12},
8110  	{209, 12},
8111  	{209, 12},
8112  	{209, 12},
8113  	{209, 12},
8114  	{145, 3},
8115  	{209, 12},
8116  	{209, 12},
8117  	{209, 12},
8118  	{146, 4},
8119  	{209, 12},
8120  	{149, 4},
8121  	{161, 4},
8122  	{64, 4},
8123  	{209, 12},
8124  	{160, 9},
8125  	{172, 9},
8126  	{147, 5},
8127  	{184, 9},
8128  	{150, 5},
8129  	{162, 5},
8130  	{65, 5},
8131  	{196, 9},
8132  	{153, 5},
8133  	{165, 5},
8134  	{67, 5},
8135  	{177, 5},
8136  	{73, 5},
8137  	{91, 5},
8138  	{64, 4},
8139  	{209, 12},
8140  	{209, 12},
8141  	{175, 9},
8142  	{148, 6},
8143  	{142, 10},
8144  	{81, 9},
8145  	{99, 9},
8146  	{66, 6},
8147  	{199, 9},
8148  	{87, 9},
8149  	{105, 9},
8150  	{68, 6},
8151  	{123, 9},
8152  	{74, 6},
8153  	{92, 6},
8154  	{64, 4},
8155  	{209, 12},
8156  	{157, 6},
8157  	{111, 9},
8158  	{70, 6},
8159  	{129, 9},
8160  	{76, 6},
8161  	{94, 6},
8162  	{65, 5},
8163  	{193, 6},
8164  	{82, 6},
8165  	{100, 6},
8166  	{67, 5},
8167  	{118, 6},
8168  	{73, 5},
8169  	{91, 5},
8170  	{0, 6},
8171  	{209, 12},
8172  	{209, 12},
8173  	{209, 12},
8174  	{209, 12},
8175  	{190, 9},
8176  	{152, 7},
8177  	{164, 7},
8178  	{145, 3},
8179  	{202, 9},
8180  	{89, 9},
8181  	{107, 9},
8182  	{69, 7},
8183  	{125, 9},
8184  	{75, 7},
8185  	{93, 7},
8186  	{64, 4},
8187  	{209, 12},
8188  	{158, 7},
8189  	{113, 9},
8190  	{71, 7},
8191  	{131, 9},
8192  	{30, 10},
8193  	{46, 10},
8194  	{7, 9},
8195  	{194, 7},
8196  	{83, 7},
8197  	{54, 10},
8198  	{11, 9},
8199  	{119, 7},
8200  	{19, 9},
8201  	{35, 9},
8202  	{1, 7},
8203  	{209, 12},
8204  	{209, 12},
8205  	{173, 7},
8206  	{148, 6},
8207  	{137, 9},
8208  	{79, 7},
8209  	{97, 7},
8210  	{66, 6},
8211  	{197, 7},
8212  	{85, 7},
8213  	{58, 10},
8214  	{13, 9},
8215  	{121, 7},
8216  	{21, 9},
8217  	{37, 9},
8218  	{2, 7},
8219  	{209, 12},
8220  	{157, 6},
8221  	{109, 7},
8222  	{70, 6},
8223  	{127, 7},
8224  	{25, 9},
8225  	{41, 9},
8226  	{4, 7},
8227  	{193, 6},
8228  	{82, 6},
8229  	{49, 9},
8230  	{8, 7},
8231  	{118, 6},
8232  	{16, 7},
8233  	{32, 7},
8234  	{0, 6},
8235  	{209, 12},
8236  	{209, 12},
8237  	{209, 12},
8238  	{209, 12},
8239  	{209, 12},
8240  	{209, 12},
8241  	{209, 12},
8242  	{145, 3},
8243  	{205, 9},
8244  	{156, 8},
8245  	{168, 8},
8246  	{146, 4},
8247  	{180, 8},
8248  	{149, 4},
8249  	{161, 4},
8250  	{64, 4},
8251  	{209, 12},
8252  	{159, 8},
8253  	{115, 9},
8254  	{72, 8},
8255  	{133, 9},
8256  	{78, 8},
8257  	{96, 8},
8258  	{65, 5},
8259  	{195, 8},
8260  	{84, 8},
8261  	{102, 8},
8262  	{67, 5},
8263  	{120, 8},
8264  	{73, 5},
8265  	{91, 5},
8266  	{64, 4},
8267  	{209, 12},
8268  	{209, 12},
8269  	{174, 8},
8270  	{148, 6},
8271  	{139, 9},
8272  	{80, 8},
8273  	{98, 8},
8274  	{66, 6},
8275  	{198, 8},
8276  	{86, 8},
8277  	{60, 10},
8278  	{14, 9},
8279  	{122, 8},
8280  	{22, 9},
8281  	{38, 9},
8282  	{3, 8},
8283  	{209, 12},
8284  	{157, 6},
8285  	{110, 8},
8286  	{70, 6},
8287  	{128, 8},
8288  	{26, 9},
8289  	{42, 9},
8290  	{5, 8},
8291  	{193, 6},
8292  	{82, 6},
8293  	{50, 9},
8294  	{9, 8},
8295  	{118, 6},
8296  	{17, 8},
8297  	{33, 8},
8298  	{0, 6},
8299  	{209, 12},
8300  	{209, 12},
8301  	{209, 12},
8302  	{209, 12},
8303  	{189, 8},
8304  	{152, 7},
8305  	{164, 7},
8306  	{145, 3},
8307  	{201, 8},
8308  	{88, 8},
8309  	{106, 8},
8310  	{69, 7},
8311  	{124, 8},
8312  	{75, 7},
8313  	{93, 7},
8314  	{64, 4},
8315  	{209, 12},
8316  	{158, 7},
8317  	{112, 8},
8318  	{71, 7},
8319  	{130, 8},
8320  	{28, 9},
8321  	{44, 9},
8322  	{6, 8},
8323  	{194, 7},
8324  	{83, 7},
8325  	{52, 9},
8326  	{10, 8},
8327  	{119, 7},
8328  	{18, 8},
8329  	{34, 8},
8330  	{1, 7},
8331  	{209, 12},
8332  	{209, 12},
8333  	{173, 7},
8334  	{148, 6},
8335  	{136, 8},
8336  	{79, 7},
8337  	{97, 7},
8338  	{66, 6},
8339  	{197, 7},
8340  	{85, 7},
8341  	{56, 9},
8342  	{12, 8},
8343  	{121, 7},
8344  	{20, 8},
8345  	{36, 8},
8346  	{2, 7},
8347  	{209, 12},
8348  	{157, 6},
8349  	{109, 7},
8350  	{70, 6},
8351  	{127, 7},
8352  	{24, 8},
8353  	{40, 8},
8354  	{4, 7},
8355  	{193, 6},
8356  	{82, 6},
8357  	{48, 8},
8358  	{8, 7},
8359  	{118, 6},
8360  	{16, 7},
8361  	{32, 7},
8362  	{0, 6},
8363  	{209, 12},
8364  	{209, 12},
8365  	{209, 12},
8366  	{209, 12},
8367  	{209, 12},
8368  	{209, 12},
8369  	{209, 12},
8370  	{145, 3},
8371  	{209, 12},
8372  	{209, 12},
8373  	{209, 12},
8374  	{146, 4},
8375  	{209, 12},
8376  	{149, 4},
8377  	{161, 4},
8378  	{64, 4},
8379  	{209, 12},
8380  	{209, 12},
8381  	{209, 12},
8382  	{147, 5},
8383  	{209, 12},
8384  	{150, 5},
8385  	{162, 5},
8386  	{65, 5},
8387  	{209, 12},
8388  	{153, 5},
8389  	{165, 5},
8390  	{67, 5},
8391  	{177, 5},
8392  	{73, 5},
8393  	{91, 5},
8394  	{64, 4},
8395  	{209, 12},
8396  	{209, 12},
8397  	{209, 12},
8398  	{148, 6},
8399  	{209, 12},
8400  	{151, 6},
8401  	{163, 6},
8402  	{66, 6},
8403  	{209, 12},
8404  	{154, 6},
8405  	{166, 6},
8406  	{68, 6},
8407  	{178, 6},
8408  	{74, 6},
8409  	{92, 6},
8410  	{64, 4},
8411  	{209, 12},
8412  	{157, 6},
8413  	{169, 6},
8414  	{70, 6},
8415  	{181, 6},
8416  	{76, 6},
8417  	{94, 6},
8418  	{65, 5},
8419  	{193, 6},
8420  	{82, 6},
8421  	{100, 6},
8422  	{67, 5},
8423  	{118, 6},
8424  	{73, 5},
8425  	{91, 5},
8426  	{0, 6},
8427  	{209, 12},
8428  	{209, 12},
8429  	{209, 12},
8430  	{209, 12},
8431  	{192, 11},
8432  	{152, 7},
8433  	{164, 7},
8434  	{145, 3},
8435  	{204, 11},
8436  	{155, 7},
8437  	{167, 7},
8438  	{69, 7},
8439  	{179, 7},
8440  	{75, 7},
8441  	{93, 7},
8442  	{64, 4},
8443  	{209, 12},
8444  	{158, 7},
8445  	{170, 7},
8446  	{71, 7},
8447  	{182, 7},
8448  	{77, 7},
8449  	{95, 7},
8450  	{65, 5},
8451  	{194, 7},
8452  	{83, 7},
8453  	{101, 7},
8454  	{67, 5},
8455  	{119, 7},
8456  	{73, 5},
8457  	{91, 5},
8458  	{1, 7},
8459  	{209, 12},
8460  	{209, 12},
8461  	{173, 7},
8462  	{148, 6},
8463  	{185, 7},
8464  	{79, 7},
8465  	{97, 7},
8466  	{66, 6},
8467  	{197, 7},
8468  	{85, 7},
8469  	{103, 7},
8470  	{68, 6},
8471  	{121, 7},
8472  	{74, 6},
8473  	{92, 6},
8474  	{2, 7},
8475  	{209, 12},
8476  	{157, 6},
8477  	{109, 7},
8478  	{70, 6},
8479  	{127, 7},
8480  	{76, 6},
8481  	{94, 6},
8482  	{4, 7},
8483  	{193, 6},
8484  	{82, 6},
8485  	{100, 6},
8486  	{8, 7},
8487  	{118, 6},
8488  	{16, 7},
8489  	{32, 7},
8490  	{0, 6},
8491  	{209, 12},
8492  	{209, 12},
8493  	{209, 12},
8494  	{209, 12},
8495  	{209, 12},
8496  	{209, 12},
8497  	{209, 12},
8498  	{145, 3},
8499  	{207, 11},
8500  	{156, 8},
8501  	{168, 8},
8502  	{146, 4},
8503  	{180, 8},
8504  	{149, 4},
8505  	{161, 4},
8506  	{64, 4},
8507  	{209, 12},
8508  	{159, 8},
8509  	{117, 11},
8510  	{72, 8},
8511  	{135, 11},
8512  	{78, 8},
8513  	{96, 8},
8514  	{65, 5},
8515  	{195, 8},
8516  	{84, 8},
8517  	{102, 8},
8518  	{67, 5},
8519  	{120, 8},
8520  	{73, 5},
8521  	{91, 5},
8522  	{64, 4},
8523  	{209, 12},
8524  	{209, 12},
8525  	{174, 8},
8526  	{148, 6},
8527  	{141, 11},
8528  	{80, 8},
8529  	{98, 8},
8530  	{66, 6},
8531  	{198, 8},
8532  	{86, 8},
8533  	{104, 8},
8534  	{68, 6},
8535  	{122, 8},
8536  	{74, 6},
8537  	{92, 6},
8538  	{3, 8},
8539  	{209, 12},
8540  	{157, 6},
8541  	{110, 8},
8542  	{70, 6},
8543  	{128, 8},
8544  	{76, 6},
8545  	{94, 6},
8546  	{5, 8},
8547  	{193, 6},
8548  	{82, 6},
8549  	{100, 6},
8550  	{9, 8},
8551  	{118, 6},
8552  	{17, 8},
8553  	{33, 8},
8554  	{0, 6},
8555  	{209, 12},
8556  	{209, 12},
8557  	{209, 12},
8558  	{209, 12},
8559  	{189, 8},
8560  	{152, 7},
8561  	{164, 7},
8562  	{145, 3},
8563  	{201, 8},
8564  	{88, 8},
8565  	{106, 8},
8566  	{69, 7},
8567  	{124, 8},
8568  	{75, 7},
8569  	{93, 7},
8570  	{64, 4},
8571  	{209, 12},
8572  	{158, 7},
8573  	{112, 8},
8574  	{71, 7},
8575  	{130, 8},
8576  	{77, 7},
8577  	{95, 7},
8578  	{6, 8},
8579  	{194, 7},
8580  	{83, 7},
8581  	{101, 7},
8582  	{10, 8},
8583  	{119, 7},
8584  	{18, 8},
8585  	{34, 8},
8586  	{1, 7},
8587  	{209, 12},
8588  	{209, 12},
8589  	{173, 7},
8590  	{148, 6},
8591  	{136, 8},
8592  	{79, 7},
8593  	{97, 7},
8594  	{66, 6},
8595  	{197, 7},
8596  	{85, 7},
8597  	{103, 7},
8598  	{12, 8},
8599  	{121, 7},
8600  	{20, 8},
8601  	{36, 8},
8602  	{2, 7},
8603  	{209, 12},
8604  	{157, 6},
8605  	{109, 7},
8606  	{70, 6},
8607  	{127, 7},
8608  	{24, 8},
8609  	{40, 8},
8610  	{4, 7},
8611  	{193, 6},
8612  	{82, 6},
8613  	{48, 8},
8614  	{8, 7},
8615  	{118, 6},
8616  	{16, 7},
8617  	{32, 7},
8618  	{0, 6},
8619  	{209, 12},
8620  	{209, 12},
8621  	{209, 12},
8622  	{209, 12},
8623  	{209, 12},
8624  	{209, 12},
8625  	{209, 12},
8626  	{145, 3},
8627  	{209, 12},
8628  	{209, 12},
8629  	{209, 12},
8630  	{146, 4},
8631  	{209, 12},
8632  	{149, 4},
8633  	{161, 4},
8634  	{64, 4},
8635  	{209, 12},
8636  	{160, 9},
8637  	{172, 9},
8638  	{147, 5},
8639  	{184, 9},
8640  	{150, 5},
8641  	{162, 5},
8642  	{65, 5},
8643  	{196, 9},
8644  	{153, 5},
8645  	{165, 5},
8646  	{67, 5},
8647  	{177, 5},
8648  	{73, 5},
8649  	{91, 5},
8650  	{64, 4},
8651  	{209, 12},
8652  	{209, 12},
8653  	{175, 9},
8654  	{148, 6},
8655  	{143, 11},
8656  	{81, 9},
8657  	{99, 9},
8658  	{66, 6},
8659  	{199, 9},
8660  	{87, 9},
8661  	{105, 9},
8662  	{68, 6},
8663  	{123, 9},
8664  	{74, 6},
8665  	{92, 6},
8666  	{64, 4},
8667  	{209, 12},
8668  	{157, 6},
8669  	{111, 9},
8670  	{70, 6},
8671  	{129, 9},
8672  	{76, 6},
8673  	{94, 6},
8674  	{65, 5},
8675  	{193, 6},
8676  	{82, 6},
8677  	{100, 6},
8678  	{67, 5},
8679  	{118, 6},
8680  	{73, 5},
8681  	{91, 5},
8682  	{0, 6},
8683  	{209, 12},
8684  	{209, 12},
8685  	{209, 12},
8686  	{209, 12},
8687  	{190, 9},
8688  	{152, 7},
8689  	{164, 7},
8690  	{145, 3},
8691  	{202, 9},
8692  	{89, 9},
8693  	{107, 9},
8694  	{69, 7},
8695  	{125, 9},
8696  	{75, 7},
8697  	{93, 7},
8698  	{64, 4},
8699  	{209, 12},
8700  	{158, 7},
8701  	{113, 9},
8702  	{71, 7},
8703  	{131, 9},
8704  	{31, 11},
8705  	{47, 11},
8706  	{7, 9},
8707  	{194, 7},
8708  	{83, 7},
8709  	{55, 11},
8710  	{11, 9},
8711  	{119, 7},
8712  	{19, 9},
8713  	{35, 9},
8714  	{1, 7},
8715  	{209, 12},
8716  	{209, 12},
8717  	{173, 7},
8718  	{148, 6},
8719  	{137, 9},
8720  	{79, 7},
8721  	{97, 7},
8722  	{66, 6},
8723  	{197, 7},
8724  	{85, 7},
8725  	{59, 11},
8726  	{13, 9},
8727  	{121, 7},
8728  	{21, 9},
8729  	{37, 9},
8730  	{2, 7},
8731  	{209, 12},
8732  	{157, 6},
8733  	{109, 7},
8734  	{70, 6},
8735  	{127, 7},
8736  	{25, 9},
8737  	{41, 9},
8738  	{4, 7},
8739  	{193, 6},
8740  	{82, 6},
8741  	{49, 9},
8742  	{8, 7},
8743  	{118, 6},
8744  	{16, 7},
8745  	{32, 7},
8746  	{0, 6},
8747  	{209, 12},
8748  	{209, 12},
8749  	{209, 12},
8750  	{209, 12},
8751  	{209, 12},
8752  	{209, 12},
8753  	{209, 12},
8754  	{145, 3},
8755  	{205, 9},
8756  	{156, 8},
8757  	{168, 8},
8758  	{146, 4},
8759  	{180, 8},
8760  	{149, 4},
8761  	{161, 4},
8762  	{64, 4},
8763  	{209, 12},
8764  	{159, 8},
8765  	{115, 9},
8766  	{72, 8},
8767  	{133, 9},
8768  	{78, 8},
8769  	{96, 8},
8770  	{65, 5},
8771  	{195, 8},
8772  	{84, 8},
8773  	{102, 8},
8774  	{67, 5},
8775  	{120, 8},
8776  	{73, 5},
8777  	{91, 5},
8778  	{64, 4},
8779  	{209, 12},
8780  	{209, 12},
8781  	{174, 8},
8782  	{148, 6},
8783  	{139, 9},
8784  	{80, 8},
8785  	{98, 8},
8786  	{66, 6},
8787  	{198, 8},
8788  	{86, 8},
8789  	{61, 11},
8790  	{14, 9},
8791  	{122, 8},
8792  	{22, 9},
8793  	{38, 9},
8794  	{3, 8},
8795  	{209, 12},
8796  	{157, 6},
8797  	{110, 8},
8798  	{70, 6},
8799  	{128, 8},
8800  	{26, 9},
8801  	{42, 9},
8802  	{5, 8},
8803  	{193, 6},
8804  	{82, 6},
8805  	{50, 9},
8806  	{9, 8},
8807  	{118, 6},
8808  	{17, 8},
8809  	{33, 8},
8810  	{0, 6},
8811  	{209, 12},
8812  	{209, 12},
8813  	{209, 12},
8814  	{209, 12},
8815  	{189, 8},
8816  	{152, 7},
8817  	{164, 7},
8818  	{145, 3},
8819  	{201, 8},
8820  	{88, 8},
8821  	{106, 8},
8822  	{69, 7},
8823  	{124, 8},
8824  	{75, 7},
8825  	{93, 7},
8826  	{64, 4},
8827  	{209, 12},
8828  	{158, 7},
8829  	{112, 8},
8830  	{71, 7},
8831  	{130, 8},
8832  	{28, 9},
8833  	{44, 9},
8834  	{6, 8},
8835  	{194, 7},
8836  	{83, 7},
8837  	{52, 9},
8838  	{10, 8},
8839  	{119, 7},
8840  	{18, 8},
8841  	{34, 8},
8842  	{1, 7},
8843  	{209, 12},
8844  	{209, 12},
8845  	{173, 7},
8846  	{148, 6},
8847  	{136, 8},
8848  	{79, 7},
8849  	{97, 7},
8850  	{66, 6},
8851  	{197, 7},
8852  	{85, 7},
8853  	{56, 9},
8854  	{12, 8},
8855  	{121, 7},
8856  	{20, 8},
8857  	{36, 8},
8858  	{2, 7},
8859  	{209, 12},
8860  	{157, 6},
8861  	{109, 7},
8862  	{70, 6},
8863  	{127, 7},
8864  	{24, 8},
8865  	{40, 8},
8866  	{4, 7},
8867  	{193, 6},
8868  	{82, 6},
8869  	{48, 8},
8870  	{8, 7},
8871  	{118, 6},
8872  	{16, 7},
8873  	{32, 7},
8874  	{0, 6},
8875  	{209, 12},
8876  	{209, 12},
8877  	{209, 12},
8878  	{209, 12},
8879  	{209, 12},
8880  	{209, 12},
8881  	{209, 12},
8882  	{145, 3},
8883  	{209, 12},
8884  	{209, 12},
8885  	{209, 12},
8886  	{146, 4},
8887  	{209, 12},
8888  	{149, 4},
8889  	{161, 4},
8890  	{64, 4},
8891  	{209, 12},
8892  	{209, 12},
8893  	{209, 12},
8894  	{147, 5},
8895  	{209, 12},
8896  	{150, 5},
8897  	{162, 5},
8898  	{65, 5},
8899  	{209, 12},
8900  	{153, 5},
8901  	{165, 5},
8902  	{67, 5},
8903  	{177, 5},
8904  	{73, 5},
8905  	{91, 5},
8906  	{64, 4},
8907  	{209, 12},
8908  	{209, 12},
8909  	{176, 10},
8910  	{148, 6},
8911  	{188, 10},
8912  	{151, 6},
8913  	{163, 6},
8914  	{66, 6},
8915  	{200, 10},
8916  	{154, 6},
8917  	{166, 6},
8918  	{68, 6},
8919  	{178, 6},
8920  	{74, 6},
8921  	{92, 6},
8922  	{64, 4},
8923  	{209, 12},
8924  	{157, 6},
8925  	{169, 6},
8926  	{70, 6},
8927  	{181, 6},
8928  	{76, 6},
8929  	{94, 6},
8930  	{65, 5},
8931  	{193, 6},
8932  	{82, 6},
8933  	{100, 6},
8934  	{67, 5},
8935  	{118, 6},
8936  	{73, 5},
8937  	{91, 5},
8938  	{0, 6},
8939  	{209, 12},
8940  	{209, 12},
8941  	{209, 12},
8942  	{209, 12},
8943  	{191, 10},
8944  	{152, 7},
8945  	{164, 7},
8946  	{145, 3},
8947  	{203, 10},
8948  	{90, 10},
8949  	{108, 10},
8950  	{69, 7},
8951  	{126, 10},
8952  	{75, 7},
8953  	{93, 7},
8954  	{64, 4},
8955  	{209, 12},
8956  	{158, 7},
8957  	{114, 10},
8958  	{71, 7},
8959  	{132, 10},
8960  	{77, 7},
8961  	{95, 7},
8962  	{65, 5},
8963  	{194, 7},
8964  	{83, 7},
8965  	{101, 7},
8966  	{67, 5},
8967  	{119, 7},
8968  	{73, 5},
8969  	{91, 5},
8970  	{1, 7},
8971  	{209, 12},
8972  	{209, 12},
8973  	{173, 7},
8974  	{148, 6},
8975  	{138, 10},
8976  	{79, 7},
8977  	{97, 7},
8978  	{66, 6},
8979  	{197, 7},
8980  	{85, 7},
8981  	{103, 7},
8982  	{68, 6},
8983  	{121, 7},
8984  	{74, 6},
8985  	{92, 6},
8986  	{2, 7},
8987  	{209, 12},
8988  	{157, 6},
8989  	{109, 7},
8990  	{70, 6},
8991  	{127, 7},
8992  	{76, 6},
8993  	{94, 6},
8994  	{4, 7},
8995  	{193, 6},
8996  	{82, 6},
8997  	{100, 6},
8998  	{8, 7},
8999  	{118, 6},
9000  	{16, 7},
9001  	{32, 7},
9002  	{0, 6},
9003  	{209, 12},
9004  	{209, 12},
9005  	{209, 12},
9006  	{209, 12},
9007  	{209, 12},
9008  	{209, 12},
9009  	{209, 12},
9010  	{145, 3},
9011  	{206, 10},
9012  	{156, 8},
9013  	{168, 8},
9014  	{146, 4},
9015  	{180, 8},
9016  	{149, 4},
9017  	{161, 4},
9018  	{64, 4},
9019  	{209, 12},
9020  	{159, 8},
9021  	{116, 10},
9022  	{72, 8},
9023  	{134, 10},
9024  	{78, 8},
9025  	{96, 8},
9026  	{65, 5},
9027  	{195, 8},
9028  	{84, 8},
9029  	{102, 8},
9030  	{67, 5},
9031  	{120, 8},
9032  	{73, 5},
9033  	{91, 5},
9034  	{64, 4},
9035  	{209, 12},
9036  	{209, 12},
9037  	{174, 8},
9038  	{148, 6},
9039  	{140, 10},
9040  	{80, 8},
9041  	{98, 8},
9042  	{66, 6},
9043  	{198, 8},
9044  	{86, 8},
9045  	{62, 11},
9046  	{15, 10},
9047  	{122, 8},
9048  	{23, 10},
9049  	{39, 10},
9050  	{3, 8},
9051  	{209, 12},
9052  	{157, 6},
9053  	{110, 8},
9054  	{70, 6},
9055  	{128, 8},
9056  	{27, 10},
9057  	{43, 10},
9058  	{5, 8},
9059  	{193, 6},
9060  	{82, 6},
9061  	{51, 10},
9062  	{9, 8},
9063  	{118, 6},
9064  	{17, 8},
9065  	{33, 8},
9066  	{0, 6},
9067  	{209, 12},
9068  	{209, 12},
9069  	{209, 12},
9070  	{209, 12},
9071  	{189, 8},
9072  	{152, 7},
9073  	{164, 7},
9074  	{145, 3},
9075  	{201, 8},
9076  	{88, 8},
9077  	{106, 8},
9078  	{69, 7},
9079  	{124, 8},
9080  	{75, 7},
9081  	{93, 7},
9082  	{64, 4},
9083  	{209, 12},
9084  	{158, 7},
9085  	{112, 8},
9086  	{71, 7},
9087  	{130, 8},
9088  	{29, 10},
9089  	{45, 10},
9090  	{6, 8},
9091  	{194, 7},
9092  	{83, 7},
9093  	{53, 10},
9094  	{10, 8},
9095  	{119, 7},
9096  	{18, 8},
9097  	{34, 8},
9098  	{1, 7},
9099  	{209, 12},
9100  	{209, 12},
9101  	{173, 7},
9102  	{148, 6},
9103  	{136, 8},
9104  	{79, 7},
9105  	{97, 7},
9106  	{66, 6},
9107  	{197, 7},
9108  	{85, 7},
9109  	{57, 10},
9110  	{12, 8},
9111  	{121, 7},
9112  	{20, 8},
9113  	{36, 8},
9114  	{2, 7},
9115  	{209, 12},
9116  	{157, 6},
9117  	{109, 7},
9118  	{70, 6},
9119  	{127, 7},
9120  	{24, 8},
9121  	{40, 8},
9122  	{4, 7},
9123  	{193, 6},
9124  	{82, 6},
9125  	{48, 8},
9126  	{8, 7},
9127  	{118, 6},
9128  	{16, 7},
9129  	{32, 7},
9130  	{0, 6},
9131  	{209, 12},
9132  	{209, 12},
9133  	{209, 12},
9134  	{209, 12},
9135  	{209, 12},
9136  	{209, 12},
9137  	{209, 12},
9138  	{145, 3},
9139  	{209, 12},
9140  	{209, 12},
9141  	{209, 12},
9142  	{146, 4},
9143  	{209, 12},
9144  	{149, 4},
9145  	{161, 4},
9146  	{64, 4},
9147  	{209, 12},
9148  	{160, 9},
9149  	{172, 9},
9150  	{147, 5},
9151  	{184, 9},
9152  	{150, 5},
9153  	{162, 5},
9154  	{65, 5},
9155  	{196, 9},
9156  	{153, 5},
9157  	{165, 5},
9158  	{67, 5},
9159  	{177, 5},
9160  	{73, 5},
9161  	{91, 5},
9162  	{64, 4},
9163  	{209, 12},
9164  	{209, 12},
9165  	{175, 9},
9166  	{148, 6},
9167  	{142, 10},
9168  	{81, 9},
9169  	{99, 9},
9170  	{66, 6},
9171  	{199, 9},
9172  	{87, 9},
9173  	{105, 9},
9174  	{68, 6},
9175  	{123, 9},
9176  	{74, 6},
9177  	{92, 6},
9178  	{64, 4},
9179  	{209, 12},
9180  	{157, 6},
9181  	{111, 9},
9182  	{70, 6},
9183  	{129, 9},
9184  	{76, 6},
9185  	{94, 6},
9186  	{65, 5},
9187  	{193, 6},
9188  	{82, 6},
9189  	{100, 6},
9190  	{67, 5},
9191  	{118, 6},
9192  	{73, 5},
9193  	{91, 5},
9194  	{0, 6},
9195  	{209, 12},
9196  	{209, 12},
9197  	{209, 12},
9198  	{209, 12},
9199  	{190, 9},
9200  	{152, 7},
9201  	{164, 7},
9202  	{145, 3},
9203  	{202, 9},
9204  	{89, 9},
9205  	{107, 9},
9206  	{69, 7},
9207  	{125, 9},
9208  	{75, 7},
9209  	{93, 7},
9210  	{64, 4},
9211  	{209, 12},
9212  	{158, 7},
9213  	{113, 9},
9214  	{71, 7},
9215  	{131, 9},
9216  	{30, 10},
9217  	{46, 10},
9218  	{7, 9},
9219  	{194, 7},
9220  	{83, 7},
9221  	{54, 10},
9222  	{11, 9},
9223  	{119, 7},
9224  	{19, 9},
9225  	{35, 9},
9226  	{1, 7},
9227  	{209, 12},
9228  	{209, 12},
9229  	{173, 7},
9230  	{148, 6},
9231  	{137, 9},
9232  	{79, 7},
9233  	{97, 7},
9234  	{66, 6},
9235  	{197, 7},
9236  	{85, 7},
9237  	{58, 10},
9238  	{13, 9},
9239  	{121, 7},
9240  	{21, 9},
9241  	{37, 9},
9242  	{2, 7},
9243  	{209, 12},
9244  	{157, 6},
9245  	{109, 7},
9246  	{70, 6},
9247  	{127, 7},
9248  	{25, 9},
9249  	{41, 9},
9250  	{4, 7},
9251  	{193, 6},
9252  	{82, 6},
9253  	{49, 9},
9254  	{8, 7},
9255  	{118, 6},
9256  	{16, 7},
9257  	{32, 7},
9258  	{0, 6},
9259  	{209, 12},
9260  	{209, 12},
9261  	{209, 12},
9262  	{209, 12},
9263  	{209, 12},
9264  	{209, 12},
9265  	{209, 12},
9266  	{145, 3},
9267  	{205, 9},
9268  	{156, 8},
9269  	{168, 8},
9270  	{146, 4},
9271  	{180, 8},
9272  	{149, 4},
9273  	{161, 4},
9274  	{64, 4},
9275  	{209, 12},
9276  	{159, 8},
9277  	{115, 9},
9278  	{72, 8},
9279  	{133, 9},
9280  	{78, 8},
9281  	{96, 8},
9282  	{65, 5},
9283  	{195, 8},
9284  	{84, 8},
9285  	{102, 8},
9286  	{67, 5},
9287  	{120, 8},
9288  	{73, 5},
9289  	{91, 5},
9290  	{64, 4},
9291  	{209, 12},
9292  	{209, 12},
9293  	{174, 8},
9294  	{148, 6},
9295  	{139, 9},
9296  	{80, 8},
9297  	{98, 8},
9298  	{66, 6},
9299  	{198, 8},
9300  	{86, 8},
9301  	{60, 10},
9302  	{14, 9},
9303  	{122, 8},
9304  	{22, 9},
9305  	{38, 9},
9306  	{3, 8},
9307  	{209, 12},
9308  	{157, 6},
9309  	{110, 8},
9310  	{70, 6},
9311  	{128, 8},
9312  	{26, 9},
9313  	{42, 9},
9314  	{5, 8},
9315  	{193, 6},
9316  	{82, 6},
9317  	{50, 9},
9318  	{9, 8},
9319  	{118, 6},
9320  	{17, 8},
9321  	{33, 8},
9322  	{0, 6},
9323  	{209, 12},
9324  	{209, 12},
9325  	{209, 12},
9326  	{209, 12},
9327  	{189, 8},
9328  	{152, 7},
9329  	{164, 7},
9330  	{145, 3},
9331  	{201, 8},
9332  	{88, 8},
9333  	{106, 8},
9334  	{69, 7},
9335  	{124, 8},
9336  	{75, 7},
9337  	{93, 7},
9338  	{64, 4},
9339  	{209, 12},
9340  	{158, 7},
9341  	{112, 8},
9342  	{71, 7},
9343  	{130, 8},
9344  	{28, 9},
9345  	{44, 9},
9346  	{6, 8},
9347  	{194, 7},
9348  	{83, 7},
9349  	{52, 9},
9350  	{10, 8},
9351  	{119, 7},
9352  	{18, 8},
9353  	{34, 8},
9354  	{1, 7},
9355  	{209, 12},
9356  	{209, 12},
9357  	{173, 7},
9358  	{148, 6},
9359  	{136, 8},
9360  	{79, 7},
9361  	{97, 7},
9362  	{66, 6},
9363  	{197, 7},
9364  	{85, 7},
9365  	{56, 9},
9366  	{12, 8},
9367  	{121, 7},
9368  	{20, 8},
9369  	{36, 8},
9370  	{2, 7},
9371  	{209, 12},
9372  	{157, 6},
9373  	{109, 7},
9374  	{70, 6},
9375  	{127, 7},
9376  	{24, 8},
9377  	{40, 8},
9378  	{4, 7},
9379  	{193, 6},
9380  	{82, 6},
9381  	{48, 8},
9382  	{8, 7},
9383  	{118, 6},
9384  	{16, 7},
9385  	{32, 7},
9386  	{0, 6}};
9387 } // utf8_to_utf16 namespace
9388 } // tables namespace
9389 } // unnamed namespace
9390 } // namespace simdutf
9391 
9392 #endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
9393 /* end file src/tables/utf8_to_utf16_tables.h */
9394 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h
9395 /* begin file src/tables/utf16_to_utf8_tables.h */
9396 // file generated by scripts/sse_convert_utf16_to_utf8.py
9397 #ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
9398 #define SIMDUTF_UTF16_TO_UTF8_TABLES_H
9399 
9400 namespace simdutf {
9401 namespace {
9402 namespace tables {
9403 namespace utf16_to_utf8 {
9404 
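9405   // Each row is 17 bytes: 1 byte for length (the number of meaningful mask bytes), then 16 bytes for mask, with unused trailing slots set to 0x80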
9406   const uint8_t pack_1_2_utf8_bytes[256][17] = {
9407     {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
9408     {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
9409     {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80},
9410     {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
9411     {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
9412     {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
9413     {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
9414     {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
9415     {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80},
9416     {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
9417     {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80},
9418     {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
9419     {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
9420     {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
9421     {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
9422     {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9423     {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80},
9424     {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
9425     {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
9426     {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
9427     {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
9428     {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
9429     {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
9430     {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
9431     {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
9432     {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
9433     {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
9434     {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9435     {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
9436     {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9437     {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9438     {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
9439     {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80},
9440     {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
9441     {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80},
9442     {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
9443     {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
9444     {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
9445     {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
9446     {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9447     {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80},
9448     {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
9449     {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80},
9450     {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
9451     {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
9452     {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
9453     {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
9454     {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9455     {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
9456     {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
9457     {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
9458     {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9459     {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
9460     {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9461     {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9462     {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9463     {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
9464     {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
9465     {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
9466     {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9467     {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
9468     {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9469     {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9470     {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
9471     {15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80},
9472     {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
9473     {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80},
9474     {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
9475     {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
9476     {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
9477     {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
9478     {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
9479     {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80},
9480     {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
9481     {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80},
9482     {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9483     {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
9484     {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9485     {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9486     {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
9487     {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
9488     {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
9489     {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
9490     {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
9491     {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
9492     {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
9493     {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
9494     {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
9495     {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
9496     {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9497     {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9498     {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
9499     {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
9500     {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
9501     {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
9502     {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
9503     {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80},
9504     {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
9505     {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80},
9506     {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9507     {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
9508     {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9509     {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9510     {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9511     {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80},
9512     {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
9513     {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
9514     {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9515     {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
9516     {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9517     {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9518     {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
9519     {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
9520     {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9521     {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9522     {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9523     {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
9524     {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9525     {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9526     {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
9527     {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
9528     {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9529     {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9530     {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
9531     {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
9532     {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
9533     {10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
9534     {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9535     {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80},
9536     {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
9537     {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80},
9538     {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
9539     {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
9540     {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
9541     {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
9542     {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9543     {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80},
9544     {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
9545     {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80},
9546     {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
9547     {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
9548     {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
9549     {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
9550     {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9551     {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
9552     {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
9553     {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
9554     {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9555     {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
9556     {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9557     {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9558     {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9559     {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
9560     {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
9561     {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
9562     {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9563     {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
9564     {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9565     {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9566     {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9567     {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80},
9568     {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
9569     {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80},
9570     {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
9571     {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
9572     {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
9573     {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
9574     {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9575     {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80},
9576     {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
9577     {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80},
9578     {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9579     {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
9580     {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9581     {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9582     {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9583     {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
9584     {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
9585     {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
9586     {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9587     {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
9588     {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9589     {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9590     {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9591     {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
9592     {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9593     {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9594     {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9595     {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9596     {10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9597     {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9598     {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9599     {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80},
9600     {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
9601     {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80},
9602     {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9603     {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
9604     {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9605     {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9606     {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9607     {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80},
9608     {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
9609     {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
9610     {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9611     {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
9612     {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9613     {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9614     {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9615     {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
9616     {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9617     {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9618     {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9619     {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
9620     {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9621     {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9622     {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9623     {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
9624     {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9625     {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9626     {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9627     {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
9628     {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9629     {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9630     {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9631     {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80},
9632     {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
9633     {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
9634     {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9635     {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
9636     {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9637     {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9638     {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9639     {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
9640     {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9641     {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9642     {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9643     {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9644     {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9645     {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9646     {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9647     {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
9648     {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9649     {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9650     {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9651     {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
9652     {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9653     {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9654     {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9655     {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
9656     {10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9657     {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9658     {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9659     {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
9660     {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9661     {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9662     {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
9663   };
9664 
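  // Each of the 256 rows is 17 bytes: row[0] is the length (the number of mask
  // entries that precede the 0x80 padding), row[1..16] are the 16 mask bytes.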
9666   const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
9667     {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80},
9668     {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9669     {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
9670     {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9671     {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9672     {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9673     {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9674     {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9675     {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
9676     {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9677     {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9678     {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9679     {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9680     {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9681     {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9682     {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9683     {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9684     {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9685     {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9686     {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9687     {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9688     {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9689     {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9690     {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9691     {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9692     {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9693     {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9694     {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9695     {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9696     {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9697     {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9698     {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9699     {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
9700     {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9701     {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9702     {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9703     {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9704     {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9705     {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9706     {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9707     {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9708     {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9709     {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9710     {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9711     {9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9712     {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9713     {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9714     {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9715     {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9716     {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9717     {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9718     {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9719     {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9720     {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9721     {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9722     {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9723     {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9724     {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9725     {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9726     {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9727     {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9728     {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9729     {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9730     {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9731     {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9732     {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9733     {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9734     {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9735     {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9736     {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9737     {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9738     {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9739     {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9740     {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9741     {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9742     {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9743     {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9744     {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9745     {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9746     {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9747     {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9748     {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9749     {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9750     {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9751     {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9752     {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9753     {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9754     {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9755     {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9756     {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9757     {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9758     {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9759     {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9760     {1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9761     {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9762     {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9763     {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9764     {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9765     {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9766     {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9767     {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9768     {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9769     {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9770     {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9771     {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9772     {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9773     {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9774     {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9775     {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9776     {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9777     {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9778     {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9779     {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9780     {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9781     {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9782     {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9783     {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9784     {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9785     {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9786     {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9787     {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9788     {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9789     {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9790     {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9791     {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9792     {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9793     {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9794     {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9795     {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80},
9796     {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9797     {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9798     {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9799     {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9800     {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9801     {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9802     {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9803     {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9804     {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9805     {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9806     {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9807     {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9808     {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9809     {8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9810     {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9811     {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9812     {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9813     {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9814     {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9815     {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9816     {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9817     {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9818     {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9819     {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9820     {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9821     {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9822     {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9823     {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9824     {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9825     {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9826     {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9827     {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
9828     {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9829     {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9830     {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9831     {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9832     {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9833     {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9834     {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9835     {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9836     {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9837     {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9838     {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9839     {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9840     {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9841     {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9842     {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9843     {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9844     {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9845     {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9846     {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9847     {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9848     {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9849     {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9850     {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9851     {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9852     {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9853     {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9854     {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9855     {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9856     {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9857     {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9858     {5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9859     {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80},
9860     {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9861     {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9862     {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9863     {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9864     {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9865     {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9866     {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9867     {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9868     {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9869     {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9870     {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9871     {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9872     {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9873     {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9874     {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9875     {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9876     {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9877     {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9878     {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9879     {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9880     {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9881     {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9882     {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9883     {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9884     {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9885     {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9886     {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9887     {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9888     {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9889     {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9890     {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9891     {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9892     {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9893     {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9894     {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9895     {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9896     {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9897     {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9898     {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9899     {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9900     {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9901     {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9902     {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9903     {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9904     {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9905     {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9906     {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9907     {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9908     {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9909     {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9910     {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9911     {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9912     {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9913     {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9914     {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9915     {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9916     {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9917     {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9918     {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9919     {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9920     {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9921     {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
9922     {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
9923   };
9924 
9925 } // utf16_to_utf8 namespace
9926 } // tables namespace
9927 } // unnamed namespace
9928 } // namespace simdutf
9929 
9930 #endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
9931 /* end file src/tables/utf16_to_utf8_tables.h */
9932 // End of tables.
9933 
9934 // The scalar routines should be included once.
9935 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/ascii.h
9936 /* begin file src/scalar/ascii.h */
9937 #ifndef SIMDUTF_ASCII_H
9938 #define SIMDUTF_ASCII_H
9939 
9940 namespace simdutf {
9941 namespace scalar {
9942 namespace {
9943 namespace ascii {
9944 #if SIMDUTF_IMPLEMENTATION_FALLBACK
9945 // Only used by the fallback kernel.
validate(const char * buf,size_t len)9946 inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
9947     const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
9948     uint64_t pos = 0;
9949     // process in blocks of 16 bytes when possible
9950     for (;pos + 16 < len; pos += 16) {
9951         uint64_t v1;
9952         std::memcpy(&v1, data + pos, sizeof(uint64_t));
9953         uint64_t v2;
9954         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
9955         uint64_t v{v1 | v2};
9956         if ((v & 0x8080808080808080) != 0) { return false; }
9957     }
9958     // process the tail byte-by-byte
9959     for (;pos < len; pos ++) {
9960         if (data[pos] >= 0b10000000) { return false; }
9961     }
9962     return true;
9963 }
9964 #endif
9965 
validate_with_errors(const char * buf,size_t len)9966 inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
9967     const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
9968     size_t pos = 0;
9969     // process in blocks of 16 bytes when possible
9970     for (;pos + 16 < len; pos += 16) {
9971         uint64_t v1;
9972         std::memcpy(&v1, data + pos, sizeof(uint64_t));
9973         uint64_t v2;
9974         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
9975         uint64_t v{v1 | v2};
9976         if ((v & 0x8080808080808080) != 0) {
9977             for (;pos < len; pos ++) {
9978                 if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
9979             }
9980         }
9981     }
9982     // process the tail byte-by-byte
9983     for (;pos < len; pos ++) {
9984         if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
9985     }
9986     return result(error_code::SUCCESS, pos);
9987 }
9988 
9989 } // ascii namespace
9990 } // unnamed namespace
9991 } // namespace scalar
9992 } // namespace simdutf
9993 
9994 #endif
9995 /* end file src/scalar/ascii.h */
9996 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8.h
9997 /* begin file src/scalar/utf8.h */
9998 #ifndef SIMDUTF_UTF8_H
9999 #define SIMDUTF_UTF8_H
10000 
10001 namespace simdutf {
10002 namespace scalar {
10003 namespace {
10004 namespace utf8 {
10005 #if SIMDUTF_IMPLEMENTATION_FALLBACK
10006 // only used by the fallback kernel.
10007 // credit: based on code from Google Fuchsia (Apache Licensed)
validate(const char * buf,size_t len)10008 inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
10009   const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
10010   uint64_t pos = 0;
10011   uint32_t code_point = 0;
10012   while (pos < len) {
10013     // check of the next 8 bytes are ascii.
10014     uint64_t next_pos = pos + 16;
10015     if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
10016       uint64_t v1;
10017       std::memcpy(&v1, data + pos, sizeof(uint64_t));
10018       uint64_t v2;
10019       std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
10020       uint64_t v{v1 | v2};
10021       if ((v & 0x8080808080808080) == 0) {
10022         pos = next_pos;
10023         continue;
10024       }
10025     }
10026     unsigned char byte = data[pos];
10027 
10028     while (byte < 0b10000000) {
10029       if (++pos == len) { return true; }
10030       byte = data[pos];
10031     }
10032 
10033     if ((byte & 0b11100000) == 0b11000000) {
10034       next_pos = pos + 2;
10035       if (next_pos > len) { return false; }
10036       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
10037       // range check
10038       code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
10039       if ((code_point < 0x80) || (0x7ff < code_point)) { return false; }
10040     } else if ((byte & 0b11110000) == 0b11100000) {
10041       next_pos = pos + 3;
10042       if (next_pos > len) { return false; }
10043       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
10044       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
10045       // range check
10046       code_point = (byte & 0b00001111) << 12 |
10047                    (data[pos + 1] & 0b00111111) << 6 |
10048                    (data[pos + 2] & 0b00111111);
10049       if ((code_point < 0x800) || (0xffff < code_point) ||
10050           (0xd7ff < code_point && code_point < 0xe000)) {
10051         return false;
10052       }
10053     } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
10054       next_pos = pos + 4;
10055       if (next_pos > len) { return false; }
10056       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
10057       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
10058       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
10059       // range check
10060       code_point =
10061           (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
10062           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
10063       if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
10064     } else {
10065       // we may have a continuation
10066       return false;
10067     }
10068     pos = next_pos;
10069   }
10070   return true;
10071 }
10072 #endif
10073 
validate_with_errors(const char * buf,size_t len)10074 inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
10075   const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
10076   size_t pos = 0;
10077   uint32_t code_point = 0;
10078   while (pos < len) {
10079     // check of the next 8 bytes are ascii.
10080     size_t next_pos = pos + 16;
10081     if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
10082       uint64_t v1;
10083       std::memcpy(&v1, data + pos, sizeof(uint64_t));
10084       uint64_t v2;
10085       std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
10086       uint64_t v{v1 | v2};
10087       if ((v & 0x8080808080808080) == 0) {
10088         pos = next_pos;
10089         continue;
10090       }
10091     }
10092     unsigned char byte = data[pos];
10093 
10094     while (byte < 0b10000000) {
10095       if (++pos == len) { return result(error_code::SUCCESS, len); }
10096       byte = data[pos];
10097     }
10098 
10099     if ((byte & 0b11100000) == 0b11000000) {
10100       next_pos = pos + 2;
10101       if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
10102       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
10103       // range check
10104       code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
10105       if ((code_point < 0x80) || (0x7ff < code_point)) { return result(error_code::OVERLONG, pos); }
10106     } else if ((byte & 0b11110000) == 0b11100000) {
10107       next_pos = pos + 3;
10108       if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
10109       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
10110       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
10111       // range check
10112       code_point = (byte & 0b00001111) << 12 |
10113                    (data[pos + 1] & 0b00111111) << 6 |
10114                    (data[pos + 2] & 0b00111111);
10115       if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
10116       if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
10117     } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
10118       next_pos = pos + 4;
10119       if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
10120       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
10121       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
10122       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
10123       // range check
10124       code_point =
10125           (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
10126           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
10127       if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
10128       if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
10129     } else {
10130       // we either have too many continuation bytes or an invalid leading byte
10131       if ((byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
10132       else { return result(error_code::HEADER_BITS, pos); }
10133     }
10134     pos = next_pos;
10135   }
10136   return result(error_code::SUCCESS, len);
10137 }
10138 
10139 // Finds the previous leading byte and validates with errors from there
10140 // Used to pinpoint the location of an error when an invalid chunk is detected
rewind_and_validate_with_errors(const char * buf,size_t len)10141 inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *buf, size_t len) noexcept {
10142   size_t extra_len{0};
10143   // A leading byte cannot be further than 4 bytes away
10144   for(int i = 0; i < 5; i++) {
10145     unsigned char byte = *buf;
10146     if ((byte & 0b11000000) != 0b10000000) {
10147       break;
10148     } else {
10149       buf--;
10150       extra_len++;
10151     }
10152   }
10153 
10154   result res = validate_with_errors(buf, len + extra_len);
10155   res.count -= extra_len;
10156   return res;
10157 }
10158 
// Counts the code points in the `len` bytes at `buf` by counting every byte
// that is not a continuation byte (10xxxxxx). The input is not validated.
inline size_t count_code_points(const char* buf, size_t len) {
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
    size_t total = 0;
    for (size_t index = 0; index != len; ++index) {
        // Anything other than 0x80..0xBF starts a new code point.
        const bool is_continuation = (bytes[index] & 0xC0) == 0x80;
        if (!is_continuation) { ++total; }
    }
    return total;
}
10168 
// Computes how many UTF-16 code units are needed to represent the `len` bytes
// of UTF-8 at `buf`. Every non-continuation byte contributes one unit, and a
// byte >= 0xF0 contributes one more. The input is not validated.
inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
    size_t units = 0;
    for (size_t index = 0; index != len; ++index) {
        const unsigned char byte = bytes[index];
        const bool starts_code_point = (byte & 0xC0) != 0x80;
        const bool needs_second_unit = byte >= 0xF0;
        units += size_t(starts_code_point) + size_t(needs_second_unit);
    }
    return units;
}
10178 
10179 } // utf8 namespace
10180 } // unnamed namespace
10181 } // namespace scalar
10182 } // namespace simdutf
10183 
10184 #endif
10185 /* end file src/scalar/utf8.h */
10186 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16.h
10187 /* begin file src/scalar/utf16.h */
10188 #ifndef SIMDUTF_UTF16_H
10189 #define SIMDUTF_UTF16_H
10190 
10191 namespace simdutf {
10192 namespace scalar {
10193 namespace {
10194 namespace utf16 {
10195 
swap_bytes(const uint16_t word)10196 inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
10197   return uint16_t((word >> 8) | (word << 8));
10198 }
10199 
10200 template <endianness big_endian>
validate(const char16_t * buf,size_t len)10201 inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept {
10202   const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10203   uint64_t pos = 0;
10204   while (pos < len) {
10205     uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
10206     if((word &0xF800) == 0xD800) {
10207         if(pos + 1 >= len) { return false; }
10208         uint16_t diff = uint16_t(word - 0xD800);
10209         if(diff > 0x3FF) { return false; }
10210         uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
10211         uint16_t diff2 = uint16_t(next_word - 0xDC00);
10212         if(diff2 > 0x3FF) { return false; }
10213         pos += 2;
10214     } else {
10215         pos++;
10216     }
10217   }
10218   return true;
10219 }
10220 
10221 template <endianness big_endian>
validate_with_errors(const char16_t * buf,size_t len)10222 inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept {
10223   const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10224   size_t pos = 0;
10225   while (pos < len) {
10226     uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
10227     if((word & 0xF800) == 0xD800) {
10228         if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
10229         uint16_t diff = uint16_t(word - 0xD800);
10230         if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
10231         uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
10232         uint16_t diff2 = uint16_t(next_word - 0xDC00);
10233         if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
10234         pos += 2;
10235     } else {
10236         pos++;
10237     }
10238   }
10239   return result(error_code::SUCCESS, pos);
10240 }
10241 
10242 template <endianness big_endian>
count_code_points(const char16_t * buf,size_t len)10243 inline size_t count_code_points(const char16_t* buf, size_t len) {
10244   // We are not BOM aware.
10245   const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
10246   size_t counter{0};
10247   for(size_t i = 0; i < len; i++) {
10248     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
10249     counter += ((word & 0xFC00) != 0xDC00);
10250   }
10251   return counter;
10252 }
10253 
10254 template <endianness big_endian>
utf8_length_from_utf16(const char16_t * buf,size_t len)10255 inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) {
10256   // We are not BOM aware.
10257   const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
10258   size_t counter{0};
10259   for(size_t i = 0; i < len; i++) {
10260     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
10261     /** ASCII **/
10262     if(word <= 0x7F) { counter++; }
10263     /** two-byte **/
10264     else if (word <= 0x7FF) { counter += 2; }
10265     /** three-byte **/
10266     else if((word <= 0xD7FF) || (word >= 0xE000)) { counter += 3; }
10267     /** surrogates -- 4 bytes **/
10268     else { counter += 2; }
10269   }
10270   return counter;
10271 }
10272 
10273 template <endianness big_endian>
utf32_length_from_utf16(const char16_t * buf,size_t len)10274 inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) {
10275   // We are not BOM aware.
10276   const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
10277   size_t counter{0};
10278   for(size_t i = 0; i < len; i++) {
10279     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
10280     counter += ((word & 0xFC00) != 0xDC00);
10281   }
10282   return counter;
10283 }
10284 
change_endianness_utf16(const char16_t * in,size_t size,char16_t * out)10285 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) {
10286   const uint16_t * input = reinterpret_cast<const uint16_t *>(in);
10287   uint16_t * output = reinterpret_cast<uint16_t *>(out);
10288   for (size_t i = 0; i < size; i++) {
10289     *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
10290   }
10291 }
10292 
10293 } // utf16 namespace
10294 } // unnamed namespace
10295 } // namespace scalar
10296 } // namespace simdutf
10297 
10298 #endif
10299 /* end file src/scalar/utf16.h */
10300 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf32.h
10301 /* begin file src/scalar/utf32.h */
10302 #ifndef SIMDUTF_UTF32_H
10303 #define SIMDUTF_UTF32_H
10304 
10305 namespace simdutf {
10306 namespace scalar {
10307 namespace {
10308 namespace utf32 {
10309 
validate(const char32_t * buf,size_t len)10310 inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept {
10311   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10312   uint64_t pos = 0;
10313   for(;pos < len; pos++) {
10314     uint32_t word = data[pos];
10315     if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
10316         return false;
10317     }
10318   }
10319   return true;
10320 }
10321 
validate_with_errors(const char32_t * buf,size_t len)10322 inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept {
10323   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10324   size_t pos = 0;
10325   for(;pos < len; pos++) {
10326     uint32_t word = data[pos];
10327     if(word > 0x10FFFF) {
10328         return result(error_code::TOO_LARGE, pos);
10329     }
10330     if(word >= 0xD800 && word <= 0xDFFF) {
10331         return result(error_code::SURROGATE, pos);
10332     }
10333   }
10334   return result(error_code::SUCCESS, pos);
10335 }
10336 
// Computes how many UTF-8 bytes are needed to represent the `len` UTF-32
// values at `buf`. The input is not validated: any value above 0xFFFF is
// counted as four bytes.
inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
  // We are not BOM aware.
  const uint32_t *code_points = reinterpret_cast<const uint32_t *>(buf);
  size_t total = 0;
  for (size_t index = 0; index != len; ++index) {
    const uint32_t cp = code_points[index];
    // One byte, plus one more for each encoding-length threshold exceeded.
    total += 1 + size_t(cp > 0x7F) + size_t(cp > 0x7FF) + size_t(cp > 0xFFFF);
  }
  return total;
}
10353 
// Computes how many UTF-16 code units are needed to represent the `len`
// UTF-32 values at `buf`: one for a value up to 0xFFFF, two (a surrogate
// pair) for anything larger. The input is not validated.
inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
  // We are not BOM aware.
  const uint32_t *code_points = reinterpret_cast<const uint32_t *>(buf);
  size_t total = 0;
  for (size_t index = 0; index != len; ++index) {
    total += (code_points[index] > 0xFFFF) ? 2 : 1;
  }
  return total;
}
10366 
10367 } // utf32 namespace
10368 } // unnamed namespace
10369 } // namespace scalar
10370 } // namespace simdutf
10371 
10372 #endif
10373 /* end file src/scalar/utf32.h */
10374 
10375 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h
10376 /* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
10377 #ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
10378 #define SIMDUTF_VALID_UTF32_TO_UTF8_H
10379 
10380 namespace simdutf {
10381 namespace scalar {
10382 namespace {
10383 namespace utf32_to_utf8 {
10384 
10385 #if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
// only used by the fallback and POWER kernel
//
// Converts the `len` UTF-32 values at `buf` to UTF-8, writing the bytes to
// `utf8_output`, and returns the number of bytes written. No validation is
// performed: the encoding length is chosen from the magnitude of each value
// alone. `utf8_output` must be large enough (up to four bytes per value).
inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
  const uint32_t *code_points = reinterpret_cast<const uint32_t *>(buf);
  char *const first_byte = utf8_output;
  size_t index = 0;
  while (index < len) {
    // Fast path: when two values are available, load both as one 64-bit word
    // and emit them directly if both are ASCII.
    if (index + 2 <= len) {
      uint64_t pair;
      ::memcpy(&pair, code_points + index, sizeof(pair));
      if ((pair & 0xFFFFFF80FFFFFF80) == 0) {
        utf8_output[0] = char(buf[index]);
        utf8_output[1] = char(buf[index + 1]);
        utf8_output += 2;
        index += 2;
        continue;
      }
    }

    const uint32_t cp = code_points[index];
    if (cp <= 0x7F) {
      // One byte: 0xxxxxxx
      *utf8_output++ = char(cp);
    } else if (cp <= 0x7FF) {
      // Two bytes: 110xxxxx 10xxxxxx
      *utf8_output++ = char((cp >> 6) | 0xC0);
      *utf8_output++ = char((cp & 0x3F) | 0x80);
    } else if (cp <= 0xFFFF) {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      *utf8_output++ = char((cp >> 12) | 0xE0);
      *utf8_output++ = char(((cp >> 6) & 0x3F) | 0x80);
      *utf8_output++ = char((cp & 0x3F) | 0x80);
    } else {
      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      *utf8_output++ = char((cp >> 18) | 0xF0);
      *utf8_output++ = char(((cp >> 12) & 0x3F) | 0x80);
      *utf8_output++ = char(((cp >> 6) & 0x3F) | 0x80);
      *utf8_output++ = char((cp & 0x3F) | 0x80);
    }
    index++;
  }
  return utf8_output - first_byte;
}
10433 #endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
10434 
10435 } // utf32_to_utf8 namespace
10436 } // unnamed namespace
10437 } // namespace scalar
10438 } // namespace simdutf
10439 
10440 #endif
10441 /* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
10442 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h
10443 /* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
10444 #ifndef SIMDUTF_UTF32_TO_UTF8_H
10445 #define SIMDUTF_UTF32_TO_UTF8_H
10446 
10447 namespace simdutf {
10448 namespace scalar {
10449 namespace {
10450 namespace utf32_to_utf8 {
10451 
// Converts the `len` UTF-32 values at `buf` to UTF-8, writing the bytes to
// `utf8_output`, and returns the number of bytes written.
// Returns 0 as soon as an invalid value is met (a value in 0xD800..0xDFFF or
// above 0x10FFFF); bytes already written for earlier values remain in
// `utf8_output`. `utf8_output` must be large enough (up to four bytes per
// value).
inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
  const uint32_t *code_points = reinterpret_cast<const uint32_t *>(buf);
  char *const first_byte = utf8_output;
  size_t index = 0;
  while (index < len) {
    // Fast path: when two values are available, load both as one 64-bit word
    // and emit them directly if both are ASCII.
    if (index + 2 <= len) {
      uint64_t pair;
      ::memcpy(&pair, code_points + index, sizeof(pair));
      if ((pair & 0xFFFFFF80FFFFFF80) == 0) {
        utf8_output[0] = char(buf[index]);
        utf8_output[1] = char(buf[index + 1]);
        utf8_output += 2;
        index += 2;
        continue;
      }
    }

    const uint32_t cp = code_points[index];
    if (cp <= 0x7F) {
      // One byte: 0xxxxxxx
      *utf8_output++ = char(cp);
    } else if (cp <= 0x7FF) {
      // Two bytes: 110xxxxx 10xxxxxx
      *utf8_output++ = char((cp >> 6) | 0xC0);
      *utf8_output++ = char((cp & 0x3F) | 0x80);
    } else if (cp <= 0xFFFF) {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx -- surrogates are rejected.
      if (0xD800 <= cp && cp <= 0xDFFF) { return 0; }
      *utf8_output++ = char((cp >> 12) | 0xE0);
      *utf8_output++ = char(((cp >> 6) & 0x3F) | 0x80);
      *utf8_output++ = char((cp & 0x3F) | 0x80);
    } else {
      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -- nothing above
      // 0x10FFFF is accepted.
      if (cp > 0x10FFFF) { return 0; }
      *utf8_output++ = char((cp >> 18) | 0xF0);
      *utf8_output++ = char(((cp >> 12) & 0x3F) | 0x80);
      *utf8_output++ = char(((cp >> 6) & 0x3F) | 0x80);
      *utf8_output++ = char((cp & 0x3F) | 0x80);
    }
    index++;
  }
  return utf8_output - first_byte;
}
10500 
convert_with_errors(const char32_t * buf,size_t len,char * utf8_output)10501 inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
10502   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10503   size_t pos = 0;
10504   char* start{utf8_output};
10505   while (pos < len) {
10506     // try to convert the next block of 2 ASCII characters
10507     if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
10508       uint64_t v;
10509       ::memcpy(&v, data + pos, sizeof(uint64_t));
10510       if ((v & 0xFFFFFF80FFFFFF80) == 0) {
10511         *utf8_output++ = char(buf[pos]);
10512 				*utf8_output++ = char(buf[pos+1]);
10513         pos += 2;
10514         continue;
10515       }
10516     }
10517     uint32_t word = data[pos];
10518     if((word & 0xFFFFFF80)==0) {
10519       // will generate one UTF-8 bytes
10520       *utf8_output++ = char(word);
10521       pos++;
10522     } else if((word & 0xFFFFF800)==0) {
10523       // will generate two UTF-8 bytes
10524       // we have 0b110XXXXX 0b10XXXXXX
10525       *utf8_output++ = char((word>>6) | 0b11000000);
10526       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10527       pos++;
10528     } else if((word & 0xFFFF0000)==0) {
10529       // will generate three UTF-8 bytes
10530       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
10531 			if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
10532       *utf8_output++ = char((word>>12) | 0b11100000);
10533       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10534       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10535       pos++;
10536     } else {
10537       // will generate four UTF-8 bytes
10538       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
10539 			if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
10540       *utf8_output++ = char((word>>18) | 0b11110000);
10541       *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10542       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10543       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10544       pos ++;
10545     }
10546   }
10547   return result(error_code::SUCCESS, utf8_output - start);
10548 }
10549 
10550 } // utf32_to_utf8 namespace
10551 } // unnamed namespace
10552 } // namespace scalar
10553 } // namespace simdutf
10554 
10555 #endif
10556 /* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
10557 
10558 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h
10559 /* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
10560 #ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
10561 #define SIMDUTF_VALID_UTF32_TO_UTF16_H
10562 
10563 namespace simdutf {
10564 namespace scalar {
10565 namespace {
10566 namespace utf32_to_utf16 {
10567 
10568 template <endianness big_endian>
convert_valid(const char32_t * buf,size_t len,char16_t * utf16_output)10569 inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) {
10570   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10571   size_t pos = 0;
10572   char16_t* start{utf16_output};
10573   while (pos < len) {
10574     uint32_t word = data[pos];
10575     if((word & 0xFFFF0000)==0) {
10576       // will not generate a surrogate pair
10577       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
10578       pos++;
10579     } else {
10580       // will generate a surrogate pair
10581       word -= 0x10000;
10582       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
10583       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
10584       if (!match_system(big_endian)) {
10585         high_surrogate = utf16::swap_bytes(high_surrogate);
10586         low_surrogate = utf16::swap_bytes(low_surrogate);
10587       }
10588       *utf16_output++ = char16_t(high_surrogate);
10589       *utf16_output++ = char16_t(low_surrogate);
10590       pos++;
10591     }
10592   }
10593   return utf16_output - start;
10594 }
10595 
10596 } // utf32_to_utf16 namespace
10597 } // unnamed namespace
10598 } // namespace scalar
10599 } // namespace simdutf
10600 
10601 #endif
10602 /* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
10603 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h
10604 /* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
10605 #ifndef SIMDUTF_UTF32_TO_UTF16_H
10606 #define SIMDUTF_UTF32_TO_UTF16_H
10607 
10608 namespace simdutf {
10609 namespace scalar {
10610 namespace {
10611 namespace utf32_to_utf16 {
10612 
10613 template <endianness big_endian>
convert(const char32_t * buf,size_t len,char16_t * utf16_output)10614 inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) {
10615   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10616   size_t pos = 0;
10617   char16_t* start{utf16_output};
10618   while (pos < len) {
10619     uint32_t word = data[pos];
10620     if((word & 0xFFFF0000)==0) {
10621       if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
10622       // will not generate a surrogate pair
10623       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
10624     } else {
10625       // will generate a surrogate pair
10626       if (word > 0x10FFFF) { return 0; }
10627       word -= 0x10000;
10628       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
10629       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
10630       if (!match_system(big_endian)) {
10631         high_surrogate = utf16::swap_bytes(high_surrogate);
10632         low_surrogate = utf16::swap_bytes(low_surrogate);
10633       }
10634       *utf16_output++ = char16_t(high_surrogate);
10635       *utf16_output++ = char16_t(low_surrogate);
10636     }
10637     pos++;
10638   }
10639   return utf16_output - start;
10640 }
10641 
10642 template <endianness big_endian>
convert_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output)10643 inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
10644   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10645   size_t pos = 0;
10646   char16_t* start{utf16_output};
10647   while (pos < len) {
10648     uint32_t word = data[pos];
10649     if((word & 0xFFFF0000)==0) {
10650       if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
10651       // will not generate a surrogate pair
10652       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
10653     } else {
10654       // will generate a surrogate pair
10655       if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
10656       word -= 0x10000;
10657       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
10658       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
10659       if (!match_system(big_endian)) {
10660         high_surrogate = utf16::swap_bytes(high_surrogate);
10661         low_surrogate = utf16::swap_bytes(low_surrogate);
10662       }
10663       *utf16_output++ = char16_t(high_surrogate);
10664       *utf16_output++ = char16_t(low_surrogate);
10665     }
10666     pos++;
10667   }
10668   return result(error_code::SUCCESS, utf16_output - start);
10669 }
10670 
10671 } // utf32_to_utf16 namespace
10672 } // unnamed namespace
10673 } // namespace scalar
10674 } // namespace simdutf
10675 
10676 #endif
10677 /* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
10678 
10679 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h
10680 /* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
10681 #ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
10682 #define SIMDUTF_VALID_UTF16_TO_UTF8_H
10683 
10684 namespace simdutf {
10685 namespace scalar {
10686 namespace {
10687 namespace utf16_to_utf8 {
10688 
10689 template <endianness big_endian>
convert_valid(const char16_t * buf,size_t len,char * utf8_output)10690 inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) {
10691  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10692   size_t pos = 0;
10693   char* start{utf8_output};
10694   while (pos < len) {
10695     // try to convert the next block of 4 ASCII characters
10696     if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
10697       uint64_t v;
10698       ::memcpy(&v, data + pos, sizeof(uint64_t));
10699       if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
10700       if ((v & 0xFF80FF80FF80FF80) == 0) {
10701         size_t final_pos = pos + 4;
10702         while(pos < final_pos) {
10703           *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
10704           pos++;
10705         }
10706         continue;
10707       }
10708     }
10709 
10710     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
10711     if((word & 0xFF80)==0) {
10712       // will generate one UTF-8 bytes
10713       *utf8_output++ = char(word);
10714       pos++;
10715     } else if((word & 0xF800)==0) {
10716       // will generate two UTF-8 bytes
10717       // we have 0b110XXXXX 0b10XXXXXX
10718       *utf8_output++ = char((word>>6) | 0b11000000);
10719       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10720       pos++;
10721     } else if((word &0xF800 ) != 0xD800) {
10722       // will generate three UTF-8 bytes
10723       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
10724       *utf8_output++ = char((word>>12) | 0b11100000);
10725       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10726       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10727       pos++;
10728     } else {
10729       // must be a surrogate pair
10730       uint16_t diff = uint16_t(word - 0xD800);
10731       if(pos + 1 >= len) { return 0; } // minimal bound checking
10732       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
10733       uint16_t diff2 = uint16_t(next_word - 0xDC00);
10734       uint32_t value = (diff << 10) + diff2 + 0x10000;
10735       // will generate four UTF-8 bytes
10736       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
10737       *utf8_output++ = char((value>>18) | 0b11110000);
10738       *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
10739       *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
10740       *utf8_output++ = char((value & 0b111111) | 0b10000000);
10741       pos += 2;
10742     }
10743   }
10744   return utf8_output - start;
10745 }
10746 
10747 } // utf16_to_utf8 namespace
10748 } // unnamed namespace
10749 } // namespace scalar
10750 } // namespace simdutf
10751 
10752 #endif
10753 /* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
10754 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h
10755 /* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
10756 #ifndef SIMDUTF_UTF16_TO_UTF8_H
10757 #define SIMDUTF_UTF16_TO_UTF8_H
10758 
10759 namespace simdutf {
10760 namespace scalar {
10761 namespace {
10762 namespace utf16_to_utf8 {
10763 
10764 template <endianness big_endian>
convert(const char16_t * buf,size_t len,char * utf8_output)10765 inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) {
10766  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10767   size_t pos = 0;
10768   char* start{utf8_output};
10769   while (pos < len) {
10770     // try to convert the next block of 8 ASCII characters
10771     if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
10772       uint64_t v;
10773       ::memcpy(&v, data + pos, sizeof(uint64_t));
10774       if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
10775       if ((v & 0xFF80FF80FF80FF80) == 0) {
10776         size_t final_pos = pos + 4;
10777         while(pos < final_pos) {
10778           *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
10779           pos++;
10780         }
10781         continue;
10782       }
10783     }
10784     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
10785     if((word & 0xFF80)==0) {
10786       // will generate one UTF-8 bytes
10787       *utf8_output++ = char(word);
10788       pos++;
10789     } else if((word & 0xF800)==0) {
10790       // will generate two UTF-8 bytes
10791       // we have 0b110XXXXX 0b10XXXXXX
10792       *utf8_output++ = char((word>>6) | 0b11000000);
10793       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10794       pos++;
10795     } else if((word &0xF800 ) != 0xD800) {
10796       // will generate three UTF-8 bytes
10797       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
10798       *utf8_output++ = char((word>>12) | 0b11100000);
10799       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10800       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10801       pos++;
10802     } else {
10803       // must be a surrogate pair
10804       if(pos + 1 >= len) { return 0; }
10805       uint16_t diff = uint16_t(word - 0xD800);
10806       if(diff > 0x3FF) { return 0; }
10807       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
10808       uint16_t diff2 = uint16_t(next_word - 0xDC00);
10809       if(diff2 > 0x3FF) { return 0; }
10810       uint32_t value = (diff << 10) + diff2 + 0x10000;
10811       // will generate four UTF-8 bytes
10812       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
10813       *utf8_output++ = char((value>>18) | 0b11110000);
10814       *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
10815       *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
10816       *utf8_output++ = char((value & 0b111111) | 0b10000000);
10817       pos += 2;
10818     }
10819   }
10820   return utf8_output - start;
10821 }
10822 
10823 template <endianness big_endian>
convert_with_errors(const char16_t * buf,size_t len,char * utf8_output)10824 inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
10825  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10826   size_t pos = 0;
10827   char* start{utf8_output};
10828   while (pos < len) {
10829     // try to convert the next block of 8 ASCII characters
10830     if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
10831       uint64_t v;
10832       ::memcpy(&v, data + pos, sizeof(uint64_t));
10833       if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
10834       if ((v & 0xFF80FF80FF80FF80) == 0) {
10835         size_t final_pos = pos + 4;
10836         while(pos < final_pos) {
10837           *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
10838           pos++;
10839         }
10840         continue;
10841       }
10842     }
10843     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
10844     if((word & 0xFF80)==0) {
10845       // will generate one UTF-8 bytes
10846       *utf8_output++ = char(word);
10847       pos++;
10848     } else if((word & 0xF800)==0) {
10849       // will generate two UTF-8 bytes
10850       // we have 0b110XXXXX 0b10XXXXXX
10851       *utf8_output++ = char((word>>6) | 0b11000000);
10852       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10853       pos++;
10854     } else if((word &0xF800 ) != 0xD800) {
10855       // will generate three UTF-8 bytes
10856       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
10857       *utf8_output++ = char((word>>12) | 0b11100000);
10858       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10859       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10860       pos++;
10861     } else {
10862       // must be a surrogate pair
10863       if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
10864       uint16_t diff = uint16_t(word - 0xD800);
10865       if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
10866       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
10867       uint16_t diff2 = uint16_t(next_word - 0xDC00);
10868       if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
10869       uint32_t value = (diff << 10) + diff2 + 0x10000;
10870       // will generate four UTF-8 bytes
10871       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
10872       *utf8_output++ = char((value>>18) | 0b11110000);
10873       *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
10874       *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
10875       *utf8_output++ = char((value & 0b111111) | 0b10000000);
10876       pos += 2;
10877     }
10878   }
10879   return result(error_code::SUCCESS, utf8_output - start);
10880 }
10881 
10882 } // utf16_to_utf8 namespace
10883 } // unnamed namespace
10884 } // namespace scalar
10885 } // namespace simdutf
10886 
10887 #endif
10888 /* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
10889 
10890 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h
10891 /* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
10892 #ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
10893 #define SIMDUTF_VALID_UTF16_TO_UTF32_H
10894 
10895 namespace simdutf {
10896 namespace scalar {
10897 namespace {
10898 namespace utf16_to_utf32 {
10899 
10900 template <endianness big_endian>
convert_valid(const char16_t * buf,size_t len,char32_t * utf32_output)10901 inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) {
10902  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10903   size_t pos = 0;
10904   char32_t* start{utf32_output};
10905   while (pos < len) {
10906     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
10907     if((word &0xF800 ) != 0xD800) {
10908       // No surrogate pair, extend 16-bit word to 32-bit word
10909       *utf32_output++ = char32_t(word);
10910       pos++;
10911     } else {
10912       // must be a surrogate pair
10913       uint16_t diff = uint16_t(word - 0xD800);
10914       if(pos + 1 >= len) { return 0; } // minimal bound checking
10915       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
10916       uint16_t diff2 = uint16_t(next_word - 0xDC00);
10917       uint32_t value = (diff << 10) + diff2 + 0x10000;
10918       *utf32_output++ = char32_t(value);
10919       pos += 2;
10920     }
10921   }
10922   return utf32_output - start;
10923 }
10924 
10925 } // utf16_to_utf32 namespace
10926 } // unnamed namespace
10927 } // namespace scalar
10928 } // namespace simdutf
10929 
10930 #endif
10931 /* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
10932 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h
10933 /* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
10934 #ifndef SIMDUTF_UTF16_TO_UTF32_H
10935 #define SIMDUTF_UTF16_TO_UTF32_H
10936 
10937 namespace simdutf {
10938 namespace scalar {
10939 namespace {
10940 namespace utf16_to_utf32 {
10941 
10942 template <endianness big_endian>
convert(const char16_t * buf,size_t len,char32_t * utf32_output)10943 inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) {
10944  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10945   size_t pos = 0;
10946   char32_t* start{utf32_output};
10947   while (pos < len) {
10948     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
10949     if((word &0xF800 ) != 0xD800) {
10950       // No surrogate pair, extend 16-bit word to 32-bit word
10951       *utf32_output++ = char32_t(word);
10952       pos++;
10953     } else {
10954       // must be a surrogate pair
10955       uint16_t diff = uint16_t(word - 0xD800);
10956       if(diff > 0x3FF) { return 0; }
10957       if(pos + 1 >= len) { return 0; } // minimal bound checking
10958       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
10959       uint16_t diff2 = uint16_t(next_word - 0xDC00);
10960       if(diff2 > 0x3FF) { return 0; }
10961       uint32_t value = (diff << 10) + diff2 + 0x10000;
10962       *utf32_output++ = char32_t(value);
10963       pos += 2;
10964     }
10965   }
10966   return utf32_output - start;
10967 }
10968 
10969 template <endianness big_endian>
convert_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output)10970 inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
10971  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
10972   size_t pos = 0;
10973   char32_t* start{utf32_output};
10974   while (pos < len) {
10975     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
10976     if((word &0xF800 ) != 0xD800) {
10977       // No surrogate pair, extend 16-bit word to 32-bit word
10978       *utf32_output++ = char32_t(word);
10979       pos++;
10980     } else {
10981       // must be a surrogate pair
10982       uint16_t diff = uint16_t(word - 0xD800);
10983       if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
10984       if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking
10985       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
10986       uint16_t diff2 = uint16_t(next_word - 0xDC00);
10987       if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
10988       uint32_t value = (diff << 10) + diff2 + 0x10000;
10989       *utf32_output++ = char32_t(value);
10990       pos += 2;
10991     }
10992   }
10993   return result(error_code::SUCCESS, utf32_output - start);
10994 }
10995 
10996 } // utf16_to_utf32 namespace
10997 } // unnamed namespace
10998 } // namespace scalar
10999 } // namespace simdutf
11000 
11001 #endif
11002 /* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
11003 
11004 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h
11005 /* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
11006 #ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
11007 #define SIMDUTF_VALID_UTF8_TO_UTF16_H
11008 
11009 namespace simdutf {
11010 namespace scalar {
11011 namespace {
11012 namespace utf8_to_utf16 {
11013 
11014 template <endianness big_endian>
convert_valid(const char * buf,size_t len,char16_t * utf16_output)11015 inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) {
11016  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11017   size_t pos = 0;
11018   char16_t* start{utf16_output};
11019   while (pos < len) {
11020     // try to convert the next block of 8 ASCII bytes
11021     if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
11022       uint64_t v;
11023       ::memcpy(&v, data + pos, sizeof(uint64_t));
11024       if ((v & 0x8080808080808080) == 0) {
11025         size_t final_pos = pos + 8;
11026         while(pos < final_pos) {
11027           *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
11028           pos++;
11029         }
11030         continue;
11031       }
11032     }
11033     uint8_t leading_byte = data[pos]; // leading byte
11034     if (leading_byte < 0b10000000) {
11035       // converting one ASCII byte !!!
11036       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
11037       pos++;
11038     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11039       // We have a two-byte UTF-8, it should become
11040       // a single UTF-16 word.
11041       if(pos + 1 >= len) { break; } // minimal bound checking
11042       uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
11043       if (!match_system(big_endian)) {
11044         code_point = utf16::swap_bytes(uint16_t(code_point));
11045       }
11046       *utf16_output++ = char16_t(code_point);
11047       pos += 2;
11048     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11049       // We have a three-byte UTF-8, it should become
11050       // a single UTF-16 word.
11051       if(pos + 2 >= len) { break; } // minimal bound checking
11052       uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
11053       if (!match_system(big_endian)) {
11054         code_point = utf16::swap_bytes(uint16_t(code_point));
11055       }
11056       *utf16_output++ = char16_t(code_point);
11057       pos += 3;
11058     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11059       // we have a 4-byte UTF-8 word.
11060       if(pos + 3 >= len) { break; } // minimal bound checking
11061       uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
11062                            | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
11063       code_point -= 0x10000;
11064       uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
11065       uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
11066       if (!match_system(big_endian)) {
11067         high_surrogate = utf16::swap_bytes(high_surrogate);
11068         low_surrogate = utf16::swap_bytes(low_surrogate);
11069       }
11070       *utf16_output++ = char16_t(high_surrogate);
11071       *utf16_output++ = char16_t(low_surrogate);
11072       pos += 4;
11073     } else {
11074       // we may have a continuation but we do not do error checking
11075       return 0;
11076     }
11077   }
11078   return utf16_output - start;
11079 }
11080 
11081 
11082 } // namespace utf8_to_utf16
11083 } // unnamed namespace
11084 } // namespace scalar
11085 } // namespace simdutf
11086 
11087 #endif
11088 /* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
11089 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h
11090 /* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
11091 #ifndef SIMDUTF_UTF8_TO_UTF16_H
11092 #define SIMDUTF_UTF8_TO_UTF16_H
11093 
11094 namespace simdutf {
11095 namespace scalar {
11096 namespace {
11097 namespace utf8_to_utf16 {
11098 
11099 template <endianness big_endian>
convert(const char * buf,size_t len,char16_t * utf16_output)11100 inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
11101  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11102   size_t pos = 0;
11103   char16_t* start{utf16_output};
11104   while (pos < len) {
11105     // try to convert the next block of 16 ASCII bytes
11106     if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
11107       uint64_t v1;
11108       ::memcpy(&v1, data + pos, sizeof(uint64_t));
11109       uint64_t v2;
11110       ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
11111       uint64_t v{v1 | v2};
11112       if ((v & 0x8080808080808080) == 0) {
11113         size_t final_pos = pos + 16;
11114         while(pos < final_pos) {
11115           *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
11116           pos++;
11117         }
11118         continue;
11119       }
11120     }
11121 
11122     uint8_t leading_byte = data[pos]; // leading byte
11123     if (leading_byte < 0b10000000) {
11124       // converting one ASCII byte !!!
11125       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
11126       pos++;
11127     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11128       // We have a two-byte UTF-8, it should become
11129       // a single UTF-16 word.
11130       if(pos + 1 >= len) { return 0; } // minimal bound checking
11131       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
11132       // range check
11133       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
11134       if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
11135       if (!match_system(big_endian)) {
11136         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11137       }
11138       *utf16_output++ = char16_t(code_point);
11139       pos += 2;
11140     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11141       // We have a three-byte UTF-8, it should become
11142       // a single UTF-16 word.
11143       if(pos + 2 >= len) { return 0; } // minimal bound checking
11144 
11145       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
11146       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
11147       // range check
11148       uint32_t code_point = (leading_byte & 0b00001111) << 12 |
11149                    (data[pos + 1] & 0b00111111) << 6 |
11150                    (data[pos + 2] & 0b00111111);
11151       if (code_point < 0x800 || 0xffff < code_point ||
11152           (0xd7ff < code_point && code_point < 0xe000)) {
11153         return 0;
11154       }
11155       if (!match_system(big_endian)) {
11156         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11157       }
11158       *utf16_output++ = char16_t(code_point);
11159       pos += 3;
11160     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11161       // we have a 4-byte UTF-8 word.
11162       if(pos + 3 >= len) { return 0; } // minimal bound checking
11163       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
11164       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
11165       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
11166 
11167       // range check
11168       uint32_t code_point =
11169           (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
11170           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
11171       if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
11172       code_point -= 0x10000;
11173       uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
11174       uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
11175       if (!match_system(big_endian)) {
11176         high_surrogate = utf16::swap_bytes(high_surrogate);
11177         low_surrogate = utf16::swap_bytes(low_surrogate);
11178       }
11179       *utf16_output++ = char16_t(high_surrogate);
11180       *utf16_output++ = char16_t(low_surrogate);
11181       pos += 4;
11182     } else {
11183       return 0;
11184     }
11185   }
11186   return utf16_output - start;
11187 }
11188 
11189 template <endianness big_endian>
convert_with_errors(const char * buf,size_t len,char16_t * utf16_output)11190 inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
11191  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11192   size_t pos = 0;
11193   char16_t* start{utf16_output};
11194   while (pos < len) {
11195     // try to convert the next block of 16 ASCII bytes
11196     if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
11197       uint64_t v1;
11198       ::memcpy(&v1, data + pos, sizeof(uint64_t));
11199       uint64_t v2;
11200       ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
11201       uint64_t v{v1 | v2};
11202       if ((v & 0x8080808080808080) == 0) {
11203         size_t final_pos = pos + 16;
11204         while(pos < final_pos) {
11205           *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
11206           pos++;
11207         }
11208         continue;
11209       }
11210     }
11211     uint8_t leading_byte = data[pos]; // leading byte
11212     if (leading_byte < 0b10000000) {
11213       // converting one ASCII byte !!!
11214       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
11215       pos++;
11216     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11217       // We have a two-byte UTF-8, it should become
11218       // a single UTF-16 word.
11219       if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11220       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11221       // range check
11222       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
11223       if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
11224       if (!match_system(big_endian)) {
11225         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11226       }
11227       *utf16_output++ = char16_t(code_point);
11228       pos += 2;
11229     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11230       // We have a three-byte UTF-8, it should become
11231       // a single UTF-16 word.
11232       if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11233 
11234       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11235       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11236       // range check
11237       uint32_t code_point = (leading_byte & 0b00001111) << 12 |
11238                    (data[pos + 1] & 0b00111111) << 6 |
11239                    (data[pos + 2] & 0b00111111);
11240       if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
11241       if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
11242       if (!match_system(big_endian)) {
11243         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11244       }
11245       *utf16_output++ = char16_t(code_point);
11246       pos += 3;
11247     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11248       // we have a 4-byte UTF-8 word.
11249       if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11250       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11251       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11252       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11253 
11254       // range check
11255       uint32_t code_point =
11256           (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
11257           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
11258       if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
11259       if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
11260       code_point -= 0x10000;
11261       uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
11262       uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
11263       if (!match_system(big_endian)) {
11264         high_surrogate = utf16::swap_bytes(high_surrogate);
11265         low_surrogate = utf16::swap_bytes(low_surrogate);
11266       }
11267       *utf16_output++ = char16_t(high_surrogate);
11268       *utf16_output++ = char16_t(low_surrogate);
11269       pos += 4;
11270     } else {
11271       // we either have too many continuation bytes or an invalid leading byte
11272       if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
11273       else { return result(error_code::HEADER_BITS, pos); }
11274     }
11275   }
11276   return result(error_code::SUCCESS, utf16_output - start);
11277 }
11278 
11279 /**
11280  * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have
11281  * up to len input bytes left, and we encountered some error. It is possible that
11282  * the error is at 'buf' exactly, but it could also be in the previous bytes  (up to 3 bytes back).
11283  *
11284  * prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section
11285  * and can be safely accessed. We prior_bytes to access safely up to three bytes before 'buf'.
11286  *
11287  * The caller is responsible to ensure that len > 0.
11288  *
11289  * If the error is believed to have occured prior to 'buf', the count value contain in the result
11290  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
11291  */
11292 template <endianness endian>
rewind_and_convert_with_errors(size_t prior_bytes,const char * buf,size_t len,char16_t * utf16_output)11293 inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output) {
11294   size_t extra_len{0};
11295   // We potentially need to go back in time and find a leading byte.
11296   // In theory '3' would be sufficient, but sometimes the error can go back quite far.
11297   size_t how_far_back = prior_bytes;
11298   // size_t how_far_back = 3; // 3 bytes in the past + current position
11299   // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
11300   bool found_leading_bytes{false};
11301   // important: it is i <= how_far_back and not 'i < how_far_back'.
11302   for(size_t i = 0; i <= how_far_back; i++) {
11303     unsigned char byte = buf[0-i];
11304     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
11305     if(found_leading_bytes) {
11306       buf -= i;
11307       extra_len = i;
11308       break;
11309     }
11310   }
11311   //
11312   // It is possible for this function to return a negative count in its result.
11313   // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
11314   // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
11315   //
11316   // An unsigned type will simply wrap round arithmetically (well defined).
11317   //
11318   if(!found_leading_bytes) {
11319     // If how_far_back == 3, we may have four consecutive continuation bytes!!!
11320     // [....] [continuation] [continuation] [continuation] | [buf is continuation]
11321     // Or we possibly have a stream that does not start with a leading byte.
11322     return result(error_code::TOO_LONG, 0-how_far_back);
11323   }
11324   result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
11325   if (res.error) {
11326     res.count -= extra_len;
11327   }
11328   return res;
11329 }
11330 
11331 } // utf8_to_utf16 namespace
11332 } // unnamed namespace
11333 } // namespace scalar
11334 } // namespace simdutf
11335 
11336 #endif
11337 /* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
11338 
11339 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h
11340 /* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
11341 #ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
11342 #define SIMDUTF_VALID_UTF8_TO_UTF32_H
11343 
11344 namespace simdutf {
11345 namespace scalar {
11346 namespace {
11347 namespace utf8_to_utf32 {
11348 
/**
 * Transcodes UTF-8 that the caller vouches for into UTF-32. No validation is done
 * beyond the bound checks needed to avoid reading past the input.
 *
 * @param buf          input bytes
 * @param len          number of input bytes
 * @param utf32_output destination buffer, one char32_t per decoded code point
 * @return the number of char32_t values written. A multi-byte sequence cut off by the
 *         end of the input stops the conversion and the count so far is returned.
 *         0 is returned when a byte that cannot start a sequence (a continuation byte
 *         or 0b11111xxx) is found where a leading byte is expected.
 */
inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) {
  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
  char32_t* const out_begin = utf32_output;
  size_t idx = 0;
  while (idx < len) {
    // Fast path: widen eight bytes in one go when none of them has its high bit set.
    if (idx + 8 <= len) {
      uint64_t chunk;
      ::memcpy(&chunk, bytes + idx, sizeof(chunk));
      if ((chunk & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 8; idx < stop; ++idx) {
          *utf32_output++ = char32_t(buf[idx]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[idx];
    if (lead < 0x80) {
      // single ASCII byte
      *utf32_output++ = char32_t(lead);
      ++idx;
      continue;
    }

    // Classify the leading byte: number of continuation bytes and its payload bits.
    size_t trailing;
    uint32_t code_point;
    if ((lead & 0xE0) == 0xC0) {
      trailing = 1;
      code_point = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
      trailing = 2;
      code_point = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
      trailing = 3;
      code_point = lead & 0x07;
    } else {
      // possibly a continuation byte; no error reporting in this routine
      return 0;
    }

    // minimal bound checking: stop quietly if the sequence is cut off
    if (idx + trailing >= len) { break; }

    // Each continuation byte contributes its low six bits.
    for (size_t k = 1; k <= trailing; ++k) {
      code_point = (code_point << 6) | (bytes[idx + k] & 0x3F);
    }
    *utf32_output++ = char32_t(code_point);
    idx += trailing + 1;
  }
  return utf32_output - out_begin;
}
11396 
11397 
11398 } // namespace utf8_to_utf32
11399 } // unnamed namespace
11400 } // namespace scalar
11401 } // namespace simdutf
11402 
11403 #endif
11404 /* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
11405 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h
11406 /* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
11407 #ifndef SIMDUTF_UTF8_TO_UTF32_H
11408 #define SIMDUTF_UTF8_TO_UTF32_H
11409 
11410 namespace simdutf {
11411 namespace scalar {
11412 namespace {
11413 namespace utf8_to_utf32 {
11414 
/**
 * Validating UTF-8 to UTF-32 transcoder.
 *
 * @param buf          input bytes
 * @param len          number of input bytes
 * @param utf32_output destination buffer, one char32_t per decoded code point
 * @return the number of char32_t values written, or 0 as soon as the input is found
 *         to be invalid: truncated sequence, missing continuation byte, overlong form,
 *         surrogate, value above 0x10ffff, or a byte that cannot start a sequence.
 *         Output written before the error is detected is left in place.
 */
inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) {
  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
  char32_t* const out_begin = utf32_output;
  size_t idx = 0;
  while (idx < len) {
    // Fast path: sixteen bytes at once when all of them are ASCII.
    if (idx + 16 <= len) {
      uint64_t halves[2];
      ::memcpy(halves, bytes + idx, sizeof(halves));
      if (((halves[0] | halves[1]) & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 16; idx < stop; ++idx) {
          *utf32_output++ = char32_t(buf[idx]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[idx];
    if (lead < 0x80) {
      // single ASCII byte
      *utf32_output++ = char32_t(lead);
      ++idx;
      continue;
    }

    // Classify the leading byte: continuation count, payload bits, and the smallest
    // code point that legitimately needs this many bytes (anything lower is overlong).
    size_t trailing;
    uint32_t code_point;
    uint32_t smallest;
    if ((lead & 0xE0) == 0xC0) {
      trailing = 1;
      code_point = lead & 0x1F;
      smallest = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
      trailing = 2;
      code_point = lead & 0x0F;
      smallest = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
      trailing = 3;
      code_point = lead & 0x07;
      smallest = 0x10000;
    } else {
      return 0;
    }

    // The bound check comes first so that no byte past the input is ever read.
    if (idx + trailing >= len) { return 0; }

    for (size_t k = 1; k <= trailing; ++k) {
      const uint8_t next = bytes[idx + k];
      if ((next & 0xC0) != 0x80) { return 0; }
      code_point = (code_point << 6) | (next & 0x3F);
    }

    // Range checks. The payload width already caps two- and three-byte sequences at
    // 0x7ff and 0xffff, so only four-byte sequences can exceed 0x10ffff, and only
    // three-byte sequences can land in the surrogate range.
    if (code_point < smallest || 0x10ffff < code_point) { return 0; }
    if (0xd7ff < code_point && code_point < 0xe000) { return 0; }

    *utf32_output++ = char32_t(code_point);
    idx += trailing + 1;
  }
  return utf32_output - out_begin;
}
11486 
convert_with_errors(const char * buf,size_t len,char32_t * utf32_output)11487 inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) {
11488  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11489   size_t pos = 0;
11490   char32_t* start{utf32_output};
11491   while (pos < len) {
11492     // try to convert the next block of 16 ASCII bytes
11493     if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
11494       uint64_t v1;
11495       ::memcpy(&v1, data + pos, sizeof(uint64_t));
11496       uint64_t v2;
11497       ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
11498       uint64_t v{v1 | v2};
11499       if ((v & 0x8080808080808080) == 0) {
11500         size_t final_pos = pos + 16;
11501         while(pos < final_pos) {
11502           *utf32_output++ = char32_t(buf[pos]);
11503           pos++;
11504         }
11505         continue;
11506       }
11507     }
11508     uint8_t leading_byte = data[pos]; // leading byte
11509     if (leading_byte < 0b10000000) {
11510       // converting one ASCII byte !!!
11511       *utf32_output++ = char32_t(leading_byte);
11512       pos++;
11513     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11514       // We have a two-byte UTF-8
11515       if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11516       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11517       // range check
11518       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
11519       if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
11520       *utf32_output++ = char32_t(code_point);
11521       pos += 2;
11522     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11523       // We have a three-byte UTF-8
11524       if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11525 
11526       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11527       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11528       // range check
11529       uint32_t code_point = (leading_byte & 0b00001111) << 12 |
11530                    (data[pos + 1] & 0b00111111) << 6 |
11531                    (data[pos + 2] & 0b00111111);
11532       if (code_point < 0x800 || 0xffff < code_point) { return result(error_code::OVERLONG, pos); }
11533       if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
11534       *utf32_output++ = char32_t(code_point);
11535       pos += 3;
11536     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11537       // we have a 4-byte UTF-8 word.
11538       if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11539       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos);}
11540       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11541       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11542 
11543       // range check
11544       uint32_t code_point =
11545           (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
11546           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
11547       if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
11548       if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
11549       *utf32_output++ = char32_t(code_point);
11550       pos += 4;
11551     } else {
11552       // we either have too many continuation bytes or an invalid leading byte
11553       if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
11554       else { return result(error_code::HEADER_BITS, pos); }
11555     }
11556   }
11557   return result(error_code::SUCCESS, utf32_output - start);
11558 }
11559 
11560 /**
11561  * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have
11562  * up to len input bytes left, and we encountered some error. It is possible that
11563  * the error is at 'buf' exactly, but it could also be in the previous bytes location (up to 3 bytes back).
11564  *
11565  * prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section
11566  * and can be safely accessed. We prior_bytes to access safely up to three bytes before 'buf'.
11567  *
11568  * The caller is responsible to ensure that len > 0.
11569  *
11570  * If the error is believed to have occured prior to 'buf', the count value contain in the result
11571  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
11572  */
rewind_and_convert_with_errors(size_t prior_bytes,const char * buf,size_t len,char32_t * utf32_output)11573 inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) {
11574   size_t extra_len{0};
11575   // We potentially need to go back in time and find a leading byte.
11576   size_t how_far_back = 3; // 3 bytes in the past + current position
11577   if(how_far_back > prior_bytes) { how_far_back = prior_bytes; }
11578   bool found_leading_bytes{false};
11579   // important: it is i <= how_far_back and not 'i < how_far_back'.
11580   for(size_t i = 0; i <= how_far_back; i++) {
11581     unsigned char byte = buf[0-i];
11582     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
11583     if(found_leading_bytes) {
11584       buf -= i;
11585       extra_len = i;
11586       break;
11587     }
11588   }
11589   //
11590   // It is possible for this function to return a negative count in its result.
11591   // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
11592   // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
11593   //
11594   // An unsigned type will simply wrap round arithmetically (well defined).
11595   //
11596   if(!found_leading_bytes) {
11597     // If how_far_back == 3, we may have four consecutive continuation bytes!!!
11598     // [....] [continuation] [continuation] [continuation] | [buf is continuation]
11599     // Or we possibly have a stream that does not start with a leading byte.
11600     return result(error_code::TOO_LONG, 0-how_far_back);
11601   }
11602 
11603   result res = convert_with_errors(buf, len + extra_len, utf32_output);
11604   if (res.error) {
11605     res.count -= extra_len;
11606   }
11607   return res;
11608 }
11609 
11610 } // utf8_to_utf32 namespace
11611 } // unnamed namespace
11612 } // namespace scalar
11613 } // namespace simdutf
11614 
11615 #endif
11616 /* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
11617 //
11618 
11619 
11620 SIMDUTF_PUSH_DISABLE_WARNINGS
11621 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
11622 
11623 
11624 #if SIMDUTF_IMPLEMENTATION_ARM64
11625 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp
11626 /* begin file src/arm64/implementation.cpp */
11627 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
11628 /* begin file src/simdutf/arm64/begin.h */
11629 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
11630 // #define SIMDUTF_IMPLEMENTATION arm64
11631 /* end file src/simdutf/arm64/begin.h */
11632 namespace simdutf {
11633 namespace arm64 {
11634 namespace {
11635 #ifndef SIMDUTF_ARM64_H
11636 #error "arm64.h must be included"
11637 #endif
11638 using namespace simd;
11639 
is_ascii(const simd8x64<uint8_t> & input)11640 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
11641     simd8<uint8_t> bits = input.reduce_or();
11642     return bits.max_val() < 0b10000000u;
11643 }
11644 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)11645 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
11646     simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
11647     simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
11648     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
11649     // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
11650     // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
11651     // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is
11652     // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character.
11653     // The error will be detected there.
11654     return is_second_byte ^ is_third_byte ^ is_fourth_byte;
11655 }
11656 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)11657 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
11658     simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
11659     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
11660     return is_third_byte ^ is_fourth_byte;
11661 }
11662 
11663 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_detect_encodings.cpp
11664 /* begin file src/arm64/arm_detect_encodings.cpp */
// Reports which of UTF-8, UTF-16LE and UTF-32LE the buffer could be valid as.
// The return value is the bitwise OR of simdutf::encoding_type::UTF8, UTF16_LE and
// UTF32_LE for every encoding that was not ruled out (0 when none is left).
//
// `checker` is default-constructed, is fed 64-byte blocks through check_next_input()
// and is asked for the UTF-8 verdict through errors().
//
// The main loop consumes 64 bytes per iteration and stops early at the first block that
// contains a surrogate-looking 16-bit word. Whatever lies from `buf` onward when the loop
// ends is handled after the loop: a space-padded block for UTF-8 and the scalar
// validators for UTF-16LE / UTF-32.
11665 template<class checker>
11666 // len is known to be a multiple of 2 when this is called
arm_detect_encodings(const char * buf,size_t len)11667 int arm_detect_encodings(const char * buf, size_t len) {
11668     const char* start = buf;
11669     const char* end = buf + len;
11670 
          // Each flag starts out true and is cleared once that encoding is ruled out.
11671     bool is_utf8 = true;
11672     bool is_utf16 = true;
11673     bool is_utf32 = true;
11674 
11675     int out = 0;
11676 
          // (high byte & 0xf8) == 0xd8 selects high bytes 0xD8..0xDF, i.e. words 0xD800..0xDFFF.
11677     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
11678     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
11679 
          // Per-lane running maximum of the input viewed as 32-bit words; it is compared
          // against 0x10ffff after the loop.
11680     uint32x4_t currentmax = vmovq_n_u32(0x0);
11681 
11682     checker check{};
11683 
11684     while(buf + 64 <= end) {
              // Load the 64-byte block as four vectors of eight 16-bit words.
11685         uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
11686         uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
11687         uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
11688         uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3*simd16<uint16_t>::SIZE / sizeof(char16_t));
11689 
11690         const auto u0 = simd16<uint16_t>(in);
11691         const auto u1 = simd16<uint16_t>(secondin);
11692         const auto u2 = simd16<uint16_t>(thirdin);
11693         const auto u3 = simd16<uint16_t>(fourthin);
11694 
              // Shift every word right by 8 so that only its high byte remains.
11695         const auto v0 = u0.shr<8>();
11696         const auto v1 = u1.shr<8>();
11697         const auto v2 = u2.shr<8>();
11698         const auto v3 = u3.shr<8>();
11699 
              // Pack pairs of vectors: in16 comes from the first 32 input bytes, nextin16
              // from the last 32.
11700         const auto in16 = simd16<uint16_t>::pack(v0, v1);
11701         const auto nextin16 = simd16<uint16_t>::pack(v2, v3);
11702 
              // NOTE(review): to_bitmask64() appears to produce 4 bits per lane (the code
              // below shifts by 4 to move by one word and tests 0xf0f0... to pick every
              // second word) - confirm against its definition.
11703         const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64();
11704         const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64();
11705 
11706         // Check for surrogates
11707         if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) {
11708             // Cannot be UTF8
11709             is_utf8 = false;
11710             // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
11711             // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
11712             // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
11713             // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
11714             // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words.
11715 
11716             if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) {
11717                 is_utf32 = false;
11718                 // Code from arm_validate_utf16le.cpp
11719                 // Not efficient, we do not process surrogates_wordmask1
11720                 const char16_t * input = reinterpret_cast<const char16_t*>(buf);
11721                 const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
11722 
                      // (high byte & 0xfc) == 0xdc selects high bytes 0xDC..0xDF, i.e. words 0xDC00..0xDFFF.
11723                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
11724                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
11725 
                      // Masks for the first 16 words of the block:
                      //   V0 - words that are not surrogates
                      //   H0 - words in 0xDC00..0xDFFF
                      //   L0 - the remaining surrogates (0xD800..0xDBFF)
                      //   a0 - L0 words immediately followed by an H0 word
                      //   b0 - the H0 words of those pairs
                      //   c0 - every word accounted for as valid
11726                 const uint64_t V0 = ~surrogates_wordmask0;
11727 
11728                 const auto vH0 = ((in16 & v_fc) ==  v_dc);
11729                 const uint64_t H0 = vH0.to_bitmask64();
11730 
11731                 const uint64_t L0 = ~H0 & surrogates_wordmask0;
11732 
11733                 const uint64_t a0 = L0 & (H0 >> 4);
11734 
11735                 const uint64_t b0 = a0 << 4;
11736 
11737                 const uint64_t c0 = V0 | a0 | b0;
                      // All 16 words valid: advance 16. Only the last word unaccounted for:
                      // advance 15 so that it is examined again as the first word of the
                      // next chunk. Anything else is not UTF-16, and the break below leaves
                      // the outer 64-byte loop.
11738                 if (c0 == ~0ull) {
11739                     input += 16;
11740                 } else if (c0 == 0xfffffffffffffffull) {
11741                     input += 15;
11742                 } else {
11743                     is_utf16 = false;
11744                     break;
11745                 }
11746 
                      // Keep validating 16 words at a time up to end16; `buf` itself is not
                      // advanced here, only the local `input`.
11747                 while (input + 16 < end16) {
11748                     const auto in0 = simd16<uint16_t>(input);
11749                     const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
11750                     const auto t0 = in0.shr<8>();
11751                     const auto t1 = in1.shr<8>();
11752                     const simd8<uint8_t> in_16 = simd16<uint16_t>::pack(t0, t1);
11753 
11754                     const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64();
11755                     if(surrogates_wordmask == 0) {
11756                         input += 16;
11757                     } else {
                              // Same V/H/L/a/b/c computation as for the first chunk above.
11758                         const uint64_t V = ~surrogates_wordmask;
11759 
11760                         const auto vH = ((in_16 & v_fc) ==  v_dc);
11761                         const uint64_t H = vH.to_bitmask64();
11762 
11763                         const uint64_t L = ~H & surrogates_wordmask;
11764 
11765                         const uint64_t a = L & (H >> 4);
11766 
11767                         const uint64_t b = a << 4;
11768 
11769                         const uint64_t c = V | a | b;
11770                         if (c == ~0ull) {
11771                             input += 16;
11772                         } else if (c == 0xfffffffffffffffull) {
11773                             input += 15;
11774                         } else {
                                  // This break only leaves the inner loop; the outer loop is
                                  // left by the unconditional break further down.
11775                             is_utf16 = false;
11776                             break;
11777                         }
11778                     }
11779                 }
11780             } else {
11781                 is_utf16 = false;
11782                 // Check for UTF-32
11783                 if (len % 4 == 0) {
11784                     const char32_t * input = reinterpret_cast<const char32_t*>(buf);
11785                     const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
11786 
11787                     // Must start checking for surrogates
                          // Adding 0xffff2000 (with 32-bit wrap-around) maps 0xD800..0xDFFF onto
                          // 0xfffff800..0xffffffff, i.e. strictly above standardoffsetmax, while
                          // 0xE000 wraps to 0.
11788                     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
11789                     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
11790                     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
11791 
11792                     const uint32x4_t in32 =  vreinterpretq_u32_u16(in);
11793                     const uint32x4_t secondin32 =  vreinterpretq_u32_u16(secondin);
11794                     const uint32x4_t thirdin32 =  vreinterpretq_u32_u16(thirdin);
11795                     const uint32x4_t fourthin32 =  vreinterpretq_u32_u16(fourthin);
11796 
11797                     currentmax = vmaxq_u32(in32,currentmax);
11798                     currentmax = vmaxq_u32(secondin32,currentmax);
11799                     currentmax = vmaxq_u32(thirdin32,currentmax);
11800                     currentmax = vmaxq_u32(fourthin32,currentmax);
11801 
11802                     currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax);
11803                     currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax);
11804                     currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax);
11805                     currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax);
11806 
                          // `input` starts at buf, so the current block is folded in a second
                          // time; taking a maximum twice does not change the result.
11807                     while (input + 4 < end32) {
11808                         const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
11809                         currentmax = vmaxq_u32(in_32,currentmax);
11810                         currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax);
11811                         input += 4;
11812                     }
11813 
                          // A lane of forbidden_words is nonzero exactly when its offset maximum
                          // exceeded standardoffsetmax.
11814                     uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
11815                     if(vmaxvq_u32(forbidden_words) != 0) {
11816                         is_utf32 = false;
11817                     }
11818                 } else {
11819                     is_utf32 = false;
11820                 }
11821             }
                  // A block with surrogates always ends the 64-byte loop.
11822             break;
11823         }
11824         // If no surrogate, validate under other encodings as well
11825 
11826         // UTF-32 validation
11827         currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax);
11828         currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax);
11829         currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax);
11830         currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax);
11831 
11832         // UTF-8 validation
11833         // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
11834         simd::simd8x64<uint8_t> in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin));
11835         check.check_next_input(in8);
11836 
11837         buf += 64;
11838     }
11839 
11840     // Check which encodings are possible
11841 
11842     if (is_utf8) {
              // is_utf8 is still true only if the loop ran to its natural end, so fewer than
              // 64 bytes remain. They are copied into a block pre-filled with 0x20 (space)
              // and checked as one last block.
11843         if (static_cast<size_t>(buf - start) != len) {
11844             uint8_t block[64]{};
11845             std::memset(block, 0x20, 64);
11846             std::memcpy(block, buf, len - (buf - start));
11847             simd::simd8x64<uint8_t> in(block);
11848             check.check_next_input(in);
11849         }
11850         if (!check.errors()) {
11851             out |= simdutf::encoding_type::UTF8;
11852         }
11853     }
11854 
          // Scalar UTF-16LE validation of everything from buf to the end of the input.
11855     if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
11856         out |= simdutf::encoding_type::UTF16_LE;
11857     }
11858 
          // UTF-32 requires a length that is a multiple of 4, every accumulated 32-bit lane
          // maximum at or below 0x10ffff, and a successful scalar validation from buf onward.
11859     if (is_utf32 && (len % 4 == 0)) {
11860         const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
11861         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
11862         if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
11863             out |= simdutf::encoding_type::UTF32_LE;
11864         }
11865     }
11866 
11867     return out;
11868 }
11869 /* end file src/arm64/arm_detect_encodings.cpp */
11870 
11871 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16.cpp
11872 /* begin file src/arm64/arm_validate_utf16.cpp */
11873 template <endianness big_endian>
arm_validate_utf16(const char16_t * input,size_t size)11874 const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
11875     const char16_t* end = input + size;
11876     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
11877     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
11878     const auto v_fc = simd8<uint8_t>::splat(0xfc);
11879     const auto v_dc = simd8<uint8_t>::splat(0xdc);
11880     while (input + 16 < end) {
11881         // 0. Load data: since the validation takes into account only higher
11882         //    byte of each word, we compress the two vectors into one which
11883         //    consists only the higher bytes.
11884         auto in0 = simd16<uint16_t>(input);
11885         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
11886         if (!match_system(big_endian)) {
11887             #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
11888             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
11889             #else
11890             const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
11891             #endif
11892             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
11893             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
11894         }
11895         const auto t0 = in0.shr<8>();
11896         const auto t1 = in1.shr<8>();
11897         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
11898         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
11899         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
11900         if(surrogates_wordmask == 0) {
11901             input += 16;
11902         } else {
11903             // 2. We have some surrogates that have to be distinguished:
11904             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
11905             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
11906             //
11907             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
11908 
11909             // V - non-surrogate words
11910             //     V = not surrogates_wordmask
11911             const uint64_t V = ~surrogates_wordmask;
11912 
11913             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
11914             const auto vH = ((in & v_fc) ==  v_dc);
11915             const uint64_t H = vH.to_bitmask64();
11916 
11917             // L - word mask for low surrogates
11918             //     L = not H and surrogates_wordmask
11919             const uint64_t L = ~H & surrogates_wordmask;
11920 
11921             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
11922                               // (A low surrogate placed in the 7th register's word
11923                               // is an exception we handle.)
11924             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
11925                           // thanks to that we have only two masks for valid case.
11926             const uint64_t c = V | a | b;      // Combine all the masks into the final one.
11927             if (c == ~0ull) {
11928                 // The whole input register contains valid UTF-16, i.e.,
11929                 // either single words or proper surrogate pairs.
11930                 input += 16;
11931             } else if (c == 0xfffffffffffffffull) {
11932                 // The 15 lower words of the input register contains valid UTF-16.
11933                 // The 15th word may be either a low or high surrogate. It the next
11934                 // iteration we 1) check if the low surrogate is followed by a high
11935                 // one, 2) reject sole high surrogate.
11936                 input += 15;
11937             } else {
11938                 return nullptr;
11939             }
11940         }
11941     }
11942     return input;
11943 }
11944 
11945 
11946 template <endianness big_endian>
arm_validate_utf16_with_errors(const char16_t * input,size_t size)11947 const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) {
11948     const char16_t* start = input;
11949     const char16_t* end = input + size;
11950 
11951     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
11952     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
11953     const auto v_fc = simd8<uint8_t>::splat(0xfc);
11954     const auto v_dc = simd8<uint8_t>::splat(0xdc);
11955     while (input + 16 < end) {
11956         // 0. Load data: since the validation takes into account only higher
11957         //    byte of each word, we compress the two vectors into one which
11958         //    consists only the higher bytes.
11959         auto in0 = simd16<uint16_t>(input);
11960         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
11961 
11962         if (!match_system(big_endian)) {
11963             #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
11964             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
11965             #else
11966             const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
11967             #endif
11968             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
11969             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
11970         }
11971         const auto t0 = in0.shr<8>();
11972         const auto t1 = in1.shr<8>();
11973         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
11974         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
11975         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
11976         if(surrogates_wordmask == 0) {
11977             input += 16;
11978         } else {
11979             // 2. We have some surrogates that have to be distinguished:
11980             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
11981             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
11982             //
11983             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
11984 
11985             // V - non-surrogate words
11986             //     V = not surrogates_wordmask
11987             const uint64_t V = ~surrogates_wordmask;
11988 
11989             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
11990             const auto vH = ((in & v_fc) ==  v_dc);
11991             const uint64_t H = vH.to_bitmask64();
11992 
11993             // L - word mask for low surrogates
11994             //     L = not H and surrogates_wordmask
11995             const uint64_t L = ~H & surrogates_wordmask;
11996 
11997             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
11998                               // (A low surrogate placed in the 7th register's word
11999                               // is an exception we handle.)
12000             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
12001                           // thanks to that we have only two masks for valid case.
12002             const uint64_t c = V | a | b;      // Combine all the masks into the final one.
12003             if (c == ~0ull) {
12004                 // The whole input register contains valid UTF-16, i.e.,
12005                 // either single words or proper surrogate pairs.
12006                 input += 16;
12007             } else if (c == 0xfffffffffffffffull) {
12008                 // The 15 lower words of the input register contains valid UTF-16.
12009                 // The 15th word may be either a low or high surrogate. It the next
12010                 // iteration we 1) check if the low surrogate is followed by a high
12011                 // one, 2) reject sole high surrogate.
12012                 input += 15;
12013             } else {
12014                 return result(error_code::SURROGATE, input - start);
12015             }
12016         }
12017     }
12018     return result(error_code::SUCCESS, input - start);
12019 }
12020 /* end file src/arm64/arm_validate_utf16.cpp */
12021 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf32le.cpp
12022 /* begin file src/arm64/arm_validate_utf32le.cpp */
12023 
arm_validate_utf32le(const char32_t * input,size_t size)12024 const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
12025     const char32_t* end = input + size;
12026 
12027     const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
12028     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
12029     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
12030     uint32x4_t currentmax = vmovq_n_u32(0x0);
12031     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
12032 
12033     while (input + 4 < end) {
12034         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
12035         currentmax = vmaxq_u32(in,currentmax);
12036         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
12037         input += 4;
12038     }
12039 
12040     uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
12041     if(vmaxvq_u32(is_zero) != 0) {
12042         return nullptr;
12043     }
12044 
12045     is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
12046     if(vmaxvq_u32(is_zero) != 0) {
12047         return nullptr;
12048     }
12049 
12050     return input;
12051 }
12052 
12053 
arm_validate_utf32le_with_errors(const char32_t * input,size_t size)12054 const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) {
12055     const char32_t* start = input;
12056     const char32_t* end = input + size;
12057 
12058     const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
12059     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
12060     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
12061     uint32x4_t currentmax = vmovq_n_u32(0x0);
12062     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
12063 
12064     while (input + 4 < end) {
12065         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
12066         currentmax = vmaxq_u32(in,currentmax);
12067         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
12068 
12069         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
12070         if(vmaxvq_u32(is_zero) != 0) {
12071             return result(error_code::TOO_LARGE, input - start);
12072         }
12073 
12074         is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
12075         if(vmaxvq_u32(is_zero) != 0) {
12076             return result(error_code::SURROGATE, input - start);
12077         }
12078 
12079         input += 4;
12080     }
12081 
12082     return result(error_code::SUCCESS, input - start);
12083 }
12084 /* end file src/arm64/arm_validate_utf32le.cpp */
12085 
12086 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
12087 /* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
12088 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
12089 // end of the code points. Only the least significant 12 bits of the mask
12090 // are accessed.
12091 // It returns how many bytes were consumed (up to 12).
12092 template <endianness big_endian>
convert_masked_utf8_to_utf16(const char * input,uint64_t utf8_end_of_code_point_mask,char16_t * & utf16_output)12093 size_t convert_masked_utf8_to_utf16(const char *input,
12094                            uint64_t utf8_end_of_code_point_mask,
12095                            char16_t *&utf16_output) {
12096   // we use an approach where we try to process up to 12 input bytes.
12097   // Why 12 input bytes and not 16? Because we are concerned with the size of
12098   // the lookup tables. Also 12 is nicely divisible by two and three.
12099   //
12100   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12101   const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
12102   #else
12103   const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
12104   #endif
12105   uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
12106   const uint16_t input_utf8_end_of_code_point_mask =
12107       utf8_end_of_code_point_mask & 0xfff;
12108   //
12109   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
12110   // beneficial to have fast paths that depend on branch prediction but have less latency.
12111   // This results in more instructions but, potentially, also higher speeds.
12112   //
12113   // We first try a few fast paths.
12114   if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
12115     // We process in chunks of 16 bytes
12116     uint16x8_t ascii_first = vmovl_u8(vget_low_u8 (in));
12117     uint16x8_t ascii_second = vmovl_high_u8(in);
12118     if (!match_system(big_endian)) {
12119       ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap));
12120       ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap));
12121     }
12122     vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), ascii_first);
12123     vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output) + 8, ascii_second);
12124     utf16_output += 16; // We wrote 16 16-bit characters.
12125     return 16; // We consumed 16 bytes.
12126   }
12127   if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
12128     // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
12129     // There is probably a more efficient sequence, but the following might do.
12130     uint8x16_t perm = vqtbl1q_u8(in, swap);
12131     uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
12132     uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
12133     uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
12134     if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
12135     vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
12136     utf16_output += 8; // We wrote 16 bytes, 8 code points.
12137     return 16;
12138   }
12139   if(input_utf8_end_of_code_point_mask == 0x924) {
12140     // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
12141     // There is probably a more efficient sequence, but the following might do.
12142 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12143     const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
12144 #else
12145     const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
12146 #endif
12147     uint8x16_t perm = vqtbl1q_u8(in, sh);
12148     uint8x16_t ascii =
12149         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
12150     uint8x16_t middlebyte =
12151         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
12152     uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
12153     uint32x4_t highbyte =
12154         vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
12155     uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
12156     uint32x4_t composed =
12157         vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
12158     uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
12159     if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
12160     vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
12161     utf16_output += 4;
12162     return 12;
12163   }
12164   /// We do not have a fast path available, so we fallback.
12165 
12166   const uint8_t idx =
12167       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
12168   const uint8_t consumed =
12169       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
12170 
12171 
12172   if (idx < 64) {
12173     // SIX (6) input code-words
12174     // this is a relatively easy scenario
12175     // we process SIX (6) input code-words. The max length in bytes of six code
12176     // words spanning between 1 and 2 bytes each is 12 bytes.
12177     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
12178     uint8x16_t perm = vqtbl1q_u8(in, sh);
12179     uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
12180     uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
12181     uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
12182     if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
12183     vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
12184     utf16_output += 6; // We wrote 12 bytes, 6 code points.
12185   } else if (idx < 145) {
12186     // FOUR (4) input code-words
12187     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
12188     uint8x16_t perm = vqtbl1q_u8(in, sh);
12189     uint8x16_t ascii =
12190         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
12191     uint8x16_t middlebyte =
12192         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
12193     uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
12194     uint32x4_t highbyte =
12195         vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
12196     uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
12197     uint32x4_t composed =
12198         vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
12199     uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
12200     if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
12201     vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
12202     utf16_output += 4;
12203   } else if (idx < 209) {
12204     // TWO (2) input code-words
12205     //////////////
12206     // There might be garbage inputs where a leading byte mascarades as a four-byte
12207     // leading byte (by being followed by 3 continuation byte), but is not greater than
12208     // 0xf0. This could trigger a buffer overflow if we only counted leading
12209     // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
12210     // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
12211     // We do as at the cost of an extra mask.
12212     /////////////
12213     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
12214     uint8x16_t perm = vqtbl1q_u8(in, sh);
12215     uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
12216     uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
12217     uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
12218     uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
12219     // correct for spurious high bit
12220     uint8x16_t correct =
12221         vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
12222     middlehighbyte = veorq_u8(correct, middlehighbyte);
12223     uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
12224     // We deliberately carry the leading four bits if they are present, we remove
12225     // them later when computing hightenbits.
12226     uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0xff000000)));
12227     uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
12228     // When we need to generate a surrogate pair (leading byte > 0xF0), then
12229     // the corresponding 32-bit value in 'composed'  will be greater than
12230     // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
12231     // location of the surrogate pairs.
12232     uint8x16_t composed =
12233         vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
12234                      vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
12235     uint32x4_t composedminus =
12236         vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
12237     uint32x4_t lowtenbits =
12238         vandq_u32(composedminus, vmovq_n_u32(0x3ff));
12239     // Notice the 0x3ff mask:
12240     uint32x4_t hightenbits = vandq_u32(vshrq_n_u32(composedminus, 10), vmovq_n_u32(0x3ff));
12241     uint32x4_t lowtenbitsadd =
12242         vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
12243     uint32x4_t hightenbitsadd =
12244         vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
12245     uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
12246     uint32x4_t surrogates =
12247         vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
12248     uint32_t basic_buffer[4];
12249     uint32_t basic_buffer_swap[4];
12250     if (!match_system(big_endian)) {
12251       vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap)));
12252       surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap));
12253     }
12254     vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
12255     uint32_t surrogate_buffer[4];
12256     vst1q_u32(surrogate_buffer, surrogates);
12257     for (size_t i = 0; i < 3; i++) {
12258       if(basic_buffer[i] > 0x3c00000) {
12259         utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
12260         utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
12261         utf16_output += 2;
12262       } else {
12263         utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
12264         utf16_output++;
12265       }
12266     }
12267   } else {
12268     // here we know that there is an error but we do not handle errors
12269   }
12270   return consumed;
12271 }
12272 /* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
12273 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp
12274 /* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
12275 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
12276 // end of the code points. Only the least significant 12 bits of the mask
12277 // are accessed.
12278 // It returns how many bytes were consumed (up to 12).
convert_masked_utf8_to_utf32(const char * input,uint64_t utf8_end_of_code_point_mask,char32_t * & utf32_out)12279 size_t convert_masked_utf8_to_utf32(const char *input,
12280                            uint64_t utf8_end_of_code_point_mask,
12281                            char32_t *&utf32_out) {
12282   // we use an approach where we try to process up to 12 input bytes.
12283   // Why 12 input bytes and not 16? Because we are concerned with the size of
12284   // the lookup tables. Also 12 is nicely divisible by two and three.
12285   //
12286   uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
12287   uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
12288   const uint16_t input_utf8_end_of_code_point_mask =
12289       utf8_end_of_code_point_mask & 0xFFF;
12290   //
12291   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
12292   // beneficial to have fast paths that depend on branch prediction but have less latency.
12293   // This results in more instructions but, potentially, also higher speeds.
12294   //
12295   // We first try a few fast paths.
12296   if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
12297     // We process in chunks of 16 bytes
12298     vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (in)))));
12299     vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8 (in))));
12300     vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in))));
12301     vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in)));
12302     utf32_output += 16; // We wrote 16 16-bit characters.
12303     return 16; // We consumed 16 bytes.
12304   }
12305   if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
12306     // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
12307     // There is probably a more efficient sequence, but the following might do.
12308 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12309     const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
12310 #else
12311     //const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
12312     const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
12313 #endif
12314     uint8x16_t perm = vqtbl1q_u8(in, sh);
12315     uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
12316     uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
12317     uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
12318     vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
12319     vst1q_u32(utf32_output+4,  vmovl_high_u16(vreinterpretq_u16_u8(composed)));
12320     utf32_output += 8; // We wrote 32 bytes, 8 code points.
12321     return 16;
12322   }
12323   if(input_utf8_end_of_code_point_mask == 0x924) {
12324     // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
12325     // There is probably a more efficient sequence, but the following might do.
12326 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12327     const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
12328 #else
12329     const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
12330 #endif
12331     uint8x16_t perm = vqtbl1q_u8(in, sh);
12332     uint8x16_t ascii =
12333         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
12334     uint8x16_t middlebyte =
12335         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
12336     uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
12337     uint32x4_t highbyte =
12338         vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
12339     uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
12340     uint32x4_t composed =
12341         vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
12342     vst1q_u32(utf32_output, composed);
12343     utf32_output += 4;
12344     return 12;
12345   }
12346   /// We do not have a fast path available, so we fallback.
12347 
12348   const uint8_t idx =
12349       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
12350   const uint8_t consumed =
12351       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
12352 
12353 
12354   if (idx < 64) {
12355     // SIX (6) input code-words
12356     // this is a relatively easy scenario
12357     // we process SIX (6) input code-words. The max length in bytes of six code
12358     // words spanning between 1 and 2 bytes each is 12 bytes.
12359     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
12360     uint8x16_t perm = vqtbl1q_u8(in, sh);
12361     uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
12362     uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
12363     uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
12364     vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
12365     vst1q_u32(utf32_output+4,  vmovl_high_u16(vreinterpretq_u16_u8(composed)));
12366     utf32_output += 6; // We wrote 12 bytes, 6 code points.
12367   } else if (idx < 145) {
12368     // FOUR (4) input code-words
12369     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
12370     uint8x16_t perm = vqtbl1q_u8(in, sh);
12371     uint8x16_t ascii =
12372         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
12373     uint8x16_t middlebyte =
12374         vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
12375     uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
12376     uint32x4_t highbyte =
12377         vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
12378     uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
12379     uint32x4_t composed =
12380         vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
12381     vst1q_u32(utf32_output, composed);
12382     utf32_output += 4;
12383   } else if (idx < 209) {
12384     // TWO (2) input code-words
12385     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
12386     uint8x16_t perm = vqtbl1q_u8(in, sh);
12387     uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
12388     uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
12389     uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
12390     uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
12391     // correct for spurious high bit
12392     uint8x16_t correct =
12393         vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
12394     middlehighbyte = veorq_u8(correct, middlehighbyte);
12395     uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
12396     uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
12397     uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
12398     uint8x16_t composed =
12399         vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
12400                      vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
12401     vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed));
12402     utf32_output += 3;
12403   } else {
12404     // here we know that there is an error but we do not handle errors
12405   }
12406   return consumed;
12407 }
12408 /* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
12409 
12410 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp
12411 /* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
12412 /*
12413     The vectorized algorithm works on single SSE register i.e., it
12414     loads eight 16-bit words.
12415 
12416     We consider three cases:
12417     1. an input register contains no surrogates and each value
12418        is in range 0x0000 .. 0x07ff.
12419     2. an input register contains no surrogates and values are
12420        in range 0x0000 .. 0xffff.
12421     3. an input register contains surrogates --- i.e. codepoints
12422        can have 16 or 32 bits.
12423 
12424     Ad 1.
12425 
12426     When values are less than 0x0800, it means that a 16-bit word
12427     can be converted into: 1) single UTF8 byte (when it's an ASCII
12428     char) or 2) two UTF8 bytes.
12429 
12430     For this case we do only some shuffle to obtain these 2-byte
12431     codes and finally compress the whole SSE register with a single
12432     shuffle.
12433 
12434     We need 256-entry lookup table to get a compression pattern
12435     and the number of output bytes in the compressed vector register.
12436     Each entry occupies 17 bytes.
12437 
12438     Ad 2.
12439 
12440     When values fit in 16-bit words, but are above 0x07ff, then
12441     a single word may produce one, two or three UTF8 bytes.
12442 
12443     We prepare data for all these three cases in two registers.
12444     The first register contains lower two UTF8 bytes (used in all
12445     cases), while the second one contains just the third byte for
12446     the three-UTF8-bytes case.
12447 
12448     Finally these two registers are interleaved forming eight-element
12449     array of 32-bit values. The array spans two SSE registers.
12450     The bytes from the registers are compressed using two shuffles.
12451 
12452     We need 256-entry lookup table to get a compression pattern
12453     and the number of output bytes in the compressed vector register.
12454     Each entry occupies 17 bytes.
12455 
12456 
12457     To summarize:
12458     - We need two 256-entry tables that have 8704 bytes in total.
12459 */
12460 /*
12461   Returns a pair: the first unprocessed byte from buf and utf8_output
12462   A scalar routine should carry on the conversion of the tail.
12463 */
12464 template <endianness big_endian>
arm_convert_utf16_to_utf8(const char16_t * buf,size_t len,char * utf8_out)12465 std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) {
12466   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
12467   const char16_t* end = buf + len;
12468 
12469   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
12470   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
12471   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
12472 
12473   while (buf + 16 <= end) {
12474     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
12475     if (!match_system(big_endian)) {
12476       #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12477       const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
12478       #else
12479       const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
12480       #endif
12481       in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
12482     }
12483     if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
12484         // It is common enough that we have sequences of 16 consecutive ASCII characters.
12485         uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
12486         if (!match_system(big_endian)) {
12487           #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12488           const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
12489           #else
12490           const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
12491           #endif
12492           nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
12493         }
12494         if(vmaxvq_u16(nextin) > 0x7F) {
12495           // 1. pack the bytes
12496           // obviously suboptimal.
12497           uint8x8_t utf8_packed = vmovn_u16(in);
12498           // 2. store (8 bytes)
12499           vst1_u8(utf8_output, utf8_packed);
12500           // 3. adjust pointers
12501           buf += 8;
12502           utf8_output += 8;
12503           in = nextin;
12504         } else {
12505           // 1. pack the bytes
12506           // obviously suboptimal.
12507           uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
12508           // 2. store (16 bytes)
12509           vst1q_u8(utf8_output, utf8_packed);
12510           // 3. adjust pointers
12511           buf += 16;
12512           utf8_output += 16;
12513           continue; // we are done for this round!
12514         }
12515     }
12516 
12517     if (vmaxvq_u16(in) <= 0x7FF) {
12518           // 1. prepare 2-byte values
12519           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
12520           // expected output   : [110a|aaaa|10bb|bbbb] x 8
12521           const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
12522           const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
12523 
12524           // t0 = [000a|aaaa|bbbb|bb00]
12525           const uint16x8_t t0 = vshlq_n_u16(in, 2);
12526           // t1 = [000a|aaaa|0000|0000]
12527           const uint16x8_t t1 = vandq_u16(t0, v_1f00);
12528           // t2 = [0000|0000|00bb|bbbb]
12529           const uint16x8_t t2 = vandq_u16(in, v_003f);
12530           // t3 = [000a|aaaa|00bb|bbbb]
12531           const uint16x8_t t3 = vorrq_u16(t1, t2);
12532           // t4 = [110a|aaaa|10bb|bbbb]
12533           const uint16x8_t t4 = vorrq_u16(t3, v_c080);
12534           // 2. merge ASCII and 2-byte codewords
12535           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
12536           const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
12537           const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
12538           // 3. prepare bitmask for 8-bit lookup
12539 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12540           const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
12541                                     0x0010, 0x0040,
12542                                     0x0002, 0x0008,
12543                                     0x0020, 0x0080);
12544 #else
12545           const uint16x8_t mask = { 0x0001, 0x0004,
12546                                     0x0010, 0x0040,
12547                                     0x0002, 0x0008,
12548                                     0x0020, 0x0080 };
12549 #endif
12550           uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
12551           // 4. pack the bytes
12552           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
12553           const uint8x16_t shuffle = vld1q_u8(row + 1);
12554           const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
12555 
12556           // 5. store bytes
12557           vst1q_u8(utf8_output, utf8_packed);
12558 
12559           // 6. adjust pointers
12560           buf += 8;
12561           utf8_output += row[0];
12562           continue;
12563 
12564     }
12565     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
12566     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
12567     // it is likely an uncommon occurrence.
12568       if (vmaxvq_u16(surrogates_bytemask) == 0) {
12569       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
12570 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12571         const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
12572                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
12573 #else
12574         const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
12575                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
12576 #endif
12577         /* In this branch we handle three cases:
12578            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
12579            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
12580            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
12581 
12582           We expand the input word (16-bit) into two words (32-bit), thus
12583           we have room for four bytes. However, we need five distinct bit
12584           layouts. Note that the last byte in cases #2 and #3 is the same.
12585 
12586           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
12587           in register t2.
12588 
12589           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
12590           either byte 1 for case #2 or byte 2 for case #3. Note that they
12591           differ by exactly one bit.
12592 
12593           Finally from these two words we build proper UTF-8 sequence, taking
12594           into account the case (i.e, the number of bytes to write).
12595         */
12596         /**
12597          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
12598          * t2 => [0ccc|cccc] [10cc|cccc]
12599          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
12600          */
12601 #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
12602         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
12603         const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
12604         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
12605         const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
12606         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
12607         const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
12608 
12609         // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
12610         const uint16x8_t s0 = vshrq_n_u16(in, 12);
12611         // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
12612         const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
12613         // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
12614         const uint16x8_t s1s = vshlq_n_u16(s1, 2);
12615         // [00bb|bbbb|0000|aaaa]
12616         const uint16x8_t s2 = vorrq_u16(s0, s1s);
12617         // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
12618         const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
12619         const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
12620         const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
12621         const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
12622         const uint16x8_t s4 = veorq_u16(s3, m0);
12623 #undef simdutf_vec
12624 
12625         // 4. expand words 16-bit => 32-bit
12626         const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
12627         const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
12628 
12629         // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
12630         const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
12631         const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
12632 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12633         const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
12634                                     0x0010, 0x0040,
12635                                     0x0100, 0x0400,
12636                                     0x1000, 0x4000 );
12637         const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
12638                                     0x0020, 0x0080,
12639                                     0x0200, 0x0800,
12640                                     0x2000, 0x8000 );
12641 #else
12642         const uint16x8_t onemask = { 0x0001, 0x0004,
12643                                     0x0010, 0x0040,
12644                                     0x0100, 0x0400,
12645                                     0x1000, 0x4000 };
12646         const uint16x8_t twomask = { 0x0002, 0x0008,
12647                                     0x0020, 0x0080,
12648                                     0x0200, 0x0800,
12649                                     0x2000, 0x8000 };
12650 #endif
12651         const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
12652         const uint16_t mask = vaddvq_u16(combined);
12653         // The following fast path may or may not be beneficial.
12654         /*if(mask == 0) {
12655           // We only have three-byte words. Use fast path.
12656           const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
12657           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
12658           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
12659           vst1q_u8(utf8_output, utf8_0);
12660           utf8_output += 12;
12661           vst1q_u8(utf8_output, utf8_1);
12662           utf8_output += 12;
12663           buf += 8;
12664           continue;
12665         }*/
12666         const uint8_t mask0 = uint8_t(mask);
12667 
12668         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
12669         const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
12670         const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
12671 
12672         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
12673         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
12674         const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
12675         const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
12676 
12677         vst1q_u8(utf8_output, utf8_0);
12678         utf8_output += row0[0];
12679         vst1q_u8(utf8_output, utf8_1);
12680         utf8_output += row1[0];
12681 
12682         buf += 8;
12683     // surrogate pair(s) in a register
12684     } else {
12685       // Let us do a scalar fallback.
12686       // It may seem wasteful to use scalar code, but being efficient with SIMD
12687       // in the presence of surrogate pairs may require non-trivial tables.
12688       size_t forward = 15;
12689       size_t k = 0;
12690       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
12691       for(; k < forward; k++) {
12692         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
12693         if((word & 0xFF80)==0) {
12694           *utf8_output++ = char(word);
12695         } else if((word & 0xF800)==0) {
12696           *utf8_output++ = char((word>>6) | 0b11000000);
12697           *utf8_output++ = char((word & 0b111111) | 0b10000000);
12698         } else if((word &0xF800 ) != 0xD800) {
12699           *utf8_output++ = char((word>>12) | 0b11100000);
12700           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
12701           *utf8_output++ = char((word & 0b111111) | 0b10000000);
12702         } else {
12703           // must be a surrogate pair
12704           uint16_t diff = uint16_t(word - 0xD800);
12705           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
12706           k++;
12707           uint16_t diff2 = uint16_t(next_word - 0xDC00);
12708           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
12709           uint32_t value = (diff << 10) + diff2 + 0x10000;
12710           *utf8_output++ = char((value>>18) | 0b11110000);
12711           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
12712           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
12713           *utf8_output++ = char((value & 0b111111) | 0b10000000);
12714         }
12715       }
12716       buf += k;
12717     }
12718   } // while
12719 
12720   return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
12721 }
12722 
12723 
12724 /*
12725   Returns a pair: a result struct and utf8_output.
12726   If there is an error, the count field of the result is the position of the error.
12727   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
12728   A scalar routing should carry on the conversion of the tail if needed.
12729 */
12730 template <endianness big_endian>
arm_convert_utf16_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_out)12731 std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) {
12732   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
12733     const char16_t* start = buf;
12734   const char16_t* end = buf + len;
12735 
12736   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
12737   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
12738   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
12739 
12740   while (buf + 16 <= end) {
12741     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
12742     if (!match_system(big_endian)) {
12743       #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12744       const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
12745       #else
12746       const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
12747       #endif
12748       in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
12749     }
12750     if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
12751         // It is common enough that we have sequences of 16 consecutive ASCII characters.
12752         uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
12753         if (!match_system(big_endian)) {
12754           #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12755           const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
12756           #else
12757           const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
12758           #endif
12759           nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
12760         }
12761         if(vmaxvq_u16(nextin) > 0x7F) {
12762           // 1. pack the bytes
12763           // obviously suboptimal.
12764           uint8x8_t utf8_packed = vmovn_u16(in);
12765           // 2. store (8 bytes)
12766           vst1_u8(utf8_output, utf8_packed);
12767           // 3. adjust pointers
12768           buf += 8;
12769           utf8_output += 8;
12770           in = nextin;
12771         } else {
12772           // 1. pack the bytes
12773           // obviously suboptimal.
12774           uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
12775           // 2. store (16 bytes)
12776           vst1q_u8(utf8_output, utf8_packed);
12777           // 3. adjust pointers
12778           buf += 16;
12779           utf8_output += 16;
12780           continue; // we are done for this round!
12781         }
12782     }
12783 
12784     if (vmaxvq_u16(in) <= 0x7FF) {
12785           // 1. prepare 2-byte values
12786           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
12787           // expected output   : [110a|aaaa|10bb|bbbb] x 8
12788           const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
12789           const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
12790 
12791           // t0 = [000a|aaaa|bbbb|bb00]
12792           const uint16x8_t t0 = vshlq_n_u16(in, 2);
12793           // t1 = [000a|aaaa|0000|0000]
12794           const uint16x8_t t1 = vandq_u16(t0, v_1f00);
12795           // t2 = [0000|0000|00bb|bbbb]
12796           const uint16x8_t t2 = vandq_u16(in, v_003f);
12797           // t3 = [000a|aaaa|00bb|bbbb]
12798           const uint16x8_t t3 = vorrq_u16(t1, t2);
12799           // t4 = [110a|aaaa|10bb|bbbb]
12800           const uint16x8_t t4 = vorrq_u16(t3, v_c080);
12801           // 2. merge ASCII and 2-byte codewords
12802           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
12803           const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
12804           const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
12805           // 3. prepare bitmask for 8-bit lookup
12806 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12807           const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
12808                                     0x0010, 0x0040,
12809                                     0x0002, 0x0008,
12810                                     0x0020, 0x0080);
12811 #else
12812           const uint16x8_t mask = { 0x0001, 0x0004,
12813                                     0x0010, 0x0040,
12814                                     0x0002, 0x0008,
12815                                     0x0020, 0x0080 };
12816 #endif
12817           uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
12818           // 4. pack the bytes
12819           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
12820           const uint8x16_t shuffle = vld1q_u8(row + 1);
12821           const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
12822 
12823           // 5. store bytes
12824           vst1q_u8(utf8_output, utf8_packed);
12825 
12826           // 6. adjust pointers
12827           buf += 8;
12828           utf8_output += row[0];
12829           continue;
12830 
12831     }
12832     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
12833     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
12834     // it is likely an uncommon occurrence.
12835       if (vmaxvq_u16(surrogates_bytemask) == 0) {
12836       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
12837 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12838         const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
12839                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
12840 #else
12841         const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
12842                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
12843 #endif
12844         /* In this branch we handle three cases:
12845            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
12846            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
12847            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
12848 
12849           We expand the input word (16-bit) into two words (32-bit), thus
12850           we have room for four bytes. However, we need five distinct bit
12851           layouts. Note that the last byte in cases #2 and #3 is the same.
12852 
12853           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
12854           in register t2.
12855 
12856           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
12857           either byte 1 for case #2 or byte 2 for case #3. Note that they
12858           differ by exactly one bit.
12859 
12860           Finally from these two words we build proper UTF-8 sequence, taking
12861           into account the case (i.e, the number of bytes to write).
12862         */
12863         /**
12864          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
12865          * t2 => [0ccc|cccc] [10cc|cccc]
12866          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
12867          */
12868 #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
12869         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
12870         const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
12871         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
12872         const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
12873         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
12874         const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
12875 
12876         // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
12877         const uint16x8_t s0 = vshrq_n_u16(in, 12);
12878         // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
12879         const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
12880         // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
12881         const uint16x8_t s1s = vshlq_n_u16(s1, 2);
12882         // [00bb|bbbb|0000|aaaa]
12883         const uint16x8_t s2 = vorrq_u16(s0, s1s);
12884         // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
12885         const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
12886         const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
12887         const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
12888         const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
12889         const uint16x8_t s4 = veorq_u16(s3, m0);
12890 #undef simdutf_vec
12891 
12892         // 4. expand words 16-bit => 32-bit
12893         const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
12894         const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
12895 
12896         // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
12897         const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
12898         const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
12899 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12900         const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
12901                                     0x0010, 0x0040,
12902                                     0x0100, 0x0400,
12903                                     0x1000, 0x4000 );
12904         const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
12905                                     0x0020, 0x0080,
12906                                     0x0200, 0x0800,
12907                                     0x2000, 0x8000 );
12908 #else
12909         const uint16x8_t onemask = { 0x0001, 0x0004,
12910                                     0x0010, 0x0040,
12911                                     0x0100, 0x0400,
12912                                     0x1000, 0x4000 };
12913         const uint16x8_t twomask = { 0x0002, 0x0008,
12914                                     0x0020, 0x0080,
12915                                     0x0200, 0x0800,
12916                                     0x2000, 0x8000 };
12917 #endif
12918         const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
12919         const uint16_t mask = vaddvq_u16(combined);
12920         // The following fast path may or may not be beneficial.
12921         /*if(mask == 0) {
12922           // We only have three-byte words. Use fast path.
12923           const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
12924           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
12925           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
12926           vst1q_u8(utf8_output, utf8_0);
12927           utf8_output += 12;
12928           vst1q_u8(utf8_output, utf8_1);
12929           utf8_output += 12;
12930           buf += 8;
12931           continue;
12932         }*/
12933         const uint8_t mask0 = uint8_t(mask);
12934 
12935         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
12936         const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
12937         const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
12938 
12939         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
12940         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
12941         const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
12942         const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
12943 
12944         vst1q_u8(utf8_output, utf8_0);
12945         utf8_output += row0[0];
12946         vst1q_u8(utf8_output, utf8_1);
12947         utf8_output += row1[0];
12948 
12949         buf += 8;
12950     // surrogate pair(s) in a register
12951     } else {
12952       // Let us do a scalar fallback.
12953       // It may seem wasteful to use scalar code, but being efficient with SIMD
12954       // in the presence of surrogate pairs may require non-trivial tables.
12955       size_t forward = 15;
12956       size_t k = 0;
12957       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
12958       for(; k < forward; k++) {
12959         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
12960         if((word & 0xFF80)==0) {
12961           *utf8_output++ = char(word);
12962         } else if((word & 0xF800)==0) {
12963           *utf8_output++ = char((word>>6) | 0b11000000);
12964           *utf8_output++ = char((word & 0b111111) | 0b10000000);
12965         } else if((word &0xF800 ) != 0xD800) {
12966           *utf8_output++ = char((word>>12) | 0b11100000);
12967           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
12968           *utf8_output++ = char((word & 0b111111) | 0b10000000);
12969         } else {
12970           // must be a surrogate pair
12971           uint16_t diff = uint16_t(word - 0xD800);
12972           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
12973           k++;
12974           uint16_t diff2 = uint16_t(next_word - 0xDC00);
12975           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output)); }
12976           uint32_t value = (diff << 10) + diff2 + 0x10000;
12977           *utf8_output++ = char((value>>18) | 0b11110000);
12978           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
12979           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
12980           *utf8_output++ = char((value & 0b111111) | 0b10000000);
12981         }
12982       }
12983       buf += k;
12984     }
12985   } // while
12986 
12987   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
12988 }
12989 /* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
12990 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp
12991 /* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
/*
    The vectorized algorithm works on a single NEON register, i.e., it
    loads eight 16-bit words.

    Note: the case analysis and lookup tables described below concern the
    production of UTF-8 bytes; this text appears to be carried over from
    the UTF-16 => UTF-8 converter. The UTF-16 => UTF-32 routines in this
    file only distinguish registers without surrogates (every 16-bit word
    is zero-extended to 32 bits) from registers with surrogates (scalar
    fallback).

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. codepoints
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit word
    can be converted into: 1) a single UTF8 byte (when it's an ASCII
    char) or 2) two UTF8 bytes.

    For this case we do only some shuffles to obtain these 2-byte
    codes and finally compress the whole NEON register with a single
    shuffle.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit words, but are above 0x07ff, then
    a single word may produce one, two or three UTF8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains the lower two UTF8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF8-bytes case.

    Finally these two registers are interleaved, forming an eight-element
    array of 32-bit values. The array spans two NEON registers.
    The bytes from the registers are compressed using two shuffles.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.


    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.
*/
13040 /*
13041   Returns a pair: the first unprocessed byte from buf and utf8_output
13042   A scalar routing should carry on the conversion of the tail.
13043 */
13044 template <endianness big_endian>
arm_convert_utf16_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_out)13045 std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) {
13046   uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
13047   const char16_t* end = buf + len;
13048 
13049   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
13050   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
13051 
13052   while (buf + 16 <= end) {
13053     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
13054     if (!match_system(big_endian)) {
13055       #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13056       const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
13057       #else
13058       const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
13059       #endif
13060       in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
13061     }
13062 
13063     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
13064     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
13065     // it is likely an uncommon occurrence.
13066       if (vmaxvq_u16(surrogates_bytemask) == 0) {
13067       // case: no surrogate pairs, extend all 16-bit words to 32-bit words
13068       vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
13069       vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
13070       utf32_output += 8;
13071       buf += 8;
13072     // surrogate pair(s) in a register
13073     } else {
13074       // Let us do a scalar fallback.
13075       // It may seem wasteful to use scalar code, but being efficient with SIMD
13076       // in the presence of surrogate pairs may require non-trivial tables.
13077       size_t forward = 15;
13078       size_t k = 0;
13079       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
13080       for(; k < forward; k++) {
13081         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
13082         if((word &0xF800 ) != 0xD800) {
13083           *utf32_output++ = char32_t(word);
13084         } else {
13085           // must be a surrogate pair
13086           uint16_t diff = uint16_t(word - 0xD800);
13087           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
13088           k++;
13089           uint16_t diff2 = uint16_t(next_word - 0xDC00);
13090           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output)); }
13091           uint32_t value = (diff << 10) + diff2 + 0x10000;
13092           *utf32_output++ = char32_t(value);
13093         }
13094       }
13095       buf += k;
13096     }
13097   } // while
13098   return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
13099 }
13100 
13101 
13102 /*
13103   Returns a pair: a result struct and utf8_output.
13104   If there is an error, the count field of the result is the position of the error.
13105   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
13106   A scalar routing should carry on the conversion of the tail if needed.
13107 */
13108 template <endianness big_endian>
arm_convert_utf16_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_out)13109 std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) {
13110   uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
13111   const char16_t* start = buf;
13112   const char16_t* end = buf + len;
13113 
13114   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
13115   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
13116 
13117   while (buf + 16 <= end) {
13118     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
13119     if (!match_system(big_endian)) {
13120       #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13121       const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
13122       #else
13123       const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
13124       #endif
13125       in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
13126     }
13127 
13128     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
13129     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
13130     // it is likely an uncommon occurrence.
13131       if (vmaxvq_u16(surrogates_bytemask) == 0) {
13132       // case: no surrogate pairs, extend all 16-bit words to 32-bit words
13133       vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
13134       vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
13135       utf32_output += 8;
13136       buf += 8;
13137     // surrogate pair(s) in a register
13138     } else {
13139       // Let us do a scalar fallback.
13140       // It may seem wasteful to use scalar code, but being efficient with SIMD
13141       // in the presence of surrogate pairs may require non-trivial tables.
13142       size_t forward = 15;
13143       size_t k = 0;
13144       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
13145       for(; k < forward; k++) {
13146         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
13147         if((word &0xF800 ) != 0xD800) {
13148           *utf32_output++ = char32_t(word);
13149         } else {
13150           // must be a surrogate pair
13151           uint16_t diff = uint16_t(word - 0xD800);
13152           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
13153           k++;
13154           uint16_t diff2 = uint16_t(next_word - 0xDC00);
13155           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output)); }
13156           uint32_t value = (diff << 10) + diff2 + 0x10000;
13157           *utf32_output++ = char32_t(value);
13158         }
13159       }
13160       buf += k;
13161     }
13162   } // while
13163   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
13164 }
13165 /* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
13166 
13167 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp
13168 /* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
arm_convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_out)13169 std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) {
13170   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
13171   const char32_t* end = buf + len;
13172 
13173   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
13174 
13175   uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
13176 
13177   while (buf + 16 <= end) {
13178     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
13179     uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
13180 
13181     // Check if no bits set above 16th
13182     if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
13183       // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
13184       // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
13185       uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
13186       if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
13187           // 1. pack the bytes
13188           // obviously suboptimal.
13189           uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
13190           // 2. store (8 bytes)
13191           vst1_u8(utf8_output, utf8_packed);
13192           // 3. adjust pointers
13193           buf += 8;
13194           utf8_output += 8;
13195           continue; // we are done for this round!
13196       }
13197 
13198       if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
13199             // 1. prepare 2-byte values
13200             // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
13201             // expected output   : [110a|aaaa|10bb|bbbb] x 8
13202             const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
13203             const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
13204 
13205             // t0 = [000a|aaaa|bbbb|bb00]
13206             const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
13207             // t1 = [000a|aaaa|0000|0000]
13208             const uint16x8_t t1 = vandq_u16(t0, v_1f00);
13209             // t2 = [0000|0000|00bb|bbbb]
13210             const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
13211             // t3 = [000a|aaaa|00bb|bbbb]
13212             const uint16x8_t t3 = vorrq_u16(t1, t2);
13213             // t4 = [110a|aaaa|10bb|bbbb]
13214             const uint16x8_t t4 = vorrq_u16(t3, v_c080);
13215             // 2. merge ASCII and 2-byte codewords
13216             const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
13217             const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
13218             const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
13219             // 3. prepare bitmask for 8-bit lookup
13220   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13221             const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
13222                                       0x0010, 0x0040,
13223                                       0x0002, 0x0008,
13224                                       0x0020, 0x0080);
13225   #else
13226             const uint16x8_t mask = { 0x0001, 0x0004,
13227                                       0x0010, 0x0040,
13228                                       0x0002, 0x0008,
13229                                       0x0020, 0x0080 };
13230   #endif
13231             uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
13232             // 4. pack the bytes
13233             const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
13234             const uint8x16_t shuffle = vld1q_u8(row + 1);
13235             const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
13236 
13237             // 5. store bytes
13238             vst1q_u8(utf8_output, utf8_packed);
13239 
13240             // 6. adjust pointers
13241             buf += 8;
13242             utf8_output += row[0];
13243             continue;
13244 
13245       } else {
13246         // case: words from register produce either 1, 2 or 3 UTF-8 bytes
13247         const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
13248         const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
13249         forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
13250 
13251   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13252           const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
13253                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
13254   #else
13255           const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
13256                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
13257   #endif
13258           /* In this branch we handle three cases:
13259             1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
13260             2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
13261             3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
13262 
13263             We expand the input word (16-bit) into two words (32-bit), thus
13264             we have room for four bytes. However, we need five distinct bit
13265             layouts. Note that the last byte in cases #2 and #3 is the same.
13266 
13267             We precompute byte 1 for case #1 and the common byte for cases #2 & #3
13268             in register t2.
13269 
13270             We precompute byte 1 for case #3 and -- **conditionally** -- precompute
13271             either byte 1 for case #2 or byte 2 for case #3. Note that they
13272             differ by exactly one bit.
13273 
13274             Finally from these two words we build proper UTF-8 sequence, taking
13275             into account the case (i.e, the number of bytes to write).
13276           */
13277           /**
13278            * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
13279            * t2 => [0ccc|cccc] [10cc|cccc]
13280            * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
13281            */
13282   #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
13283           // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
13284           const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
13285           // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
13286           const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
13287           // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
13288           const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
13289 
13290           // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
13291           const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
13292           // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
13293           const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
13294           // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
13295           const uint16x8_t s1s = vshlq_n_u16(s1, 2);
13296           // [00bb|bbbb|0000|aaaa]
13297           const uint16x8_t s2 = vorrq_u16(s0, s1s);
13298           // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
13299           const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
13300           const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
13301           const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
13302           const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
13303           const uint16x8_t s4 = veorq_u16(s3, m0);
13304   #undef simdutf_vec
13305 
13306           // 4. expand words 16-bit => 32-bit
13307           const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
13308           const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
13309 
13310           // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
13311           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
13312           const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
13313   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13314           const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
13315                                       0x0010, 0x0040,
13316                                       0x0100, 0x0400,
13317                                       0x1000, 0x4000 );
13318           const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
13319                                       0x0020, 0x0080,
13320                                       0x0200, 0x0800,
13321                                       0x2000, 0x8000 );
13322   #else
13323           const uint16x8_t onemask = { 0x0001, 0x0004,
13324                                       0x0010, 0x0040,
13325                                       0x0100, 0x0400,
13326                                       0x1000, 0x4000 };
13327           const uint16x8_t twomask = { 0x0002, 0x0008,
13328                                       0x0020, 0x0080,
13329                                       0x0200, 0x0800,
13330                                       0x2000, 0x8000 };
13331   #endif
13332           const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
13333           const uint16_t mask = vaddvq_u16(combined);
13334           // The following fast path may or may not be beneficial.
13335           /*if(mask == 0) {
13336             // We only have three-byte words. Use fast path.
13337             const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
13338             const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
13339             const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
13340             vst1q_u8(utf8_output, utf8_0);
13341             utf8_output += 12;
13342             vst1q_u8(utf8_output, utf8_1);
13343             utf8_output += 12;
13344             buf += 8;
13345             continue;
13346           }*/
13347           const uint8_t mask0 = uint8_t(mask);
13348           const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
13349           const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
13350           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
13351 
13352           const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
13353           const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
13354           const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
13355           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
13356 
13357           vst1q_u8(utf8_output, utf8_0);
13358           utf8_output += row0[0];
13359           vst1q_u8(utf8_output, utf8_1);
13360           utf8_output += row1[0];
13361 
13362           buf += 8;
13363       }
13364     // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
13365     } else {
13366       // Let us do a scalar fallback.
13367       // It may seem wasteful to use scalar code, but being efficient with SIMD
13368       // in the presence of surrogate pairs may require non-trivial tables.
13369       size_t forward = 15;
13370       size_t k = 0;
13371       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
13372       for(; k < forward; k++) {
13373         uint32_t word = buf[k];
13374         if((word & 0xFFFFFF80)==0) {
13375           *utf8_output++ = char(word);
13376         } else if((word & 0xFFFFF800)==0) {
13377           *utf8_output++ = char((word>>6) | 0b11000000);
13378           *utf8_output++ = char((word & 0b111111) | 0b10000000);
13379         } else if((word & 0xFFFF0000)==0) {
13380           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
13381           *utf8_output++ = char((word>>12) | 0b11100000);
13382           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
13383           *utf8_output++ = char((word & 0b111111) | 0b10000000);
13384         } else {
13385           if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
13386           *utf8_output++ = char((word>>18) | 0b11110000);
13387           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
13388           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
13389           *utf8_output++ = char((word & 0b111111) | 0b10000000);
13390         }
13391       }
13392       buf += k;
13393     }
13394   } // while
13395 
13396   // check for invalid input
13397   if (vmaxvq_u16(forbidden_bytemask) != 0) {
13398     return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
13399   }
13400   return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
13401 }
13402 
13403 
arm_convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_out)13404 std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) {
13405   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
13406   const char32_t* start = buf;
13407   const char32_t* end = buf + len;
13408 
13409   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
13410 
13411   while (buf + 16 <= end) {
13412     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
13413     uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
13414 
13415     // Check if no bits set above 16th
13416     if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
13417       // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
13418       // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
13419       uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
13420       if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
13421           // 1. pack the bytes
13422           // obviously suboptimal.
13423           uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
13424           // 2. store (8 bytes)
13425           vst1_u8(utf8_output, utf8_packed);
13426           // 3. adjust pointers
13427           buf += 8;
13428           utf8_output += 8;
13429           continue; // we are done for this round!
13430       }
13431 
13432       if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
13433             // 1. prepare 2-byte values
13434             // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
13435             // expected output   : [110a|aaaa|10bb|bbbb] x 8
13436             const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
13437             const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
13438 
13439             // t0 = [000a|aaaa|bbbb|bb00]
13440             const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
13441             // t1 = [000a|aaaa|0000|0000]
13442             const uint16x8_t t1 = vandq_u16(t0, v_1f00);
13443             // t2 = [0000|0000|00bb|bbbb]
13444             const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
13445             // t3 = [000a|aaaa|00bb|bbbb]
13446             const uint16x8_t t3 = vorrq_u16(t1, t2);
13447             // t4 = [110a|aaaa|10bb|bbbb]
13448             const uint16x8_t t4 = vorrq_u16(t3, v_c080);
13449             // 2. merge ASCII and 2-byte codewords
13450             const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
13451             const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
13452             const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
13453             // 3. prepare bitmask for 8-bit lookup
13454   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13455             const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
13456                                       0x0010, 0x0040,
13457                                       0x0002, 0x0008,
13458                                       0x0020, 0x0080);
13459   #else
13460             const uint16x8_t mask = { 0x0001, 0x0004,
13461                                       0x0010, 0x0040,
13462                                       0x0002, 0x0008,
13463                                       0x0020, 0x0080 };
13464   #endif
13465             uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
13466             // 4. pack the bytes
13467             const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
13468             const uint8x16_t shuffle = vld1q_u8(row + 1);
13469             const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
13470 
13471             // 5. store bytes
13472             vst1q_u8(utf8_output, utf8_packed);
13473 
13474             // 6. adjust pointers
13475             buf += 8;
13476             utf8_output += row[0];
13477             continue;
13478 
13479       } else {
13480         // case: words from register produce either 1, 2 or 3 UTF-8 bytes
13481 
13482         // check for invalid input
13483         const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
13484         const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
13485         const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
13486         if (vmaxvq_u16(forbidden_bytemask) != 0) {
13487           return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
13488         }
13489 
13490   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13491           const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
13492                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
13493   #else
13494           const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
13495                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
13496   #endif
13497           /* In this branch we handle three cases:
13498             1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
13499             2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
13500             3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
13501 
13502             We expand the input word (16-bit) into two words (32-bit), thus
13503             we have room for four bytes. However, we need five distinct bit
13504             layouts. Note that the last byte in cases #2 and #3 is the same.
13505 
13506             We precompute byte 1 for case #1 and the common byte for cases #2 & #3
13507             in register t2.
13508 
13509             We precompute byte 1 for case #3 and -- **conditionally** -- precompute
13510             either byte 1 for case #2 or byte 2 for case #3. Note that they
13511             differ by exactly one bit.
13512 
13513             Finally from these two words we build proper UTF-8 sequence, taking
13514             into account the case (i.e, the number of bytes to write).
13515           */
13516           /**
13517            * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
13518            * t2 => [0ccc|cccc] [10cc|cccc]
13519            * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
13520            */
13521   #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
13522           // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
13523           const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
13524           // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
13525           const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
13526           // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
13527           const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
13528 
13529           // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
13530           const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
13531           // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
13532           const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
13533           // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
13534           const uint16x8_t s1s = vshlq_n_u16(s1, 2);
13535           // [00bb|bbbb|0000|aaaa]
13536           const uint16x8_t s2 = vorrq_u16(s0, s1s);
13537           // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
13538           const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
13539           const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
13540           const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
13541           const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
13542           const uint16x8_t s4 = veorq_u16(s3, m0);
13543   #undef simdutf_vec
13544 
13545           // 4. expand words 16-bit => 32-bit
13546           const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
13547           const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
13548 
13549           // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
13550           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
13551           const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
13552   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13553           const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
13554                                       0x0010, 0x0040,
13555                                       0x0100, 0x0400,
13556                                       0x1000, 0x4000 );
13557           const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
13558                                       0x0020, 0x0080,
13559                                       0x0200, 0x0800,
13560                                       0x2000, 0x8000 );
13561   #else
13562           const uint16x8_t onemask = { 0x0001, 0x0004,
13563                                       0x0010, 0x0040,
13564                                       0x0100, 0x0400,
13565                                       0x1000, 0x4000 };
13566           const uint16x8_t twomask = { 0x0002, 0x0008,
13567                                       0x0020, 0x0080,
13568                                       0x0200, 0x0800,
13569                                       0x2000, 0x8000 };
13570   #endif
13571           const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
13572           const uint16_t mask = vaddvq_u16(combined);
13573           // The following fast path may or may not be beneficial.
13574           /*if(mask == 0) {
13575             // We only have three-byte words. Use fast path.
13576             const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
13577             const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
13578             const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
13579             vst1q_u8(utf8_output, utf8_0);
13580             utf8_output += 12;
13581             vst1q_u8(utf8_output, utf8_1);
13582             utf8_output += 12;
13583             buf += 8;
13584             continue;
13585           }*/
13586           const uint8_t mask0 = uint8_t(mask);
13587 
13588           const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
13589           const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
13590           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
13591 
13592           const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
13593           const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
13594           const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
13595           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
13596 
13597           vst1q_u8(utf8_output, utf8_0);
13598           utf8_output += row0[0];
13599           vst1q_u8(utf8_output, utf8_1);
13600           utf8_output += row1[0];
13601 
13602           buf += 8;
13603       }
13604     // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
13605     } else {
13606       // Let us do a scalar fallback.
13607       // It may seem wasteful to use scalar code, but being efficient with SIMD
13608       // in the presence of surrogate pairs may require non-trivial tables.
13609       size_t forward = 15;
13610       size_t k = 0;
13611       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
13612       for(; k < forward; k++) {
13613         uint32_t word = buf[k];
13614         if((word & 0xFFFFFF80)==0) {
13615           *utf8_output++ = char(word);
13616         } else if((word & 0xFFFFF800)==0) {
13617           *utf8_output++ = char((word>>6) | 0b11000000);
13618           *utf8_output++ = char((word & 0b111111) | 0b10000000);
13619         } else if((word & 0xFFFF0000)==0) {
13620           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
13621           *utf8_output++ = char((word>>12) | 0b11100000);
13622           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
13623           *utf8_output++ = char((word & 0b111111) | 0b10000000);
13624         } else {
13625           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
13626           *utf8_output++ = char((word>>18) | 0b11110000);
13627           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
13628           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
13629           *utf8_output++ = char((word & 0b111111) | 0b10000000);
13630         }
13631       }
13632       buf += k;
13633     }
13634   } // while
13635 
13636   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
13637 }
13638 /* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
13639 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp
13640 /* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
13641 template <endianness big_endian>
arm_convert_utf32_to_utf16(const char32_t * buf,size_t len,char16_t * utf16_out)13642 std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) {
13643   uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
13644   const char32_t* end = buf + len;
13645 
13646   uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
13647 
13648   while(buf + 4 <= end) {
13649     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
13650 
13651     // Check if no bits set above 16th
13652     if(vmaxvq_u32(in) <= 0xFFFF) {
13653       uint16x4_t utf16_packed = vmovn_u32(in);
13654 
13655       const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
13656       const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
13657       forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
13658 
13659       if (!match_system(big_endian)) {
13660         #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13661         const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
13662         #else
13663         const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
13664         #endif
13665         utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
13666       }
13667       vst1_u16(utf16_output, utf16_packed);
13668       utf16_output += 4;
13669       buf += 4;
13670     } else {
13671       size_t forward = 3;
13672       size_t k = 0;
13673       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
13674       for(; k < forward; k++) {
13675         uint32_t word = buf[k];
13676         if((word & 0xFFFF0000)==0) {
13677           // will not generate a surrogate pair
13678           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
13679           *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
13680         } else {
13681           // will generate a surrogate pair
13682           if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
13683           word -= 0x10000;
13684           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
13685           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
13686           if (!match_system(big_endian)) {
13687             high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
13688             low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
13689           }
13690           *utf16_output++ = char16_t(high_surrogate);
13691           *utf16_output++ = char16_t(low_surrogate);
13692         }
13693       }
13694       buf += k;
13695     }
13696   }
13697 
13698   // check for invalid input
13699   if (vmaxv_u16(forbidden_bytemask) != 0) {
13700     return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
13701   }
13702 
13703   return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
13704 }
13705 
13706 
13707 template <endianness big_endian>
arm_convert_utf32_to_utf16_with_errors(const char32_t * buf,size_t len,char16_t * utf16_out)13708 std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) {
13709   uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
13710   const char32_t* start = buf;
13711   const char32_t* end = buf + len;
13712 
13713   while(buf + 4 <= end) {
13714     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
13715 
13716     // Check if no bits set above 16th
13717     if(vmaxvq_u32(in) <= 0xFFFF) {
13718       uint16x4_t utf16_packed = vmovn_u32(in);
13719 
13720       const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
13721       const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
13722       const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
13723       if (vmaxv_u16(forbidden_bytemask) != 0) {
13724         return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
13725       }
13726 
13727       if (!match_system(big_endian)) {
13728         #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13729         const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
13730         #else
13731         const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
13732         #endif
13733         utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
13734       }
13735       vst1_u16(utf16_output, utf16_packed);
13736       utf16_output += 4;
13737       buf += 4;
13738     } else {
13739       size_t forward = 3;
13740       size_t k = 0;
13741       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
13742       for(; k < forward; k++) {
13743         uint32_t word = buf[k];
13744         if((word & 0xFFFF0000)==0) {
13745           // will not generate a surrogate pair
13746           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
13747           *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
13748         } else {
13749           // will generate a surrogate pair
13750           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
13751           word -= 0x10000;
13752           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
13753           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
13754           if (!match_system(big_endian)) {
13755             high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
13756             low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
13757           }
13758           *utf16_output++ = char16_t(high_surrogate);
13759           *utf16_output++ = char16_t(low_surrogate);
13760         }
13761       }
13762       buf += k;
13763     }
13764   }
13765 
13766   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
13767 }
13768 /* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
13769 } // unnamed namespace
13770 } // namespace arm64
13771 } // namespace simdutf
13772 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
13773 /* begin file src/generic/buf_block_reader.h */
13774 namespace simdutf {
13775 namespace arm64 {
13776 namespace {
13777 
13778 // Walks through a buffer in block-sized increments, loading the last part with spaces
13779 template<size_t STEP_SIZE>
13780 struct buf_block_reader {
13781 public:
13782   simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
13783   simdutf_really_inline size_t block_index();
13784   simdutf_really_inline bool has_full_block() const;
13785   simdutf_really_inline const uint8_t *full_block() const;
13786   /**
13787    * Get the last block, padded with spaces.
13788    *
13789    * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
13790    * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
13791    * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
13792    *
13793    * @return the number of effective characters in the last block.
13794    */
13795   simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
13796   simdutf_really_inline void advance();
13797 private:
13798   const uint8_t *buf;
13799   const size_t len;
13800   const size_t lenminusstep;
13801   size_t idx;
13802 };
13803 
13804 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)13805 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
13806   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
13807   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
13808     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
13809   }
13810   buf[sizeof(simd8x64<uint8_t>)] = '\0';
13811   return buf;
13812 }
13813 
13814 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)13815 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
13816   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
13817   in.store(reinterpret_cast<uint8_t*>(buf));
13818   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
13819     if (buf[i] < ' ') { buf[i] = '_'; }
13820   }
13821   buf[sizeof(simd8x64<uint8_t>)] = '\0';
13822   return buf;
13823 }
13824 
format_mask(uint64_t mask)13825 simdutf_unused static char * format_mask(uint64_t mask) {
13826   static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
13827   for (size_t i=0; i<64; i++) {
13828     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
13829   }
13830   buf[64] = '\0';
13831   return buf;
13832 }
13833 
13834 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)13835 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
13836 
13837 template<size_t STEP_SIZE>
block_index()13838 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
13839 
13840 template<size_t STEP_SIZE>
has_full_block() const13841 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
13842   return idx < lenminusstep;
13843 }
13844 
13845 template<size_t STEP_SIZE>
full_block() const13846 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
13847   return &buf[idx];
13848 }
13849 
13850 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const13851 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
13852   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
13853   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
13854   std::memcpy(dst, buf + idx, len - idx);
13855   return len - idx;
13856 }
13857 
13858 template<size_t STEP_SIZE>
advance()13859 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
13860   idx += STEP_SIZE;
13861 }
13862 
13863 } // unnamed namespace
13864 } // namespace arm64
13865 } // namespace simdutf
13866 /* end file src/generic/buf_block_reader.h */
13867 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
13868 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
13869 namespace simdutf {
13870 namespace arm64 {
13871 namespace {
13872 namespace utf8_validation {
13873 
13874 using namespace simd;
13875 
  // Classifies every pair of adjacent bytes (prev1 = first byte, input = second byte) with
  // three 16-entry table lookups: the high nibble of the first byte, the low nibble of the
  // first byte and the high nibble of the second byte. The three lookups are ANDed, so an
  // error bit survives only when all three nibbles are compatible with that error pattern.
  // Returns the surviving bits for each byte position.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (second byte 1001____ or 101_____)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large with second byte 1000____, shared with Overlong 4-byte
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Shares bit 6 with TOO_LARGE_1000: both require 1111____ 1000____ and are told apart
    // only by the low nibble of the first byte (see the byte_1_low table).
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Errors that remain possible given the high nibble of the first byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // CARRY bits are decided without looking at the low nibble of the first byte, so every
    // entry of the low-nibble table must let them through.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Errors that remain possible given the low nibble of the first byte.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Errors that remain possible given the high nibble of the second byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // Only the errors allowed by all three nibbles remain set.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)13966   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
13967       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
13968     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
13969     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
13970     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
13971     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
13972     return must23_80 ^ sc;
13973   }
13974 
13975   //
13976   // Return nonzero if there are incomplete multibyte characters at the end of the block:
13977   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
13978   //
is_incomplete(const simd8<uint8_t> input)13979   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
13980     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
13981     // ... 1111____ 111_____ 11______
13982     static const uint8_t max_array[32] = {
13983       255, 255, 255, 255, 255, 255, 255, 255,
13984       255, 255, 255, 255, 255, 255, 255, 255,
13985       255, 255, 255, 255, 255, 255, 255, 255,
13986       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
13987     };
13988     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
13989     return input.gt_bits(max_value);
13990   }
13991 
13992   struct utf8_checker {
13993     // If this is nonzero, there has been a UTF-8 error.
13994     simd8<uint8_t> error;
13995     // The last input we received
13996     simd8<uint8_t> prev_input_block;
13997     // Whether the last input we received was incomplete (used for ASCII fast path)
13998     simd8<uint8_t> prev_incomplete;
13999 
14000     //
14001     // Check whether the current bytes are valid UTF-8.
14002     //
check_utf8_bytessimdutf::arm64::__anone55652eb2611::utf8_validation::utf8_checker14003     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
14004       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
14005       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
14006       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
14007       simd8<uint8_t> sc = check_special_cases(input, prev1);
14008       this->error |= check_multibyte_lengths(input, prev_input, sc);
14009     }
14010 
14011     // The only problem that can happen at EOF is that a multibyte character is too short
14012     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
14013     // too large in the first of two bytes.
check_eofsimdutf::arm64::__anone55652eb2611::utf8_validation::utf8_checker14014     simdutf_really_inline void check_eof() {
14015       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
14016       // possibly finish them.
14017       this->error |= this->prev_incomplete;
14018     }
14019 
check_next_inputsimdutf::arm64::__anone55652eb2611::utf8_validation::utf8_checker14020     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
14021       if(simdutf_likely(is_ascii(input))) {
14022         this->error |= this->prev_incomplete;
14023       } else {
14024         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
14025         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
14026             "We support either two or four chunks per 64-byte block.");
14027         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
14028           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
14029           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14030         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
14031           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
14032           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14033           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
14034           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
14035         }
14036         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
14037         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
14038 
14039       }
14040     }
14041 
14042     // do not forget to call check_eof!
errorssimdutf::arm64::__anone55652eb2611::utf8_validation::utf8_checker14043     simdutf_really_inline bool errors() const {
14044       return this->error.any_bits_set_anywhere();
14045     }
14046 
14047   }; // struct utf8_checker
14048 } // namespace utf8_validation
14049 
14050 using utf8_validation::utf8_checker;
14051 
14052 } // unnamed namespace
14053 } // namespace arm64
14054 } // namespace simdutf
14055 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
14056 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
14057 /* begin file src/generic/utf8_validation/utf8_validator.h */
14058 namespace simdutf {
14059 namespace arm64 {
14060 namespace {
14061 namespace utf8_validation {
14062 
14063 /**
14064  * Validates that the string is actual UTF-8.
14065  */
14066 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)14067 bool generic_validate_utf8(const uint8_t * input, size_t length) {
14068     checker c{};
14069     buf_block_reader<64> reader(input, length);
14070     while (reader.has_full_block()) {
14071       simd::simd8x64<uint8_t> in(reader.full_block());
14072       c.check_next_input(in);
14073       reader.advance();
14074     }
14075     uint8_t block[64]{};
14076     reader.get_remainder(block);
14077     simd::simd8x64<uint8_t> in(block);
14078     c.check_next_input(in);
14079     reader.advance();
14080     c.check_eof();
14081     return !c.errors();
14082 }
14083 
generic_validate_utf8(const char * input,size_t length)14084 bool generic_validate_utf8(const char * input, size_t length) {
14085   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
14086 }
14087 
14088 /**
14089  * Validates that the string is actual UTF-8 and stops on errors.
14090  */
14091 template<class checker>
generic_validate_utf8_with_errors(const uint8_t * input,size_t length)14092 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
14093     checker c{};
14094     buf_block_reader<64> reader(input, length);
14095     size_t count{0};
14096     while (reader.has_full_block()) {
14097       simd::simd8x64<uint8_t> in(reader.full_block());
14098       c.check_next_input(in);
14099       if(c.errors()) {
14100         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
14101         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
14102         res.count += count;
14103         return res;
14104       }
14105       reader.advance();
14106       count += 64;
14107     }
14108     uint8_t block[64]{};
14109     reader.get_remainder(block);
14110     simd::simd8x64<uint8_t> in(block);
14111     c.check_next_input(in);
14112     reader.advance();
14113     c.check_eof();
14114     if (c.errors()) {
14115       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
14116       res.count += count;
14117       return res;
14118     } else {
14119       return result(error_code::SUCCESS, length);
14120     }
14121 }
14122 
generic_validate_utf8_with_errors(const char * input,size_t length)14123 result generic_validate_utf8_with_errors(const char * input, size_t length) {
14124   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
14125 }
14126 
14127 template<class checker>
generic_validate_ascii(const uint8_t * input,size_t length)14128 bool generic_validate_ascii(const uint8_t * input, size_t length) {
14129     buf_block_reader<64> reader(input, length);
14130     uint8_t blocks[64]{};
14131     simd::simd8x64<uint8_t> running_or(blocks);
14132     while (reader.has_full_block()) {
14133       simd::simd8x64<uint8_t> in(reader.full_block());
14134       running_or |= in;
14135       reader.advance();
14136     }
14137     uint8_t block[64]{};
14138     reader.get_remainder(block);
14139     simd::simd8x64<uint8_t> in(block);
14140     running_or |= in;
14141     return running_or.is_ascii();
14142 }
14143 
generic_validate_ascii(const char * input,size_t length)14144 bool generic_validate_ascii(const char * input, size_t length) {
14145   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
14146 }
14147 
14148 template<class checker>
generic_validate_ascii_with_errors(const uint8_t * input,size_t length)14149 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
14150   buf_block_reader<64> reader(input, length);
14151   size_t count{0};
14152   while (reader.has_full_block()) {
14153     simd::simd8x64<uint8_t> in(reader.full_block());
14154     if (!in.is_ascii()) {
14155       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
14156       return result(res.error, count + res.count);
14157     }
14158     reader.advance();
14159 
14160     count += 64;
14161   }
14162   uint8_t block[64]{};
14163   reader.get_remainder(block);
14164   simd::simd8x64<uint8_t> in(block);
14165   if (!in.is_ascii()) {
14166     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
14167     return result(res.error, count + res.count);
14168   } else {
14169     return result(error_code::SUCCESS, length);
14170   }
14171 }
14172 
generic_validate_ascii_with_errors(const char * input,size_t length)14173 result generic_validate_ascii_with_errors(const char * input, size_t length) {
14174   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
14175 }
14176 
14177 } // namespace utf8_validation
14178 } // unnamed namespace
14179 } // namespace arm64
14180 } // namespace simdutf
14181 /* end file src/generic/utf8_validation/utf8_validator.h */
14182 // transcoding from UTF-8 to UTF-16
14183 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
14184 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
14185 
14186 
14187 namespace simdutf {
14188 namespace arm64 {
14189 namespace {
14190 namespace utf8_to_utf16 {
14191 
14192 using namespace simd;
14193 
14194 template <endianness endian>
convert_valid(const char * input,size_t size,char16_t * utf16_output)14195 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
14196     char16_t* utf16_output) noexcept {
14197   // The implementation is not specific to haswell and should be moved to the generic directory.
14198   size_t pos = 0;
14199   char16_t* start{utf16_output};
14200   const size_t safety_margin = 16; // to avoid overruns!
14201   while(pos + 64 + safety_margin <= size) {
14202     // this loop could be unrolled further. For example, we could process the mask
14203     // far more than 64 bytes.
14204     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
14205     if(in.is_ascii()) {
14206       in.store_ascii_as_utf16<endian>(utf16_output);
14207       utf16_output += 64;
14208       pos += 64;
14209     } else {
14210       // Slow path. We hope that the compiler will recognize that this is a slow path.
14211       // Anything that is not a continuation mask is a 'leading byte', that is, the
14212       // start of a new code point.
14213       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
14214       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
14215       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
14216       // The *start* of code points is not so useful, rather, we want the *end* of code points.
14217       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
14218       // We process in blocks of up to 12 bytes except possibly
14219       // for fast paths which may process up to 16 bytes. For the
14220       // slow path to work, we should have at least 12 input bytes left.
14221       size_t max_starting_point = (pos + 64) - 12;
14222       // Next loop is going to run at least five times when using solely
14223       // the slow/regular path, and at least four times if there are fast paths.
14224       while(pos < max_starting_point) {
14225         // Performance note: our ability to compute 'consumed' and
14226         // then shift and recompute is critical. If there is a
14227         // latency of, say, 4 cycles on getting 'consumed', then
14228         // the inner loop might have a total latency of about 6 cycles.
14229         // Yet we process between 6 to 12 inputs bytes, thus we get
14230         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
14231         // for this section of the code. Hence, there is a limit
14232         // to how much we can further increase this latency before
14233         // it seriously harms performance.
14234         //
14235         // Thus we may allow convert_masked_utf8_to_utf16 to process
14236         // more bytes at a time under a fast-path mode where 16 bytes
14237         // are consumed at once (e.g., when encountering ASCII).
14238         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
14239                             utf8_end_of_code_point_mask, utf16_output);
14240         pos += consumed;
14241         utf8_end_of_code_point_mask >>= consumed;
14242       }
14243       // At this point there may remain between 0 and 12 bytes in the
14244       // 64-byte block. These bytes will be processed again. So we have an
14245       // 80% efficiency (in the worst case). In practice we expect an
14246       // 85% to 90% efficiency.
14247     }
14248   }
14249   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
14250   return utf16_output - start;
14251 }
14252 
14253 } // namespace utf8_to_utf16
14254 } // unnamed namespace
14255 } // namespace arm64
14256 } // namespace simdutf
14257 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
14258 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
14259 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
14260 
14261 
14262 namespace simdutf {
14263 namespace arm64 {
14264 namespace {
14265 namespace utf8_to_utf16 {
14266 using namespace simd;
14267 
14268 
  // Classifies every pair of adjacent bytes (prev1 = first byte, input = second byte) with
  // three 16-entry table lookups: the high nibble of the first byte, the low nibble of the
  // first byte and the high nibble of the second byte. The three lookups are ANDed, so an
  // error bit survives only when all three nibbles are compatible with that error pattern.
  // Returns the surviving bits for each byte position.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (second byte 1001____ or 101_____)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large with second byte 1000____, shared with Overlong 4-byte
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Shares bit 6 with TOO_LARGE_1000: both require 1111____ 1000____ and are told apart
    // only by the low nibble of the first byte (see the byte_1_low table).
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Errors that remain possible given the high nibble of the first byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // CARRY bits are decided without looking at the low nibble of the first byte, so every
    // entry of the low-nibble table must let them through.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Errors that remain possible given the low nibble of the first byte.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Errors that remain possible given the high nibble of the second byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // Only the errors allowed by all three nibbles remain set.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)14359   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
14360       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
14361     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
14362     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
14363     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
14364     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
14365     return must23_80 ^ sc;
14366   }
14367 
14368 
14369   struct validating_transcoder {
14370     // If this is nonzero, there has been a UTF-8 error.
14371     simd8<uint8_t> error;
14372 
validating_transcodersimdutf::arm64::__anone55652eb2911::utf8_to_utf16::validating_transcoder14373     validating_transcoder() : error(uint8_t(0)) {}
14374     //
14375     // Check whether the current bytes are valid UTF-8.
14376     //
check_utf8_bytessimdutf::arm64::__anone55652eb2911::utf8_to_utf16::validating_transcoder14377     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
14378       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
14379       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
14380       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
14381       simd8<uint8_t> sc = check_special_cases(input, prev1);
14382       this->error |= check_multibyte_lengths(input, prev_input, sc);
14383     }
14384 
14385 
14386     template <endianness endian>
convertsimdutf::arm64::__anone55652eb2911::utf8_to_utf16::validating_transcoder14387     simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
14388       size_t pos = 0;
14389       char16_t* start{utf16_output};
14390       // In the worst case, we have the haswell kernel which can cause an overflow of
14391       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
14392       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
14393       // much more than 8 bytes. However, you cannot generally assume that you have valid
14394       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
14395       // to give us a good margin.
14396       size_t leading_byte = 0;
14397       size_t margin = size;
14398       for(; margin > 0 && leading_byte < 8; margin--) {
14399         leading_byte += (int8_t(in[margin-1]) > -65);
14400       }
14401       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
14402       const size_t safety_margin = size - margin + 1; // to avoid overruns!
14403       while(pos + 64 + safety_margin <= size) {
14404         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
14405         if(input.is_ascii()) {
14406           input.store_ascii_as_utf16<endian>(utf16_output);
14407           utf16_output += 64;
14408           pos += 64;
14409         } else {
14410           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
14411           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
14412               "We support either two or four chunks per 64-byte block.");
14413           auto zero = simd8<uint8_t>{uint8_t(0)};
14414           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
14415             this->check_utf8_bytes(input.chunks[0], zero);
14416             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14417           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
14418             this->check_utf8_bytes(input.chunks[0], zero);
14419             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14420             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
14421             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
14422           }
14423           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
14424           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
14425           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
14426           // We process in blocks of up to 12 bytes except possibly
14427           // for fast paths which may process up to 16 bytes. For the
14428           // slow path to work, we should have at least 12 input bytes left.
14429           size_t max_starting_point = (pos + 64) - 12;
14430           // Next loop is going to run at least five times.
14431           while(pos < max_starting_point) {
14432             // Performance note: our ability to compute 'consumed' and
14433             // then shift and recompute is critical. If there is a
14434             // latency of, say, 4 cycles on getting 'consumed', then
14435             // the inner loop might have a total latency of about 6 cycles.
14436             // Yet we process between 6 to 12 inputs bytes, thus we get
14437             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
14438             // for this section of the code. Hence, there is a limit
14439             // to how much we can further increase this latency before
14440             // it seriously harms performance.
14441             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
14442                             utf8_end_of_code_point_mask, utf16_output);
14443             pos += consumed;
14444             utf8_end_of_code_point_mask >>= consumed;
14445           }
14446           // At this point there may remain between 0 and 12 bytes in the
14447           // 64-byte block. These bytes will be processed again. So we have an
14448           // 80% efficiency (in the worst case). In practice we expect an
14449           // 85% to 90% efficiency.
14450         }
14451       }
14452       if(errors()) { return 0; }
14453       if(pos < size) {
14454         size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
14455         if(howmany == 0) { return 0; }
14456         utf16_output += howmany;
14457       }
14458       return utf16_output - start;
14459     }
14460 
14461     template <endianness endian>
convert_with_errorssimdutf::arm64::__anone55652eb2911::utf8_to_utf16::validating_transcoder14462     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
14463       size_t pos = 0;
14464       char16_t* start{utf16_output};
14465       // In the worst case, we have the haswell kernel which can cause an overflow of
14466       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
14467       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
14468       // much more than 8 bytes. However, you cannot generally assume that you have valid
14469       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
14470       // to give us a good margin.
14471       size_t leading_byte = 0;
14472       size_t margin = size;
14473       for(; margin > 0 && leading_byte < 8; margin--) {
14474         leading_byte += (int8_t(in[margin-1]) > -65);
14475       }
14476       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
14477       const size_t safety_margin = size - margin + 1; // to avoid overruns!
14478       while(pos + 64 + safety_margin <= size) {
14479         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
14480         if(input.is_ascii()) {
14481           input.store_ascii_as_utf16<endian>(utf16_output);
14482           utf16_output += 64;
14483           pos += 64;
14484         } else {
14485           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
14486           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
14487               "We support either two or four chunks per 64-byte block.");
14488           auto zero = simd8<uint8_t>{uint8_t(0)};
14489           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
14490             this->check_utf8_bytes(input.chunks[0], zero);
14491             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14492           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
14493             this->check_utf8_bytes(input.chunks[0], zero);
14494             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14495             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
14496             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
14497           }
14498           if (errors()) {
14499             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
14500             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
14501             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
14502             res.count += pos;
14503             return res;
14504           }
14505           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
14506           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
14507           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
14508           // We process in blocks of up to 12 bytes except possibly
14509           // for fast paths which may process up to 16 bytes. For the
14510           // slow path to work, we should have at least 12 input bytes left.
14511           size_t max_starting_point = (pos + 64) - 12;
14512           // Next loop is going to run at least five times.
14513           while(pos < max_starting_point) {
14514             // Performance note: our ability to compute 'consumed' and
14515             // then shift and recompute is critical. If there is a
14516             // latency of, say, 4 cycles on getting 'consumed', then
14517             // the inner loop might have a total latency of about 6 cycles.
14518             // Yet we process between 6 to 12 inputs bytes, thus we get
14519             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
14520             // for this section of the code. Hence, there is a limit
14521             // to how much we can further increase this latency before
14522             // it seriously harms performance.
14523             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
14524                             utf8_end_of_code_point_mask, utf16_output);
14525             pos += consumed;
14526             utf8_end_of_code_point_mask >>= consumed;
14527           }
14528           // At this point there may remain between 0 and 12 bytes in the
14529           // 64-byte block. These bytes will be processed again. So we have an
14530           // 80% efficiency (in the worst case). In practice we expect an
14531           // 85% to 90% efficiency.
14532         }
14533       }
14534       if(errors()) {
14535         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
14536         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
14537         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
14538         res.count += pos;
14539         return res;
14540       }
14541       if(pos < size) {
14542         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
14543         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
14544         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
14545         if (res.error) {    // In case of error, we want the error position
14546           res.count += pos;
14547           return res;
14548         } else {    // In case of success, we want the number of word written
14549           utf16_output += res.count;
14550         }
14551       }
14552       return result(error_code::SUCCESS, utf16_output - start);
14553     }
14554 
errorssimdutf::arm64::__anone55652eb2911::utf8_to_utf16::validating_transcoder14555     simdutf_really_inline bool errors() const {
14556       return this->error.any_bits_set_anywhere();
14557     }
14558 
14559   }; // struct utf8_checker
14560 } // utf8_to_utf16 namespace
14561 } // unnamed namespace
14562 } // namespace arm64
14563 } // namespace simdutf
14564 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
14565 // transcoding from UTF-8 to UTF-32
14566 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
14567 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
14568 
14569 namespace simdutf {
14570 namespace arm64 {
14571 namespace {
14572 namespace utf8_to_utf32 {
14573 
14574 using namespace simd;
14575 
14576 
convert_valid(const char * input,size_t size,char32_t * utf32_output)14577 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
14578     char32_t* utf32_output) noexcept {
14579   size_t pos = 0;
14580   char32_t* start{utf32_output};
14581   const size_t safety_margin = 16; // to avoid overruns!
14582   while(pos + 64 + safety_margin <= size) {
14583     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
14584     if(in.is_ascii()) {
14585       in.store_ascii_as_utf32(utf32_output);
14586       utf32_output += 64;
14587       pos += 64;
14588     } else {
14589     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
14590     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
14591     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
14592     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
14593     size_t max_starting_point = (pos + 64) - 12;
14594     while(pos < max_starting_point) {
14595       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
14596                           utf8_end_of_code_point_mask, utf32_output);
14597       pos += consumed;
14598       utf8_end_of_code_point_mask >>= consumed;
14599       }
14600     }
14601   }
14602   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
14603   return utf32_output - start;
14604 }
14605 
14606 
14607 } // namespace utf8_to_utf32
14608 } // unnamed namespace
14609 } // namespace arm64
14610 } // namespace simdutf
14611 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
14612 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
14613 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
14614 
14615 
14616 namespace simdutf {
14617 namespace arm64 {
14618 namespace {
14619 namespace utf8_to_utf32 {
14620 using namespace simd;
14621 
14622 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)14623   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
14624 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
14625 // Bit 1 = Too Long (ASCII followed by continuation)
14626 // Bit 2 = Overlong 3-byte
14627 // Bit 4 = Surrogate
14628 // Bit 5 = Overlong 2-byte
14629 // Bit 7 = Two Continuations
14630     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
14631                                                 // 11______ 11______
14632     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
14633     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
14634     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
14635     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
14636     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
14637     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
14638                                                 // 11110100 101_____
14639                                                 // 11110101 1001____
14640                                                 // 11110101 101_____
14641                                                 // 1111011_ 1001____
14642                                                 // 1111011_ 101_____
14643                                                 // 11111___ 1001____
14644                                                 // 11111___ 101_____
14645     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
14646                                                 // 11110101 1000____
14647                                                 // 1111011_ 1000____
14648                                                 // 11111___ 1000____
14649     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
14650 
14651     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
14652       // 0_______ ________ <ASCII in byte 1>
14653       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
14654       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
14655       // 10______ ________ <continuation in byte 1>
14656       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
14657       // 1100____ ________ <two byte lead in byte 1>
14658       TOO_SHORT | OVERLONG_2,
14659       // 1101____ ________ <two byte lead in byte 1>
14660       TOO_SHORT,
14661       // 1110____ ________ <three byte lead in byte 1>
14662       TOO_SHORT | OVERLONG_3 | SURROGATE,
14663       // 1111____ ________ <four+ byte lead in byte 1>
14664       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
14665     );
14666     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
14667     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
14668       // ____0000 ________
14669       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
14670       // ____0001 ________
14671       CARRY | OVERLONG_2,
14672       // ____001_ ________
14673       CARRY,
14674       CARRY,
14675 
14676       // ____0100 ________
14677       CARRY | TOO_LARGE,
14678       // ____0101 ________
14679       CARRY | TOO_LARGE | TOO_LARGE_1000,
14680       // ____011_ ________
14681       CARRY | TOO_LARGE | TOO_LARGE_1000,
14682       CARRY | TOO_LARGE | TOO_LARGE_1000,
14683 
14684       // ____1___ ________
14685       CARRY | TOO_LARGE | TOO_LARGE_1000,
14686       CARRY | TOO_LARGE | TOO_LARGE_1000,
14687       CARRY | TOO_LARGE | TOO_LARGE_1000,
14688       CARRY | TOO_LARGE | TOO_LARGE_1000,
14689       CARRY | TOO_LARGE | TOO_LARGE_1000,
14690       // ____1101 ________
14691       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
14692       CARRY | TOO_LARGE | TOO_LARGE_1000,
14693       CARRY | TOO_LARGE | TOO_LARGE_1000
14694     );
14695     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
14696       // ________ 0_______ <ASCII in byte 2>
14697       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
14698       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
14699 
14700       // ________ 1000____
14701       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
14702       // ________ 1001____
14703       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
14704       // ________ 101_____
14705       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
14706       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
14707 
14708       // ________ 11______
14709       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
14710     );
14711     return (byte_1_high & byte_1_low & byte_2_high);
14712   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)14713   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
14714       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
14715     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
14716     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
14717     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
14718     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
14719     return must23_80 ^ sc;
14720   }
14721 
14722 
14723   struct validating_transcoder {
14724     // If this is nonzero, there has been a UTF-8 error.
14725     simd8<uint8_t> error;
14726 
validating_transcodersimdutf::arm64::__anone55652eb2b11::utf8_to_utf32::validating_transcoder14727     validating_transcoder() : error(uint8_t(0)) {}
14728     //
14729     // Check whether the current bytes are valid UTF-8.
14730     //
check_utf8_bytessimdutf::arm64::__anone55652eb2b11::utf8_to_utf32::validating_transcoder14731     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
14732       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
14733       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
14734       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
14735       simd8<uint8_t> sc = check_special_cases(input, prev1);
14736       this->error |= check_multibyte_lengths(input, prev_input, sc);
14737     }
14738 
14739 
14740 
convertsimdutf::arm64::__anone55652eb2b11::utf8_to_utf32::validating_transcoder14741     simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
14742       size_t pos = 0;
14743       char32_t* start{utf32_output};
14744       // In the worst case, we have the haswell kernel which can cause an overflow of
14745       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
14746       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
14747       // much more than 8 bytes. However, you cannot generally assume that you have valid
14748       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
14749       // to give us a good margin.
14750       size_t leading_byte = 0;
14751       size_t margin = size;
14752       for(; margin > 0 && leading_byte < 4; margin--) {
14753         leading_byte += (int8_t(in[margin-1]) > -65);
14754       }
14755       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
14756       const size_t safety_margin = size - margin + 1; // to avoid overruns!
14757       while(pos + 64 + safety_margin <= size) {
14758         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
14759         if(input.is_ascii()) {
14760           input.store_ascii_as_utf32(utf32_output);
14761           utf32_output += 64;
14762           pos += 64;
14763         } else {
14764           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
14765           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
14766               "We support either two or four chunks per 64-byte block.");
14767           auto zero = simd8<uint8_t>{uint8_t(0)};
14768           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
14769             this->check_utf8_bytes(input.chunks[0], zero);
14770             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14771           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
14772             this->check_utf8_bytes(input.chunks[0], zero);
14773             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14774             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
14775             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
14776           }
14777           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
14778           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
14779           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
14780           // We process in blocks of up to 12 bytes except possibly
14781           // for fast paths which may process up to 16 bytes. For the
14782           // slow path to work, we should have at least 12 input bytes left.
14783           size_t max_starting_point = (pos + 64) - 12;
14784           // Next loop is going to run at least five times.
14785           while(pos < max_starting_point) {
14786             // Performance note: our ability to compute 'consumed' and
14787             // then shift and recompute is critical. If there is a
14788             // latency of, say, 4 cycles on getting 'consumed', then
14789             // the inner loop might have a total latency of about 6 cycles.
14790             // Yet we process between 6 to 12 inputs bytes, thus we get
14791             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
14792             // for this section of the code. Hence, there is a limit
14793             // to how much we can further increase this latency before
14794             // it seriously harms performance.
14795             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
14796                             utf8_end_of_code_point_mask, utf32_output);
14797             pos += consumed;
14798             utf8_end_of_code_point_mask >>= consumed;
14799           }
14800           // At this point there may remain between 0 and 12 bytes in the
14801           // 64-byte block. These bytes will be processed again. So we have an
14802           // 80% efficiency (in the worst case). In practice we expect an
14803           // 85% to 90% efficiency.
14804         }
14805       }
14806       if(errors()) { return 0; }
14807       if(pos < size) {
14808         size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
14809         if(howmany == 0) { return 0; }
14810         utf32_output += howmany;
14811       }
14812       return utf32_output - start;
14813     }
14814 
convert_with_errorssimdutf::arm64::__anone55652eb2b11::utf8_to_utf32::validating_transcoder14815     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
14816       size_t pos = 0;
14817       char32_t* start{utf32_output};
14818       // In the worst case, we have the haswell kernel which can cause an overflow of
14819       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
14820       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
14821       // much more than 8 bytes. However, you cannot generally assume that you have valid
14822       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
14823       // to give us a good margin.
14824       size_t leading_byte = 0;
14825       size_t margin = size;
14826       for(; margin > 0 && leading_byte < 4; margin--) {
14827         leading_byte += (int8_t(in[margin-1]) > -65);
14828       }
14829       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
14830       const size_t safety_margin = size - margin + 1; // to avoid overruns!
14831       while(pos + 64 + safety_margin <= size) {
14832         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
14833         if(input.is_ascii()) {
14834           input.store_ascii_as_utf32(utf32_output);
14835           utf32_output += 64;
14836           pos += 64;
14837         } else {
14838           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
14839           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
14840               "We support either two or four chunks per 64-byte block.");
14841           auto zero = simd8<uint8_t>{uint8_t(0)};
14842           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
14843             this->check_utf8_bytes(input.chunks[0], zero);
14844             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14845           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
14846             this->check_utf8_bytes(input.chunks[0], zero);
14847             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
14848             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
14849             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
14850           }
14851           if (errors()) {
14852             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
14853             res.count += pos;
14854             return res;
14855           }
14856           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
14857           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
14858           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
14859           // We process in blocks of up to 12 bytes except possibly
14860           // for fast paths which may process up to 16 bytes. For the
14861           // slow path to work, we should have at least 12 input bytes left.
14862           size_t max_starting_point = (pos + 64) - 12;
14863           // Next loop is going to run at least five times.
14864           while(pos < max_starting_point) {
14865             // Performance note: our ability to compute 'consumed' and
14866             // then shift and recompute is critical. If there is a
14867             // latency of, say, 4 cycles on getting 'consumed', then
14868             // the inner loop might have a total latency of about 6 cycles.
14869             // Yet we process between 6 to 12 inputs bytes, thus we get
14870             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
14871             // for this section of the code. Hence, there is a limit
14872             // to how much we can further increase this latency before
14873             // it seriously harms performance.
14874             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
14875                             utf8_end_of_code_point_mask, utf32_output);
14876             pos += consumed;
14877             utf8_end_of_code_point_mask >>= consumed;
14878           }
14879           // At this point there may remain between 0 and 12 bytes in the
14880           // 64-byte block. These bytes will be processed again. So we have an
14881           // 80% efficiency (in the worst case). In practice we expect an
14882           // 85% to 90% efficiency.
14883         }
14884       }
14885       if(errors()) {
14886         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
14887         res.count += pos;
14888         return res;
14889       }
14890       if(pos < size) {
14891         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
14892         if (res.error) {    // In case of error, we want the error position
14893           res.count += pos;
14894           return res;
14895         } else {    // In case of success, we want the number of word written
14896           utf32_output += res.count;
14897         }
14898       }
14899       return result(error_code::SUCCESS, utf32_output - start);
14900     }
14901 
errorssimdutf::arm64::__anone55652eb2b11::utf8_to_utf32::validating_transcoder14902     simdutf_really_inline bool errors() const {
14903       return this->error.any_bits_set_anywhere();
14904     }
14905 
14906   }; // struct utf8_checker
14907 } // utf8_to_utf32 namespace
14908 } // unnamed namespace
14909 } // namespace arm64
14910 } // namespace simdutf
14911 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
14912 // other functions
14913 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h
14914 /* begin file src/generic/utf8.h */
14915 
14916 namespace simdutf {
14917 namespace arm64 {
14918 namespace {
14919 namespace utf8 {
14920 
14921 using namespace simd;
14922 
count_code_points(const char * in,size_t size)14923 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
14924     size_t pos = 0;
14925     size_t count = 0;
14926     for(;pos + 64 <= size; pos += 64) {
14927       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
14928       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
14929       count += 64 - count_ones(utf8_continuation_mask);
14930     }
14931     return count + scalar::utf8::count_code_points(in + pos, size - pos);
14932 }
14933 
14934 
utf16_length_from_utf8(const char * in,size_t size)14935 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
14936     size_t pos = 0;
14937     size_t count = 0;
14938     // This algorithm could no doubt be improved!
14939     for(;pos + 64 <= size; pos += 64) {
14940       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
14941       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
14942       // We count one word for anything that is not a continuation (so
14943       // leading bytes).
14944       count += 64 - count_ones(utf8_continuation_mask);
14945       int64_t utf8_4byte = input.gteq_unsigned(240);
14946       count += count_ones(utf8_4byte);
14947     }
14948     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
14949 }
14950 
14951 
utf32_length_from_utf8(const char * in,size_t size)14952 simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
14953     return count_code_points(in, size);
14954 }
14955 } // utf8 namespace
14956 } // unnamed namespace
14957 } // namespace arm64
14958 } // namespace simdutf
14959 /* end file src/generic/utf8.h */
14960 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h
14961 /* begin file src/generic/utf16.h */
14962 namespace simdutf {
14963 namespace arm64 {
14964 namespace {
14965 namespace utf16 {
14966 
14967 template <endianness big_endian>
count_code_points(const char16_t * in,size_t size)14968 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
14969     size_t pos = 0;
14970     size_t count = 0;
14971     for(;pos + 32 <= size; pos += 32) {
14972       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
14973       if (!match_system(big_endian)) input.swap_bytes();
14974       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
14975       count += count_ones(not_pair) / 2;
14976     }
14977     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
14978 }
14979 
14980 template <endianness big_endian>
utf8_length_from_utf16(const char16_t * in,size_t size)14981 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
14982     size_t pos = 0;
14983     size_t count = 0;
14984     // This algorithm could no doubt be improved!
14985     for(;pos + 32 <= size; pos += 32) {
14986       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
14987       if (!match_system(big_endian)) input.swap_bytes();
14988       uint64_t ascii_mask = input.lteq(0x7F);
14989       uint64_t twobyte_mask = input.lteq(0x7FF);
14990       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
14991 
14992       size_t ascii_count = count_ones(ascii_mask) / 2;
14993       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
14994       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
14995       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
14996       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
14997     }
14998     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
14999 }
15000 
15001 template <endianness big_endian>
utf32_length_from_utf16(const char16_t * in,size_t size)15002 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
15003     return count_code_points<big_endian>(in, size);
15004 }
15005 
change_endianness_utf16(const char16_t * in,size_t size,char16_t * output)15006 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
15007   size_t pos = 0;
15008 
15009   while (pos + 32 <= size) {
15010     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
15011     input.swap_bytes();
15012     input.store(reinterpret_cast<uint16_t *>(output));
15013     pos += 32;
15014     output += 32;
15015   }
15016 
15017   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
15018 }
15019 
15020 } // utf16
15021 } // unnamed namespace
15022 } // namespace arm64
15023 } // namespace simdutf
15024 /* end file src/generic/utf16.h */
15025 //
15026 // Implementation-specific overrides
15027 //
15028 namespace simdutf {
15029 namespace arm64 {
15030 
detect_encodings(const char * input,size_t length) const15031 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
15032   // If there is a BOM, then we trust it.
15033   auto bom_encoding = simdutf::BOM::check_bom(input, length);
15034   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
15035   if (length % 2 == 0) {
15036     return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
15037   } else {
15038     if (implementation::validate_utf8(input, length)) {
15039       return simdutf::encoding_type::UTF8;
15040     } else {
15041       return simdutf::encoding_type::unspecified;
15042     }
15043   }
15044 }
15045 
validate_utf8(const char * buf,size_t len) const15046 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
15047   return arm64::utf8_validation::generic_validate_utf8(buf,len);
15048 }
15049 
validate_utf8_with_errors(const char * buf,size_t len) const15050 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
15051   return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
15052 }
15053 
validate_ascii(const char * buf,size_t len) const15054 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
15055   return arm64::utf8_validation::generic_validate_ascii(buf,len);
15056 }
15057 
validate_ascii_with_errors(const char * buf,size_t len) const15058 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
15059   return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
15060 }
15061 
validate_utf16le(const char16_t * buf,size_t len) const15062 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
15063   const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
15064   if (tail) {
15065     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
15066   } else {
15067     return false;
15068   }
15069 }
15070 
validate_utf16be(const char16_t * buf,size_t len) const15071 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
15072   const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
15073   if (tail) {
15074     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
15075   } else {
15076     return false;
15077   }
15078 }
15079 
validate_utf16le_with_errors(const char16_t * buf,size_t len) const15080 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
15081   result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
15082   if (res.count != len) {
15083     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
15084     return result(scalar_res.error, res.count + scalar_res.count);
15085   } else {
15086     return res;
15087   }
15088 }
15089 
validate_utf16be_with_errors(const char16_t * buf,size_t len) const15090 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
15091   result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
15092   if (res.count != len) {
15093     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
15094     return result(scalar_res.error, res.count + scalar_res.count);
15095   } else {
15096     return res;
15097   }
15098 }
15099 
validate_utf32(const char32_t * buf,size_t len) const15100 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
15101   const char32_t* tail = arm_validate_utf32le(buf, len);
15102   if (tail) {
15103     return scalar::utf32::validate(tail, len - (tail - buf));
15104   } else {
15105     return false;
15106   }
15107 }
15108 
validate_utf32_with_errors(const char32_t * buf,size_t len) const15109 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
15110   result res = arm_validate_utf32le_with_errors(buf, len);
15111   if (res.count != len) {
15112     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
15113     return result(scalar_res.error, res.count + scalar_res.count);
15114   } else {
15115     return res;
15116   }
15117 }
15118 
convert_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const15119 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15120   utf8_to_utf16::validating_transcoder converter;
15121   return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
15122 }
15123 
convert_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const15124 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15125   utf8_to_utf16::validating_transcoder converter;
15126   return converter.convert<endianness::BIG>(buf, len, utf16_output);
15127 }
15128 
convert_utf8_to_utf16le_with_errors(const char * buf,size_t len,char16_t * utf16_output) const15129 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15130   utf8_to_utf16::validating_transcoder converter;
15131   return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
15132 }
15133 
convert_utf8_to_utf16be_with_errors(const char * buf,size_t len,char16_t * utf16_output) const15134 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15135   utf8_to_utf16::validating_transcoder converter;
15136   return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
15137 }
15138 
convert_valid_utf8_to_utf16le(const char * input,size_t size,char16_t * utf16_output) const15139 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
15140     char16_t* utf16_output) const noexcept {
15141   return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
15142 }
15143 
convert_valid_utf8_to_utf16be(const char * input,size_t size,char16_t * utf16_output) const15144 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
15145     char16_t* utf16_output) const noexcept {
15146   return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
15147 }
15148 
convert_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_output) const15149 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
15150   utf8_to_utf32::validating_transcoder converter;
15151   return converter.convert(buf, len, utf32_output);
15152 }
15153 
convert_utf8_to_utf32_with_errors(const char * buf,size_t len,char32_t * utf32_output) const15154 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
15155   utf8_to_utf32::validating_transcoder converter;
15156   return converter.convert_with_errors(buf, len, utf32_output);
15157 }
15158 
convert_valid_utf8_to_utf32(const char * input,size_t size,char32_t * utf32_output) const15159 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
15160     char32_t* utf32_output) const noexcept {
15161   return utf8_to_utf32::convert_valid(input, size,  utf32_output);
15162 }
15163 
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15164 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15165   std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
15166   if (ret.first == nullptr) { return 0; }
15167   size_t saved_bytes = ret.second - utf8_output;
15168   if (ret.first != buf + len) {
15169     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
15170                                         ret.first, len - (ret.first - buf), ret.second);
15171     if (scalar_saved_bytes == 0) { return 0; }
15172     saved_bytes += scalar_saved_bytes;
15173   }
15174   return saved_bytes;
15175 }
15176 
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15177 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15178   std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
15179   if (ret.first == nullptr) { return 0; }
15180   size_t saved_bytes = ret.second - utf8_output;
15181   if (ret.first != buf + len) {
15182     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
15183                                         ret.first, len - (ret.first - buf), ret.second);
15184     if (scalar_saved_bytes == 0) { return 0; }
15185     saved_bytes += scalar_saved_bytes;
15186   }
15187   return saved_bytes;
15188 }
15189 
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const15190 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15191   // ret.first.count is always the position in the buffer, not the number of words written even if finished
15192   std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
15193   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
15194   if (ret.first.count != len) { // All good so far, but not finished
15195     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
15196                                         buf + ret.first.count, len - ret.first.count, ret.second);
15197     if (scalar_res.error) {
15198       scalar_res.count += ret.first.count;
15199       return scalar_res;
15200     } else {
15201       ret.second += scalar_res.count;
15202     }
15203   }
15204   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
15205   return ret.first;
15206 }
15207 
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const15208 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15209   // ret.first.count is always the position in the buffer, not the number of words written even if finished
15210   std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
15211   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
15212   if (ret.first.count != len) { // All good so far, but not finished
15213     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
15214                                         buf + ret.first.count, len - ret.first.count, ret.second);
15215     if (scalar_res.error) {
15216       scalar_res.count += ret.first.count;
15217       return scalar_res;
15218     } else {
15219       ret.second += scalar_res.count;
15220     }
15221   }
15222   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
15223   return ret.first;
15224 }
15225 
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15226 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15227   return convert_utf16le_to_utf8(buf, len, utf8_output);
15228 }
15229 
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15230 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15231   return convert_utf16be_to_utf8(buf, len, utf8_output);
15232 }
15233 
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const15234 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
15235   std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
15236   if (ret.first == nullptr) { return 0; }
15237   size_t saved_bytes = ret.second - utf8_output;
15238   if (ret.first != buf + len) {
15239     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
15240                                         ret.first, len - (ret.first - buf), ret.second);
15241     if (scalar_saved_bytes == 0) { return 0; }
15242     saved_bytes += scalar_saved_bytes;
15243   }
15244   return saved_bytes;
15245 }
15246 
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output) const15247 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
15248   // ret.first.count is always the position in the buffer, not the number of words written even if finished
15249   std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
15250   if (ret.first.count != len) {
15251     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
15252                                         buf + ret.first.count, len - ret.first.count, ret.second);
15253     if (scalar_res.error) {
15254       scalar_res.count += ret.first.count;
15255       return scalar_res;
15256     } else {
15257       ret.second += scalar_res.count;
15258     }
15259   }
15260   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
15261   return ret.first;
15262 }
15263 
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15264 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15265   std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
15266   if (ret.first == nullptr) { return 0; }
15267   size_t saved_bytes = ret.second - utf32_output;
15268   if (ret.first != buf + len) {
15269     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
15270                                         ret.first, len - (ret.first - buf), ret.second);
15271     if (scalar_saved_bytes == 0) { return 0; }
15272     saved_bytes += scalar_saved_bytes;
15273   }
15274   return saved_bytes;
15275 }
15276 
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15277 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15278   std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
15279   if (ret.first == nullptr) { return 0; }
15280   size_t saved_bytes = ret.second - utf32_output;
15281   if (ret.first != buf + len) {
15282     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
15283                                         ret.first, len - (ret.first - buf), ret.second);
15284     if (scalar_saved_bytes == 0) { return 0; }
15285     saved_bytes += scalar_saved_bytes;
15286   }
15287   return saved_bytes;
15288 }
15289 
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const15290 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15291   // ret.first.count is always the position in the buffer, not the number of words written even if finished
15292   std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
15293   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
15294   if (ret.first.count != len) { // All good so far, but not finished
15295     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
15296                                         buf + ret.first.count, len - ret.first.count, ret.second);
15297     if (scalar_res.error) {
15298       scalar_res.count += ret.first.count;
15299       return scalar_res;
15300     } else {
15301       ret.second += scalar_res.count;
15302     }
15303   }
15304   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
15305   return ret.first;
15306 }
15307 
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const15308 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15309   // ret.first.count is always the position in the buffer, not the number of words written even if finished
15310   std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
15311   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
15312   if (ret.first.count != len) { // All good so far, but not finished
15313     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
15314                                         buf + ret.first.count, len - ret.first.count, ret.second);
15315     if (scalar_res.error) {
15316       scalar_res.count += ret.first.count;
15317       return scalar_res;
15318     } else {
15319       ret.second += scalar_res.count;
15320     }
15321   }
15322   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
15323   return ret.first;
15324 }
15325 
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const15326 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
15327   return convert_utf32_to_utf8(buf, len, utf8_output);
15328 }
15329 
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const15330 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15331   std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
15332   if (ret.first == nullptr) { return 0; }
15333   size_t saved_bytes = ret.second - utf16_output;
15334   if (ret.first != buf + len) {
15335     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
15336                                         ret.first, len - (ret.first - buf), ret.second);
15337     if (scalar_saved_bytes == 0) { return 0; }
15338     saved_bytes += scalar_saved_bytes;
15339   }
15340   return saved_bytes;
15341 }
15342 
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const15343 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15344   std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
15345   if (ret.first == nullptr) { return 0; }
15346   size_t saved_bytes = ret.second - utf16_output;
15347   if (ret.first != buf + len) {
15348     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
15349                                         ret.first, len - (ret.first - buf), ret.second);
15350     if (scalar_saved_bytes == 0) { return 0; }
15351     saved_bytes += scalar_saved_bytes;
15352   }
15353   return saved_bytes;
15354 }
15355 
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const15356 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15357   // ret.first.count is always the position in the buffer, not the number of words written even if finished
15358   std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
15359   if (ret.first.count != len) {
15360     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
15361                                         buf + ret.first.count, len - ret.first.count, ret.second);
15362     if (scalar_res.error) {
15363       scalar_res.count += ret.first.count;
15364       return scalar_res;
15365     } else {
15366       ret.second += scalar_res.count;
15367     }
15368   }
15369   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
15370   return ret.first;
15371 }
15372 
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const15373 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15374   // ret.first.count is always the position in the buffer, not the number of words written even if finished
15375   std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
15376   if (ret.first.count != len) {
15377     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
15378                                         buf + ret.first.count, len - ret.first.count, ret.second);
15379     if (scalar_res.error) {
15380       scalar_res.count += ret.first.count;
15381       return scalar_res;
15382     } else {
15383       ret.second += scalar_res.count;
15384     }
15385   }
15386   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
15387   return ret.first;
15388 }
15389 
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const15390 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15391   return convert_utf32_to_utf16le(buf, len, utf16_output);
15392 }
15393 
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const15394 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15395   return convert_utf32_to_utf16be(buf, len, utf16_output);
15396 }
15397 
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15398 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15399   return convert_utf16le_to_utf32(buf, len, utf32_output);
15400 }
15401 
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15402 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15403   return convert_utf16be_to_utf32(buf, len, utf32_output);
15404 }
15405 
change_endianness_utf16(const char16_t * input,size_t length,char16_t * output) const15406 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
15407   utf16::change_endianness_utf16(input, length, output);
15408 }
15409 
count_utf16le(const char16_t * input,size_t length) const15410 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
15411   return utf16::count_code_points<endianness::LITTLE>(input, length);
15412 }
15413 
count_utf16be(const char16_t * input,size_t length) const15414 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
15415   return utf16::count_code_points<endianness::BIG>(input, length);
15416 }
15417 
count_utf8(const char * input,size_t length) const15418 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
15419   return utf8::count_code_points(input, length);
15420 }
15421 
utf8_length_from_utf16le(const char16_t * input,size_t length) const15422 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
15423   return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
15424 }
15425 
utf8_length_from_utf16be(const char16_t * input,size_t length) const15426 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
15427   return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
15428 }
15429 
utf32_length_from_utf16le(const char16_t * input,size_t length) const15430 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
15431   return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
15432 }
15433 
utf32_length_from_utf16be(const char16_t * input,size_t length) const15434 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
15435   return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
15436 }
15437 
utf16_length_from_utf8(const char * input,size_t length) const15438 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
15439   return utf8::utf16_length_from_utf8(input, length);
15440 }
15441 
utf8_length_from_utf32(const char32_t * input,size_t length) const15442 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
15443   const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
15444   const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
15445   const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
15446   const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
15447   size_t pos = 0;
15448   size_t count = 0;
15449   for(;pos + 4 <= length; pos += 4) {
15450     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
15451     const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
15452     const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
15453     const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
15454     const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
15455 
15456     const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
15457     const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
15458     const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
15459 
15460     const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
15461     const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
15462 
15463     size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
15464     size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
15465     size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
15466 
15467     count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
15468   }
15469   return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
15470 }
15471 
utf16_length_from_utf32(const char32_t * input,size_t length) const15472 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
15473   const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
15474   const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
15475   size_t pos = 0;
15476   size_t count = 0;
15477   for(;pos + 4 <= length; pos += 4) {
15478     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
15479     const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
15480     const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
15481     const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
15482     size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
15483     count += 4 + surrogate_count;
15484   }
15485   return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
15486 }
15487 
utf32_length_from_utf8(const char * input,size_t length) const15488 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
15489   return utf8::utf32_length_from_utf8(input, length);
15490 }
15491 
15492 } // namespace arm64
15493 } // namespace simdutf
15494 
15495 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
15496 /* begin file src/simdutf/arm64/end.h */
15497 /* end file src/simdutf/arm64/end.h */
15498 /* end file src/arm64/implementation.cpp */
15499 #endif
15500 #if SIMDUTF_IMPLEMENTATION_FALLBACK
15501 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp
15502 /* begin file src/fallback/implementation.cpp */
15503 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
15504 /* begin file src/simdutf/fallback/begin.h */
15505 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
15506 // #define SIMDUTF_IMPLEMENTATION fallback
15507 /* end file src/simdutf/fallback/begin.h */
15508 
15509 
15510 
15511 
15512 
15513 
15514 
15515 
15516 namespace simdutf {
15517 namespace fallback {
15518 
detect_encodings(const char * input,size_t length) const15519 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
15520   // If there is a BOM, then we trust it.
15521   auto bom_encoding = simdutf::BOM::check_bom(input, length);
15522   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
15523   int out = 0;
15524   if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
15525   if((length % 2) == 0) {
15526     if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
15527   }
15528   if((length % 4) == 0) {
15529     if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
15530   }
15531 
15532   return out;
15533 }
15534 
validate_utf8(const char * buf,size_t len) const15535 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
15536     return scalar::utf8::validate(buf, len);
15537 }
15538 
validate_utf8_with_errors(const char * buf,size_t len) const15539 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
15540     return scalar::utf8::validate_with_errors(buf, len);
15541 }
15542 
validate_ascii(const char * buf,size_t len) const15543 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
15544     return scalar::ascii::validate(buf, len);
15545 }
15546 
validate_ascii_with_errors(const char * buf,size_t len) const15547 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
15548     return scalar::ascii::validate_with_errors(buf, len);
15549 }
15550 
validate_utf16le(const char16_t * buf,size_t len) const15551 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
15552     return scalar::utf16::validate<endianness::LITTLE>(buf, len);
15553 }
15554 
validate_utf16be(const char16_t * buf,size_t len) const15555 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
15556     return scalar::utf16::validate<endianness::BIG>(buf, len);
15557 }
15558 
validate_utf16le_with_errors(const char16_t * buf,size_t len) const15559 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
15560     return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
15561 }
15562 
validate_utf16be_with_errors(const char16_t * buf,size_t len) const15563 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
15564     return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
15565 }
15566 
validate_utf32(const char32_t * buf,size_t len) const15567 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
15568     return scalar::utf32::validate(buf, len);
15569 }
15570 
validate_utf32_with_errors(const char32_t * buf,size_t len) const15571 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
15572     return scalar::utf32::validate_with_errors(buf, len);
15573 }
15574 
convert_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const15575 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15576    return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
15577 }
15578 
convert_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const15579 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15580    return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
15581 }
15582 
convert_utf8_to_utf16le_with_errors(const char * buf,size_t len,char16_t * utf16_output) const15583 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15584    return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
15585 }
15586 
convert_utf8_to_utf16be_with_errors(const char * buf,size_t len,char16_t * utf16_output) const15587 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15588    return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
15589 }
15590 
convert_valid_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const15591 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15592    return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
15593 }
15594 
convert_valid_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const15595 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
15596    return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
15597 }
15598 
convert_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_output) const15599 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
15600    return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
15601 }
15602 
convert_utf8_to_utf32_with_errors(const char * buf,size_t len,char32_t * utf32_output) const15603 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
15604    return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
15605 }
15606 
convert_valid_utf8_to_utf32(const char * input,size_t size,char32_t * utf32_output) const15607 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
15608     char32_t* utf32_output) const noexcept {
15609   return scalar::utf8_to_utf32::convert_valid(input, size,  utf32_output);
15610 }
15611 
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15612 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15613   return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
15614 }
15615 
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15616 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15617   return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
15618 }
15619 
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const15620 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15621   return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
15622 }
15623 
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const15624 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15625   return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
15626 }
15627 
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15628 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15629   return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
15630 }
15631 
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const15632 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
15633   return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
15634 }
15635 
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const15636 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
15637   return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
15638 }
15639 
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output) const15640 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
15641   return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
15642 }
15643 
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const15644 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
15645   return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
15646 }
15647 
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const15648 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15649   return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
15650 }
15651 
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const15652 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15653   return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
15654 }
15655 
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const15656 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15657   return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
15658 }
15659 
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const15660 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15661   return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
15662 }
15663 
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const15664 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15665   return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
15666 }
15667 
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const15668 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
15669   return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
15670 }
15671 
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15672 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15673   return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
15674 }
15675 
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15676 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15677   return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
15678 }
15679 
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const15680 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15681   return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
15682 }
15683 
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const15684 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15685   return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
15686 }
15687 
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15688 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15689   return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
15690 }
15691 
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const15692 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
15693   return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
15694 }
15695 
change_endianness_utf16(const char16_t * input,size_t length,char16_t * output) const15696 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
15697   scalar::utf16::change_endianness_utf16(input, length, output);
15698 }
15699 
count_utf16le(const char16_t * input,size_t length) const15700 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
15701   return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
15702 }
15703 
count_utf16be(const char16_t * input,size_t length) const15704 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
15705   return scalar::utf16::count_code_points<endianness::BIG>(input, length);
15706 }
15707 
count_utf8(const char * input,size_t length) const15708 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
15709   return scalar::utf8::count_code_points(input, length);
15710 }
15711 
utf8_length_from_utf16le(const char16_t * input,size_t length) const15712 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
15713   return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
15714 }
15715 
utf8_length_from_utf16be(const char16_t * input,size_t length) const15716 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
15717   return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
15718 }
15719 
utf32_length_from_utf16le(const char16_t * input,size_t length) const15720 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
15721   return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
15722 }
15723 
utf32_length_from_utf16be(const char16_t * input,size_t length) const15724 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
15725   return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
15726 }
15727 
utf16_length_from_utf8(const char * input,size_t length) const15728 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
15729   return scalar::utf8::utf16_length_from_utf8(input, length);
15730 }
15731 
utf8_length_from_utf32(const char32_t * input,size_t length) const15732 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
15733   return scalar::utf32::utf8_length_from_utf32(input, length);
15734 }
15735 
utf16_length_from_utf32(const char32_t * input,size_t length) const15736 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
15737   return scalar::utf32::utf16_length_from_utf32(input, length);
15738 }
15739 
utf32_length_from_utf8(const char * input,size_t length) const15740 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
15741   return scalar::utf8::count_code_points(input, length);
15742 }
15743 
15744 } // namespace fallback
15745 } // namespace simdutf
15746 
15747 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
15748 /* begin file src/simdutf/fallback/end.h */
15749 /* end file src/simdutf/fallback/end.h */
15750 /* end file src/fallback/implementation.cpp */
15751 #endif
15752 #if SIMDUTF_IMPLEMENTATION_ICELAKE
15753 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/implementation.cpp
15754 /* begin file src/icelake/implementation.cpp */
15755 
15756 
15757 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
15758 /* begin file src/simdutf/icelake/begin.h */
15759 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
15760 // #define SIMDUTF_IMPLEMENTATION icelake
15761 
15762 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
15763 // nothing needed.
15764 #else
15765 SIMDUTF_TARGET_ICELAKE
15766 #endif
15767 
15768 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
15769 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
15770 #endif // end of workaround
15771 /* end file src/simdutf/icelake/begin.h */
15772 namespace simdutf {
15773 namespace icelake {
15774 namespace {
15775 #ifndef SIMDUTF_ICELAKE_H
15776 #error "icelake.h must be included"
15777 #endif
15778 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp
15779 /* begin file src/icelake/icelake_utf8_common.inl.cpp */
15780 // Common procedures for both validating and non-validating conversions from UTF-8.
// How a block-processing routine treats its input block:
//   SIMDUTF_FULL - the full 64-byte input buffer may be used;
//   SIMDUTF_TAIL - only the first 'gap' input bytes are relevant.
enum block_processing_mode {
  SIMDUTF_FULL = 0,
  SIMDUTF_TAIL = 1
};

// Pointer pairs: a UTF-8 input pointer together with a UTF-16 / UTF-32 output
// pointer.
// NOTE(review): presumably the positions reached by a conversion step - confirm
// at the use sites.
using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
15785 
15786 /*
15787     process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
15788     to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
15789     might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
15790     indicates how many input bytes are relevant.
15791 
15792     Returns true when the result is correct, otherwise it returns false.
15793 
15794     The provided in and out pointers are advanced according to how many input
15795     bytes have been processed, upon success.
15796 */
15797 template <block_processing_mode tail, endianness big_endian>
process_block_utf8_to_utf16(const char * & in,char16_t * & out,size_t gap)15798 simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
15799   // constants
15800   __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
15801   __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
15802   __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
15803   __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
15804   __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
15805   __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
15806   __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
15807   __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
15808   __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
15809   __m512i byteflip = _mm512_setr_epi64(
15810             0x0607040502030001,
15811             0x0e0f0c0d0a0b0809,
15812             0x0607040502030001,
15813             0x0e0f0c0d0a0b0809,
15814             0x0607040502030001,
15815             0x0e0f0c0d0a0b0809,
15816             0x0607040502030001,
15817             0x0e0f0c0d0a0b0809
15818         );
15819   // Note that 'tail' is a compile-time constant !
15820   __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
15821   __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
15822   __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
15823   if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII
15824   // alternatively, we could do 'if (m1 == b) { '
15825     if (tail == SIMDUTF_FULL) {
15826       in += 64;          // consumed 64 bytes
15827       // we convert a full 64-byte block, writing 128 bytes.
15828       __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
15829       if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
15830       _mm512_storeu_si512(out, input1);
15831       out += 32;
15832       __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
15833       if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
15834       _mm512_storeu_si512(out, input2);
15835       out += 32;
15836       return true; // we are done
15837     } else {
15838       in += gap;
15839       if (gap <= 32) {
15840         __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
15841         if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
15842         _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
15843         out += gap;
15844       } else {
15845         __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
15846         if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
15847         _mm512_storeu_si512(out, input1);
15848         out += 32;
15849         __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
15850         if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
15851         _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
15852         out += gap - 32;
15853       }
15854       return true; // we are done
15855     }
15856   }
15857   // classify characters further
15858   __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
15859                                         _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
15860   __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
15861                                        _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
15862 
15863   __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
15864                                                      _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
15865                                                                      // Overlong 2-byte sequence
15866   if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
15867     // Overlong 2-byte sequence
15868     return false;
15869   }
15870   if (_ktestz_mask64_u8(m34, m34) == 0) {
15871     // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
15872     __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
15873                                         _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
15874 
15875     __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
15876 
15877     __mmask64 mp1 = _kshiftli_mask64(m234, 1);
15878     __mmask64 mp2 = _kshiftli_mask64(m34, 2);
15879     // We could do it as follows...
15880     // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
15881     // but GCC generates better code when we do:
15882     if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
15883       // Fast path with 1,2,3 bytes
15884       __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
15885       __mmask64 m1234 = _kor_mask64(m1, m234);
15886       // mismatched continuation bytes:
15887       if (tail == SIMDUTF_FULL) {
15888         __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
15889         // the presence of a 1 bit indicates that they overlap.
15890         // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
15891         if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
15892       } else {
15893         __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
15894         if (mc != bxorm1234) { return false; }
15895       }
15896       // mend: identifying the last bytes of each sequence to be decoded
15897       __mmask64 mend = _kshiftri_mask64(m1234, 1);
15898       if (tail != SIMDUTF_FULL) {
15899         mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
15900       }
15901 
15902 
15903       __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
15904       __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
15905 
15906       __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
15907       __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
15908       __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
15909                                                         clearedbytes); // the last byte of each character
15910 
15911       __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
15912       __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
15913       __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
15914       __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
15915                                                               beforeasciibytes); // the second last bytes (of two, three byte seq,
15916                                                                                  // surrogates)
15917       secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
15918 
15919       __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
15920                                                        indexofsecondlastbytes); // indices of the second last bytes
15921       __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
15922                                                     clearedbytes); // only those that are the third last byte of a sequece
15923       __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
15924                                                              thirdlastbyte); // the third last bytes (of three byte sequences, hi
15925                                                                              // surrogate)
15926       thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
15927       __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
15928       // the elements of Wout excluding the last element if it happens to be a high surrogate:
15929 
15930       __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
15931 
15932 
15933       // Encodings out of range...
15934       {
15935         // the location of 3-byte sequence start bytes in the input
15936         __mmask64 m3 = m34 & (b ^ m4);
15937         // words in Wout corresponding to 3-byte sequences.
15938         __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
15939         __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
15940         __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
15941         __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
15942         __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
15943         __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
15944         if (_kor_mask32(Msmall800, M3s)) { return false; }
15945       }
15946       int64_t nout = _mm_popcnt_u64(mprocessed);
15947       in +=  64 - _lzcnt_u64(mprocessed);
15948       if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
15949       _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
15950       out += nout;
15951       return true; // ok
15952     }
15953     //
15954     // We have a 4-byte sequence, this is the general case.
15955     // Slow!
15956     __mmask64 mp3 = _kshiftli_mask64(m4, 3);
15957     __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
15958     __mmask64 m1234 = _kor_mask64(m1, m234);
15959 
15960     // mend: identifying the last bytes of each sequence to be decoded
15961     __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
15962     if (tail != SIMDUTF_FULL) {
15963       mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
15964     }
15965     __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
15966     __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
15967 
15968     __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
15969     __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
15970     __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
15971                                                       clearedbytes); // the last byte of each character
15972 
15973     __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
15974     __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
15975     __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
15976     __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
15977                                                             beforeasciibytes); // the second last bytes (of two, three byte seq,
15978                                                                                // surrogates)
15979     secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
15980 
15981     __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
15982                                                      indexofsecondlastbytes); // indices of the second last bytes
15983     __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
15984                                                   clearedbytes); // only those that are the third last byte of a sequece
15985     __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
15986                                                            thirdlastbyte); // the third last bytes (of three byte sequences, hi
15987                                                                            // surrogate)
15988     thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
15989     __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
15990     uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
15991     __mmask32 Mlo = __mmask32(Mlo_uint64);
15992     __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
15993     __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
15994                                                   mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
15995     __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
15996                                                                  4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
15997     __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
15998                                                    lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
15999     __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
16000                                          mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
16001     // the elements of Wout excluding the last element if it happens to be a high surrogate:
16002     __mmask32 Mout = ~(Mhi & 0x80000000);
16003     __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
16004 
16005 
16006     // mismatched continuation bytes:
16007     if (tail == SIMDUTF_FULL) {
16008       __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
16009       // the presence of a 1 bit indicates that they overlap.
16010       // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
16011       if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
16012     } else {
16013       __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
16014       if (mc != bxorm1234) { return false; }
16015     }
16016     // Encodings out of range...
16017     {
16018       // the location of 3-byte sequence start bytes in the input
16019       __mmask64 m3 = m34 & (b ^ m4);
16020       // words in Wout corresponding to 3-byte sequences.
16021       __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
16022       __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
16023       __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
16024       __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
16025       __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
16026       __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
16027       __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
16028       __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
16029       if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; }
16030     }
16031     in += 64 - _lzcnt_u64(mprocessed);
16032     int64_t nout = _mm_popcnt_u64(mprocessed);
16033     if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
16034     _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
16035     out += nout;
16036     return true; // ok
16037   }
16038   // Fast path 2: all ASCII or 2 byte
16039   __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
16040   // on top of -0xc0 we substract -2 which we get back later of the
16041   // continuation byte tags
16042   __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
16043   __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
16044   if (tail == SIMDUTF_FULL) {
16045     __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
16046     if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; }
16047   } else {
16048     __mmask64 bxorleading = _kxor_mask64(b, leading);
16049     if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; }
16050   }
16051   //
16052   if (tail == SIMDUTF_FULL) {
16053     // In the two-byte/ASCII scenario, we are easily latency bound, so we want
16054     // to increment the input buffer as quickly as possible.
16055     // We process 32 bytes unless the byte at index 32 is a continuation byte,
16056     // in which case we include it as well for a total of 33 bytes.
16057     // Note that if x is an ASCII byte, then the following is false:
16058     // int8_t(x) <= int8_t(0xc0) under two's complement.
16059     in += 32;
16060     if(int8_t(*in) <= int8_t(0xc0)) in++;
16061     // The alternative is to do
16062     // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
16063     // but it requires loading the input, doing the mask computation, and converting
16064     // back the mask to a general register. It just takes too long, leaving the
16065     // processor likely to be idle.
16066   } else {
16067     in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
16068   }
16069   __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte);          // will contain zero for ascii, and the data
16070   lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead));                 // ... zero extended into words
16071   __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
16072   follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow));             // ... zero extended into words
16073   lead = _mm512_slli_epi16(lead, 6);                                         // shifted into position
16074   __m512i final = _mm512_add_epi16(follow, lead);                            // combining lead and follow
16075 
16076   if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); }
16077   if (tail == SIMDUTF_FULL) {
16078     // Next part is UTF-16 specific and can be generalized to UTF-32.
16079     int nout = _mm_popcnt_u32(uint32_t(leading));
16080     _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
16081     out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
16082   } else {
16083     int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
16084     _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
16085     out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
16086   }
16087 
16088   return true; // we are fine.
16089 }
16090 
16091 
16092 
16093 
16094 /*
16095     utf32_to_utf16_masked converts `count` lower UTF-32 words
16096     from input `utf32` into UTF-16. It differs from utf32_to_utf16
16097     in that it 'masks' the writes.
16098 
16099     Returns how many 16-bit words were stored.
16100 
16101     byteflip is used for flipping 16-bit words, and it should be
16102         __m512i byteflip = _mm512_setr_epi64(
16103             0x0607040502030001,
16104             0x0e0f0c0d0a0b0809,
16105             0x0607040502030001,
16106             0x0e0f0c0d0a0b0809,
16107             0x0607040502030001,
16108             0x0e0f0c0d0a0b0809,
16109             0x0607040502030001,
16110             0x0e0f0c0d0a0b0809
16111         );
16112     We pass it to the (always inlined) function to encourage the compiler to
16113     keep the value in a (constant) register.
16114 */
16115 template <endianness big_endian>
utf32_to_utf16_masked(const __m512i byteflip,__m512i utf32,unsigned int count,char16_t * output)16116 simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
16117 
16118     const __mmask16 valid = uint16_t((1 << count) - 1);
16119     // 1. check if we have any surrogate pairs
16120     const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
16121     const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
16122 
16123     if (sp_mask == 0) {
16124         if(big_endian) {
16125           _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
16126 
16127         } else {
16128           _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
16129         }
16130         return count;
16131     }
16132 
16133     {
16134         // build surrogate pair words in 32-bit lanes
16135 
16136         //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
16137         const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
16138         const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
16139 
16140         //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
16141         const __m512i t1 = _mm512_slli_epi32(t0, 6);
16142 
16143         //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
16144         //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
16145         const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
16146         const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
16147 
16148         //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
16149         //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
16150         const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
16151         const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
16152         const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
16153         const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
16154         __m512i t5 = _mm512_ror_epi32(t4, 16);
16155         // Here we want to trim all of the upper 16-bit words from the 2-byte
16156         // characters represented as 4-byte values. We can compute it from
16157         // sp_mask or the following... It can be more optimized!
16158         const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
16159         const  __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1));
16160         if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
16161         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
16162         __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
16163         _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
16164         //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
16165     }
16166 
16167     return count + static_cast<unsigned int>(count_ones(sp_mask));
16168 }
16169 
16170 /*
16171     utf32_to_utf16 converts `count` lower UTF-32 words
16172     from input `utf32` into UTF-16. It may overflow.
16173 
16174     Returns how many 16-bit words were stored.
16175 
16176     byteflip is used for flipping 16-bit words, and it should be
16177         __m512i byteflip = _mm512_setr_epi64(
16178             0x0607040502030001,
16179             0x0e0f0c0d0a0b0809,
16180             0x0607040502030001,
16181             0x0e0f0c0d0a0b0809,
16182             0x0607040502030001,
16183             0x0e0f0c0d0a0b0809,
16184             0x0607040502030001,
16185             0x0e0f0c0d0a0b0809
16186         );
16187     We pass it to the (always inlined) function to encourage the compiler to
16188     keep the value in a (constant) register.
16189 */
16190 template <endianness big_endian>
utf32_to_utf16(const __m512i byteflip,__m512i utf32,unsigned int count,char16_t * output)16191 simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
16192     // check if we have any surrogate pairs
16193     const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
16194     const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
16195 
16196     if (sp_mask == 0) {
16197         // technically, it should be _mm256_storeu_epi16
16198         if(big_endian) {
16199           _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip)));
16200         } else {
16201           _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
16202         }
16203         return count;
16204     }
16205 
16206     {
16207         // build surrogate pair words in 32-bit lanes
16208 
16209         //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
16210         const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
16211         const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
16212 
16213         //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
16214         const __m512i t1 = _mm512_slli_epi32(t0, 6);
16215 
16216         //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
16217         //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
16218         const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
16219         const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
16220 
16221         //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
16222         //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
16223         const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
16224         const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
16225         const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
16226         const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
16227         __m512i t5 = _mm512_ror_epi32(t4, 16);
16228         const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
16229         if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
16230         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
16231         __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
16232         _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
16233         //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
16234     }
16235 
16236     return count + static_cast<unsigned int>(count_ones(sp_mask));
16237 }
16238 
/**
 * Returns the 64 bytes made of the last N bytes of previous followed by the
 * first 64-N bytes of input (that is, input moved up by N byte positions with
 * the tail of previous filling the vacated low bytes).
 *
 * N must be in range 0..16: _mm512_alignr_epi8 concatenates and shifts within
 * each 128-bit lane, and the shift passed to it is 16-N, so a larger N would
 * yield a negative immediate.
 */
template <int N>
__m512i prev(__m512i input, __m512i previous) {
    static_assert(N >= 0 && N <= 16, "N must be in range 0..16");
    // 32-bit element indices 0..15 select from input, 16..31 from previous:
    // the first 128-bit lane of `rotated` is the last lane of previous and
    // the remaining lanes are lanes 0..2 of input.
    const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11);
    const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
    // In each 128-bit lane: the top N bytes of the preceding lane, then the
    // low 16-N bytes of the current input lane.
#if SIMDUTF_GCC8 || SIMDUTF_GCC9
    constexpr int shift = 16-N; // workaround for GCC8,9
    return _mm512_alignr_epi8(input, rotated, shift);
#else
    return _mm512_alignr_epi8(input, rotated, 16-N);
#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
}
16254 
/**
 * Rearranges the four 128-bit lanes of v: lane 0 of the result is lane idx0
 * of v, lane 1 is lane idx1, lane 2 is lane idx2 and lane 3 is lane idx3.
 * Every index must be in range 0..3 (checked at compile time).
 */
template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
__m512i shuffle_epi128(__m512i v) {
    // The indices are unsigned, so only the upper bound needs checking.
    static_assert(idx0 <= 3, "idx0 must be in range 0..3");
    static_assert(idx1 <= 3, "idx1 must be in range 0..3");
    static_assert(idx2 <= 3, "idx2 must be in range 0..3");
    static_assert(idx3 <= 3, "idx3 must be in range 0..3");

    // Two selector bits per destination lane, destination lane 0 in the lowest bits.
    constexpr unsigned lane_selector = (idx3 << 6) | (idx2 << 4) | (idx1 << 2) | idx0;
    return _mm512_shuffle_i32x4(v, v, lane_selector);
}
16265 
16266 template <unsigned idx>
broadcast_epi128(__m512i v)16267 constexpr __m512i broadcast_epi128(__m512i v) {
16268     return shuffle_epi128<idx, idx, idx, idx>(v);
16269 }
16270 
/**
 * Rotates the 64 bytes of input by N byte positions: within each 128-bit
 * lane the result holds the bytes of that lane starting at offset N,
 * followed by the first N bytes of the next lane (the last lane wraps
 * around to the first one).
 *
 * NOTE(review): no range check on N; the per-lane alignr presumably only
 * gives a rotation for N in 0..16 -- confirm before using.
 *
 * Currently unused.
 */
template <int N>
__m512i rotate_by_N_epi8(const __m512i input) {
    // 0x39 == 0b00_11_10_01: destination lanes 0, 1, 2, 3 take source lanes
    // 1, 2, 3, 0, so every lane sees the lane that follows it.
    const __m512i following_lane = _mm512_shuffle_i32x4(input, input, 0x39);

    // Per lane: concatenate (following : current) and drop the lowest N bytes.
    return _mm512_alignr_epi8(following_lane, input, N);
}
16282 
16283 /*
16284     expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
16285     stored at separate 32-bit lanes.
16286 
16287     For each lane we have also a character class (`char_class), given in form
16288     0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets
16289     corresponding bytes during pshufb.
16290 */
expanded_utf8_to_utf32(__m512i char_class,__m512i utf8)16291 simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) {
16292     /*
16293         Input:
16294         - utf8: bytes stored at separate 32-bit words
16295         - valid: which words have valid UTF-8 characters
16296 
16297         Bit layout of single word. We show 4 cases for each possible
16298         UTF-8 character encoding. The `?` denotes bits we must not
16299         assume their value.
16300 
16301         |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
16302         |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
16303         |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
16304         |????.????|????.????|????.????|0aaa.aaaa| ASCII char
16305           byte 3    byte 2    byte 1     byte 0
16306     */
16307 
16308     /* 1. Reset control bits of continuation bytes and the MSB
16309           of the leading byte; this makes all bytes unsigned (and
16310           does not alter ASCII char).
16311 
16312         |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
16313         |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
16314         |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
16315         |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
16316          ^^        ^^        ^^        ^
16317     */
16318     __m512i values;
16319     const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
16320     values = _mm512_and_si512(utf8, v_3f3f_3f7f);
16321 
16322     /* 2. Swap and join fields A-B and C-D
16323 
16324         |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
16325         |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
16326         |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
16327         |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
16328     const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
16329     values = _mm512_maddubs_epi16(values, v_0140_0140);
16330 
16331     /* 3. Swap and join fields AB & CD
16332 
16333         |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
16334         |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
16335         |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
16336         |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
16337     const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
16338     values = _mm512_madd_epi16(values, v_0001_1000);
16339 
16340     /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
16341         |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
16342         |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
16343         |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
16344         |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
16345     {
16346         /** pshufb
16347 
16348         continuation = 0
16349         ascii    = 7
16350         _2_bytes = 9
16351         _3_bytes = 10
16352         _4_bytes = 11
16353 
16354         shift_left_v3 = 4 * [
16355             ascii, # 0000
16356             ascii, # 0001
16357             ascii, # 0010
16358             ascii, # 0011
16359             ascii, # 0100
16360             ascii, # 0101
16361             ascii, # 0110
16362             ascii, # 0111
16363             continuation, # 1000
16364             continuation, # 1001
16365             continuation, # 1010
16366             continuation, # 1011
16367             _2_bytes, # 1100
16368             _2_bytes, # 1101
16369             _3_bytes, # 1110
16370             _4_bytes, # 1111
16371         ] */
16372         const __m512i shift_left_v3 = _mm512_setr_epi64(
16373             0x0707070707070707,
16374             0x0b0a090900000000,
16375             0x0707070707070707,
16376             0x0b0a090900000000,
16377             0x0707070707070707,
16378             0x0b0a090900000000,
16379             0x0707070707070707,
16380             0x0b0a090900000000
16381         );
16382 
16383         const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
16384         values = _mm512_sllv_epi32(values, shift);
16385     }
16386 
16387     /* 5. Shift right the values by variable amounts to reset lowest bits
16388         |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
16389         |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
16390         |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
16391         |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
16392     {
16393         // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
16394         const __m512i shift_right = _mm512_setr_epi64(
16395             0x1919191919191919,
16396             0x0b10151500000000,
16397             0x1919191919191919,
16398             0x0b10151500000000,
16399             0x1919191919191919,
16400             0x0b10151500000000,
16401             0x1919191919191919,
16402             0x0b10151500000000
16403         );
16404 
16405         const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
16406         values = _mm512_srlv_epi32(values, shift);
16407     }
16408 
16409     return values;
16410 }
16411 
16412 
expand_and_identify(__m512i lane0,__m512i lane1,int & count)16413 simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) {
16414     const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
16415     const __m512i expand_ver2 = _mm512_setr_epi64(
16416                 0x0403020103020100,
16417                 0x0605040305040302,
16418                 0x0807060507060504,
16419                 0x0a09080709080706,
16420                 0x0c0b0a090b0a0908,
16421                 0x0e0d0c0b0d0c0b0a,
16422                 0x000f0e0d0f0e0d0c,
16423                 0x0201000f01000f0e
16424     );
16425     const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
16426     const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
16427     const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
16428     const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
16429     const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
16430     count = static_cast<int>(count_ones(leading_bytes));
16431     return  _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
16432 }
16433 
expand_utf8_to_utf32(__m512i input)16434 simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
16435     __m512i char_class = _mm512_srli_epi32(input, 4);
16436     /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */
16437     const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
16438     const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);
16439     char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);
16440     return expanded_utf8_to_utf32(char_class, input);
16441 }
16442 /* end file src/icelake/icelake_utf8_common.inl.cpp */
16443 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_macros.inl.cpp
16444 /* begin file src/icelake/icelake_macros.inl.cpp */
16445 
16446 /*
    This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a UTF-8 string)
    and loads all possible 4-byte substrings into an AVX512 register.
16449 
16450     For example if we have bytes abcdefgh... we create following 32-bit lanes
16451 
16452     [abcd|bcde|cdef|defg|efgh|...]
16453      ^                          ^
16454      byte 0 of reg              byte 63 of reg
16455 */
16456 /** pshufb
        # lane{0,1,2} have got bytes: [  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]
        # lane3 has got bytes:        [ 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]
16459 
16460         expand_ver2 = [
16461             # lane 0:
16462             0, 1, 2, 3,
16463             1, 2, 3, 4,
16464             2, 3, 4, 5,
16465             3, 4, 5, 6,
16466 
16467             # lane 1:
16468             4, 5, 6, 7,
16469             5, 6, 7, 8,
16470             6, 7, 8, 9,
16471             7, 8, 9, 10,
16472 
16473             # lane 2:
16474              8,  9, 10, 11,
16475              9, 10, 11, 12,
16476             10, 11, 12, 13,
16477             11, 12, 13, 14,
16478 
16479             # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19
16480             12, 13, 14, 15,
16481             13, 14, 15,  0,
16482             14, 15,  0,  1,
16483             15,  0,  1,  2,
16484         ]
16485 */
16486 
/*
 * SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)
 *
 * Transcodes the UTF-8 characters that start within one 16-byte chunk and
 * appends the result at `output`.
 *
 *   LANE0, LANE1 - 512-bit vectors. Dword 12 of LANE1 is spliced into LANE0 so
 *                  that the byte windows starting near the end of the chunk can
 *                  see the bytes that follow it. (The call sites in this file
 *                  pass broadcast_epi128<N>() results for both.)
 *   MASKED       - when true only the produced elements are written (masked
 *                  store for UTF-32, utf32_to_utf16_masked for UTF-16); when
 *                  false the unmasked variants are used.
 *
 * The expansion relies on names supplied by the enclosing scope: `UTF32`,
 * `big_endian`, `byteflip` and the write cursor `output`, which is advanced.
 * The body is a plain braced block, so invocations take no trailing semicolon.
 */
#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \
        { \
            /* Bit 12 of the blend mask: dword 12 comes from LANE1, the rest from LANE0. */ \
            const __m512i blended = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1); \
            /* Per 128-bit lane shuffle: 32-bit element i receives the four          */ \
            /* consecutive bytes starting at byte i. In the last lane the indices    */ \
            /* wrap to 0, 1, 2, i.e. to the dword that was taken from LANE1.         */ \
            const __m512i window_shuffle = _mm512_setr_epi64( \
                0x0403020103020100, \
                0x0605040305040302, \
                0x0807060507060504, \
                0x0a09080709080706, \
                0x0c0b0a090b0a0908, \
                0x0e0d0c0b0d0c0b0a, \
                0x000f0e0d0f0e0d0c, \
                0x0201000f01000f0e \
            ); \
            const __m512i windows = _mm512_shuffle_epi8(blended, window_shuffle); \
            \
            /* An element starts a character when its low byte is not 10xxxxxx. */ \
            const __m512i mask_c0 = _mm512_set1_epi32(0xc0); \
            const __m512i mask_80 = _mm512_set1_epi32(0x80); \
            const __m512i top_bits = _mm512_and_si512(windows, mask_c0); \
            const __mmask16 lead_mask = _mm512_cmpneq_epu32_mask(top_bits, mask_80); \
            const int lead_count = static_cast<int>(count_ones(lead_mask)); \
            \
            /* classes = ((windows >> 4) & 0x0f) | 0x80808000; imm 0xea is (a & b) | c. */ \
            const __m512i nibble_mask = _mm512_set1_epi32(0x0f); \
            const __m512i class_high = _mm512_set1_epi32(0x80808000); \
            const __m512i classes = _mm512_ternarylogic_epi32( \
                _mm512_srli_epi32(windows, 4), nibble_mask, class_high, 0xea); \
            \
            const __m512i code_points = expanded_utf8_to_utf32(classes, windows); \
            \
            /* Pack the elements that start a character to the front, zero the rest. */ \
            const __m512i packed = _mm512_mask_compress_epi32(_mm512_setzero_si512(), lead_mask, code_points); \
            \
            if (UTF32) { \
                if (MASKED) { \
                    const __mmask16 store_mask = uint16_t((1 << lead_count) - 1); \
                    _mm512_mask_storeu_epi32(reinterpret_cast<__m512i*>(output), store_mask, packed); \
                } else { \
                    _mm512_storeu_si512(reinterpret_cast<__m512i*>(output), packed); \
                } \
                output += lead_count; \
            } else if (MASKED) { \
                output += utf32_to_utf16_masked<big_endian>(byteflip, packed, lead_count, reinterpret_cast<char16_t *>(output)); \
            } else { \
                output += utf32_to_utf16<big_endian>(byteflip, packed, lead_count, reinterpret_cast<char16_t *>(output)); \
            } \
        }
16536 
/*
 * SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)
 *
 * Writes the first VALID_COUNT 32-bit elements of INPUT at `output` and
 * advances `output`.
 *
 *   INPUT       - __m512i holding the elements to write.
 *   VALID_COUNT - number of leading elements of INPUT that are meaningful.
 *   MASKED      - when true only the valid elements are written (masked store
 *                 for UTF-32, utf32_to_utf16_masked for UTF-16); when false the
 *                 unmasked variants are used.
 *
 * For UTF-32 the cursor moves by VALID_COUNT; for UTF-16 it moves by the value
 * returned from the utf32_to_utf16* helper.
 *
 * The expansion relies on names supplied by the enclosing scope: `UTF32`,
 * `big_endian`, `byteflip` and `output`. Arguments are parenthesised so that
 * expression arguments expand safely; note they are still evaluated more than
 * once. The body is a plain braced block, so invocations take no trailing
 * semicolon.
 */
#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \
{ \
    if (UTF32) { \
        if (MASKED) { \
            const __mmask16 store_mask = uint16_t((1 << (VALID_COUNT)) - 1); \
            _mm512_mask_storeu_epi32((__m512i*)output, store_mask, (INPUT)); \
        } else { \
            _mm512_storeu_si512((__m512i*)output, (INPUT)); \
        } \
        output += (VALID_COUNT); \
    } else { \
        if (MASKED) { \
            output += utf32_to_utf16_masked<big_endian>(byteflip, (INPUT), (VALID_COUNT), reinterpret_cast<char16_t *>(output)); \
        } else { \
            output += utf32_to_utf16<big_endian>(byteflip, (INPUT), (VALID_COUNT), reinterpret_cast<char16_t *>(output)); \
        } \
    } \
}
16555 
16556 
/*
 * SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
 *
 * Zero-extends the 64 bytes of `utf8` and stores them at `output`:
 *   - UTF32 true : each byte becomes a 32-bit element (four 64-byte stores);
 *   - UTF32 false: each byte becomes a 16-bit element (two 64-byte stores),
 *                  shuffled with `byteflip` first when `big_endian` is set.
 *
 * `output` is NOT advanced; the caller moves its cursor. `big_endian` and
 * `byteflip` are taken from the enclosing scope. The expansion is a bare
 * if/else statement, so invoke it only where a full statement is expected.
 */
#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) \
        if (UTF32) { \
                /* Split into four 16-byte quarters and widen each to 16 x 32-bit. */ \
                const __m128i quarter0 = _mm512_castsi512_si128(utf8); \
                const __m128i quarter1 = _mm512_extracti32x4_epi32(utf8, 1); \
                const __m128i quarter2 = _mm512_extracti32x4_epi32(utf8, 2); \
                const __m128i quarter3 = _mm512_extracti32x4_epi32(utf8, 3); \
                _mm512_storeu_si512((__m512i*)(output + 0), _mm512_cvtepu8_epi32(quarter0)); \
                _mm512_storeu_si512((__m512i*)(output + 16), _mm512_cvtepu8_epi32(quarter1)); \
                _mm512_storeu_si512((__m512i*)(output + 32), _mm512_cvtepu8_epi32(quarter2)); \
                _mm512_storeu_si512((__m512i*)(output + 48), _mm512_cvtepu8_epi32(quarter3)); \
        } else { \
                /* Split into two 32-byte halves and widen each to 32 x 16-bit. */ \
                const __m256i half_lo = _mm512_castsi512_si256(utf8); \
                const __m256i half_hi = _mm512_extracti64x4_epi64(utf8, 1); \
                const __m512i wide_lo = _mm512_cvtepu8_epi16(half_lo); \
                const __m512i wide_hi = _mm512_cvtepu8_epi16(half_hi); \
                if (big_endian) { \
                        _mm512_storeu_si512((__m512i*)(output + 0), _mm512_shuffle_epi8(wide_lo, byteflip)); \
                        _mm512_storeu_si512((__m512i*)(output + 32), _mm512_shuffle_epi8(wide_hi, byteflip)); \
                } else { \
                        _mm512_storeu_si512((__m512i*)(output + 0), wide_lo); \
                        _mm512_storeu_si512((__m512i*)(output + 32), wide_hi); \
                } \
        }
16578 /* end file src/icelake/icelake_macros.inl.cpp */
16579 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp
16580 /* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
16581 // file included directly
16582 
16583 // File contains conversion procedure from VALID UTF-8 strings.
16584 
16585 /*
16586     valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.
16587 
16588     The `OUTPUT` template type decides what to do with UTF-32: store
16589     it directly or convert into UTF-16 (with AVX512).
16590 
16591     Input:
16592     - str           - valid UTF-8 string
16593     - len           - string length
16594     - out_buffer    - output buffer
16595 
16596     Result:
16597     - pair.first    - the first unprocessed input byte
16598     - pair.second   - the first unprocessed output word
16599 */
16600 template <endianness big_endian, typename OUTPUT>
valid_utf8_to_fixed_length(const char * str,size_t len,OUTPUT * dwords)16601 std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
16602     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
16603     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
16604     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
16605     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
16606 
16607     __m512i byteflip = _mm512_setr_epi64(
16608             0x0607040502030001,
16609             0x0e0f0c0d0a0b0809,
16610             0x0607040502030001,
16611             0x0e0f0c0d0a0b0809,
16612             0x0607040502030001,
16613             0x0e0f0c0d0a0b0809,
16614             0x0607040502030001,
16615             0x0e0f0c0d0a0b0809
16616         );
16617     const char* ptr = str;
16618     const char* end = ptr + len;
16619 
16620     OUTPUT* output = dwords;
16621     /**
16622      * In the main loop, we consume 64 bytes per iteration,
16623      * but we access 64 + 4 bytes.
16624      * We check for ptr + 64 + 64 <= end because
16625      * we want to be do maskless writes without overruns.
16626      */
16627     while (ptr + 64 + 64 <= end) {
16628         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
16629         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
16630         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
16631         if(ascii == 0) {
16632             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
16633             output += 64;
16634             ptr += 64;
16635             continue;
16636         }
16637 
16638         const __m512i lane0 = broadcast_epi128<0>(utf8);
16639         const __m512i lane1 = broadcast_epi128<1>(utf8);
16640         int valid_count0;
16641         __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
16642         const __m512i lane2 = broadcast_epi128<2>(utf8);
16643         int valid_count1;
16644         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
16645         if(valid_count0 + valid_count1 <= 16) {
16646             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
16647             valid_count0 += valid_count1;
16648             vec0 = expand_utf8_to_utf32(vec0);
16649             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
16650         } else {
16651             vec0 = expand_utf8_to_utf32(vec0);
16652             vec1 = expand_utf8_to_utf32(vec1);
16653             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
16654             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
16655         }
16656         const __m512i lane3 = broadcast_epi128<3>(utf8);
16657         int valid_count2;
16658         __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
16659         uint32_t tmp1;
16660         ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
16661         const __m512i lane4 = _mm512_set1_epi32(tmp1);
16662         int valid_count3;
16663         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
16664         if(valid_count2 + valid_count3 <= 16) {
16665             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
16666             valid_count2 += valid_count3;
16667             vec2 = expand_utf8_to_utf32(vec2);
16668             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
16669         } else {
16670             vec2 = expand_utf8_to_utf32(vec2);
16671             vec3 = expand_utf8_to_utf32(vec3);
16672             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
16673             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
16674         }
16675         ptr += 4*16;
16676     }
16677 
16678     if (ptr + 64 <= end) {
16679         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
16680         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
16681         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
16682         if(ascii == 0) {
16683             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
16684             output += 64;
16685             ptr += 64;
16686         } else {
16687             const __m512i lane0 = broadcast_epi128<0>(utf8);
16688             const __m512i lane1 = broadcast_epi128<1>(utf8);
16689             int valid_count0;
16690             __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
16691             const __m512i lane2 = broadcast_epi128<2>(utf8);
16692             int valid_count1;
16693             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
16694             if(valid_count0 + valid_count1 <= 16) {
16695                 vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
16696                 valid_count0 += valid_count1;
16697                 vec0 = expand_utf8_to_utf32(vec0);
16698                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
16699             } else {
16700                 vec0 = expand_utf8_to_utf32(vec0);
16701                 vec1 = expand_utf8_to_utf32(vec1);
16702                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
16703                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
16704             }
16705 
16706             const __m512i lane3 = broadcast_epi128<3>(utf8);
16707             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
16708 
16709             ptr += 3*16;
16710         }
16711     }
16712     return {ptr, output};
16713 }
16714 
16715 
16716 using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
16717 /* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
16718 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp
16719 /* begin file src/icelake/icelake_utf8_validation.inl.cpp */
16720 // file included directly
16721 
16722 
check_special_cases(__m512i input,const __m512i prev1)16723 simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) {
16724   __m512i mask1 = _mm512_setr_epi64(
16725         0x0202020202020202,
16726         0x4915012180808080,
16727         0x0202020202020202,
16728         0x4915012180808080,
16729         0x0202020202020202,
16730         0x4915012180808080,
16731         0x0202020202020202,
16732         0x4915012180808080);
16733     const __m512i v_0f = _mm512_set1_epi8(0x0f);
16734     __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f);
16735 
16736     __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1);
16737     __m512i mask2 = _mm512_setr_epi64(
16738         0xcbcbcb8b8383a3e7,
16739         0xcbcbdbcbcbcbcbcb,
16740         0xcbcbcb8b8383a3e7,
16741         0xcbcbdbcbcbcbcbcb,
16742         0xcbcbcb8b8383a3e7,
16743         0xcbcbdbcbcbcbcbcb,
16744         0xcbcbcb8b8383a3e7,
16745         0xcbcbdbcbcbcbcbcb);
16746      __m512i index2 = _mm512_and_si512(prev1, v_0f);
16747 
16748     __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
16749     __m512i mask3 = _mm512_setr_epi64(
16750         0x101010101010101,
16751         0x1010101babaaee6,
16752         0x101010101010101,
16753         0x1010101babaaee6,
16754         0x101010101010101,
16755         0x1010101babaaee6,
16756         0x101010101010101,
16757         0x1010101babaaee6
16758     );
16759     __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
16760     __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
16761     return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
16762   }
16763 
check_multibyte_lengths(const __m512i input,const __m512i prev_input,const __m512i sc)16764   simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
16765       const __m512i prev_input, const __m512i sc) {
16766     __m512i prev2 = prev<2>(input, prev_input);
16767     __m512i prev3 = prev<3>(input, prev_input);
16768     __m512i is_third_byte  = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u-1)); // Only 111_____ will be > 0
16769     __m512i is_fourth_byte  = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u-1)); // Only 1111____ will be > 0
16770     __m512i is_third_or_fourth_byte = _mm512_or_si512(is_third_byte, is_fourth_byte);
16771     const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
16772     is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
16773     // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
16774     const __m512i v_80 = _mm512_set1_epi8(char(0x80));
16775     return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, 0b1101010);
16776     //__m512i is_third_or_fourth_byte_mask = _mm512_and_si512(is_third_or_fourth_byte, v_80);
16777     //return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
16778   }
16779   //
16780   // Return nonzero if there are incomplete multibyte characters at the end of the block:
16781   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
16782   //
is_incomplete(const __m512i input)16783   simdutf_really_inline __m512i is_incomplete(const __m512i input) {
16784     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
16785     // ... 1111____ 111_____ 11______
16786     __m512i max_value = _mm512_setr_epi64(
16787         0xffffffffffffffff,
16788         0xffffffffffffffff,
16789         0xffffffffffffffff,
16790         0xffffffffffffffff,
16791         0xffffffffffffffff,
16792         0xffffffffffffffff,
16793         0xffffffffffffffff,
16794         0xbfdfefffffffffff);
16795     return _mm512_subs_epu8(input, max_value);
16796   }
16797 
16798   struct avx512_utf8_checker {
16799     // If this is nonzero, there has been a UTF-8 error.
16800     __m512i error{};
16801 
16802     // The last input we received
16803     __m512i prev_input_block{};
16804     // Whether the last input we received was incomplete (used for ASCII fast path)
16805     __m512i prev_incomplete{};
16806 
16807     //
16808     // Check whether the current bytes are valid UTF-8.
16809     //
check_utf8_bytessimdutf::icelake::__anone55652eb2e11::avx512_utf8_checker16810     simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) {
16811       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
16812       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
16813       __m512i prev1 = prev<1>(input, prev_input);
16814       __m512i sc = check_special_cases(input, prev1);
16815       this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
16816     }
16817 
16818     // The only problem that can happen at EOF is that a multibyte character is too short
16819     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
16820     // too large in the first of two bytes.
check_eofsimdutf::icelake::__anone55652eb2e11::avx512_utf8_checker16821     simdutf_really_inline void check_eof() {
16822       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
16823       // possibly finish them.
16824       this->error = _mm512_or_si512(this->error, this->prev_incomplete);
16825     }
16826 
16827     // returns true if ASCII.
check_next_inputsimdutf::icelake::__anone55652eb2e11::avx512_utf8_checker16828     simdutf_really_inline bool check_next_input(const __m512i input) {
16829       const __m512i v_80 = _mm512_set1_epi8(char(0x80));
16830       const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
16831       if(ascii == 0) {
16832         this->error = _mm512_or_si512(this->error, this->prev_incomplete);
16833         return true;
16834       } else {
16835         this->check_utf8_bytes(input, this->prev_input_block);
16836         this->prev_incomplete = is_incomplete(input);
16837         this->prev_input_block = input;
16838         return false;
16839       }
16840     }
16841     // do not forget to call check_eof!
errorssimdutf::icelake::__anone55652eb2e11::avx512_utf8_checker16842     simdutf_really_inline bool errors() const {
16843         return _mm512_test_epi8_mask(this->error, this->error) != 0;
16844     }
16845 
16846   }; // struct avx512_utf8_checker
16847 /* end file src/icelake/icelake_utf8_validation.inl.cpp */
16848 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp
16849 /* begin file src/icelake/icelake_from_utf8.inl.cpp */
16850 // file included directly
16851 
16852 // File contains conversion procedure from possibly invalid UTF-8 strings.
16853 
16854 /**
16855  * Attempts to convert up to len 1-byte words from in (in UTF-8 format) to
16856  * out.
16857  * Returns the position of the input and output after the processing is
16858  * completed. Upon error, the output is set to null.
16859  */
16860 
16861 template <endianness big_endian>
fast_avx512_convert_utf8_to_utf16(const char * in,size_t len,char16_t * out)16862 utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
16863   const char *const final_in = in + len;
16864   bool result = true;
16865   while (result) {
16866     if (in + 64 <= final_in) {
16867         result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
16868     } else if(in < final_in) {
16869         result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
16870     } else { break; }
16871   }
16872   if(!result) { out = nullptr; }
16873   return std::make_pair(in, out);
16874 }
16875 
16876 template <endianness big_endian>
fast_avx512_convert_utf8_to_utf16_with_errors(const char * in,size_t len,char16_t * out)16877 simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) {
16878   const char *const init_in = in;
16879   const char16_t *const init_out = out;
16880   const char *const final_in = in + len;
16881   bool  result = true;
16882   while (result) {
16883     if (in + 64 <= final_in) {
16884         result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
16885     } else if(in < final_in) {
16886         result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
16887     } else { break; }
16888   }
16889   if(!result) {
16890     // rewind_and_convert_with_errors will seek a potential error from in onward,
16891     // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
16892     simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
16893     res.count += (in - init_in);
16894     return res;
16895   } else {
16896     return simdutf::result(error_code::SUCCESS,out - init_out);
16897   }
16898 }
16899 
16900 
16901 template <endianness big_endian, typename OUTPUT>
validating_utf8_to_fixed_length(const char * str,size_t len,OUTPUT * dwords)16902 std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
16903     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
16904     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
16905     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
16906     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
16907 
16908     const char* ptr = str;
16909     const char* end = ptr + len;
16910     __m512i byteflip = _mm512_setr_epi64(
16911             0x0607040502030001,
16912             0x0e0f0c0d0a0b0809,
16913             0x0607040502030001,
16914             0x0e0f0c0d0a0b0809,
16915             0x0607040502030001,
16916             0x0e0f0c0d0a0b0809,
16917             0x0607040502030001,
16918             0x0e0f0c0d0a0b0809
16919         );
16920     OUTPUT* output = dwords;
16921     avx512_utf8_checker checker{};
16922     /**
16923      * In the main loop, we consume 64 bytes per iteration,
16924      * but we access 64 + 4 bytes.
16925      * We check for ptr + 64 + 64 <= end because
16926      * we want to be do maskless writes without overruns.
16927      */
16928     while (ptr + 64 + 64 <= end) {
16929         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
16930         if(checker.check_next_input(utf8)) {
16931             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
16932             output += 64;
16933             ptr += 64;
16934             continue;
16935         }
16936         const __m512i lane0 = broadcast_epi128<0>(utf8);
16937         const __m512i lane1 = broadcast_epi128<1>(utf8);
16938         int valid_count0;
16939         __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
16940         const __m512i lane2 = broadcast_epi128<2>(utf8);
16941         int valid_count1;
16942         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
16943         if(valid_count0 + valid_count1 <= 16) {
16944             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
16945             valid_count0 += valid_count1;
16946             vec0 = expand_utf8_to_utf32(vec0);
16947             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
16948         } else {
16949             vec0 = expand_utf8_to_utf32(vec0);
16950             vec1 = expand_utf8_to_utf32(vec1);
16951             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
16952             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
16953         }
16954         const __m512i lane3 = broadcast_epi128<3>(utf8);
16955         int valid_count2;
16956         __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
16957         uint32_t tmp1;
16958         ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
16959         const __m512i lane4 = _mm512_set1_epi32(tmp1);
16960         int valid_count3;
16961         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
16962         if(valid_count2 + valid_count3 <= 16) {
16963             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
16964             valid_count2 += valid_count3;
16965             vec2 = expand_utf8_to_utf32(vec2);
16966             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
16967         } else {
16968             vec2 = expand_utf8_to_utf32(vec2);
16969             vec3 = expand_utf8_to_utf32(vec3);
16970             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
16971             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
16972         }
16973         ptr += 4*16;
16974     }
16975     const char* validatedptr = ptr; // validated up to ptr
16976 
16977     // For the final pass, we validate 64 bytes, but we only transcode
16978     // 3*16 bytes, so we may end up double-validating 16 bytes.
16979     if (ptr + 64 <= end) {
16980         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
16981         if(checker.check_next_input(utf8)) {
16982             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
16983             output += 64;
16984             ptr += 64;
16985         } else {
16986             const __m512i lane0 = broadcast_epi128<0>(utf8);
16987             const __m512i lane1 = broadcast_epi128<1>(utf8);
16988             int valid_count0;
16989             __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
16990             const __m512i lane2 = broadcast_epi128<2>(utf8);
16991             int valid_count1;
16992             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
16993             if(valid_count0 + valid_count1 <= 16) {
16994                 vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
16995                 valid_count0 += valid_count1;
16996                 vec0 = expand_utf8_to_utf32(vec0);
16997                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
16998             } else {
16999                 vec0 = expand_utf8_to_utf32(vec0);
17000                 vec1 = expand_utf8_to_utf32(vec1);
17001                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
17002                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
17003             }
17004 
17005             const __m512i lane3 = broadcast_epi128<3>(utf8);
17006             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
17007 
17008             ptr += 3*16;
17009         }
17010         validatedptr += 4*16;
17011     }
17012     {
17013        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
17014        checker.check_next_input(utf8);
17015     }
17016     checker.check_eof();
17017     if(checker.errors()) {
17018         return {ptr, nullptr}; // We found an error.
17019     }
17020     return {ptr, output};
17021 }
17022 
17023 // Like validating_utf8_to_fixed_length but returns as soon as an error is identified
      //
      // Validates the UTF-8 bytes in [str, str + len) with an avx512_utf8_checker while
      // transcoding them, 64 input bytes at a time, into the buffer starting at `dwords`.
      //
      // Template parameters:
      //   big_endian - requested output endianness; combining it with UTF-32 output is
      //                rejected at compile time (see the static_assert below).
      //   OUTPUT     - uint32_t selects UTF-32 output, char16_t selects UTF-16 output.
      //
      // Returns {ptr, output, ok}:
      //   ptr    - first input byte that has not been transcoded,
      //   output - current write position in the output buffer,
      //   ok     - false as soon as the checker reports an error, true otherwise.
      // On success the whole input has been validated, but the bytes from the returned
      // ptr up to str + len have not been transcoded (presumably the caller finishes
      // them with a scalar routine -- confirm at the call site).
17024 template <endianness big_endian, typename OUTPUT>
validating_utf8_to_fixed_length_with_constant_checks(const char * str,size_t len,OUTPUT * dwords)17025 std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) {
17026     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
17027     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
17028     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
17029     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
17030 
17031     const char* ptr = str;
17032     const char* end = ptr + len;
          // Shuffle indices that swap the two bytes of every 16-bit word.
          // NOTE(review): not referenced by name in this function body; presumably
          // consumed by the SIMDUTF_ICELAKE_* macros below -- confirm.
17033     __m512i byteflip = _mm512_setr_epi64(
17034             0x0607040502030001,
17035             0x0e0f0c0d0a0b0809,
17036             0x0607040502030001,
17037             0x0e0f0c0d0a0b0809,
17038             0x0607040502030001,
17039             0x0e0f0c0d0a0b0809,
17040             0x0607040502030001,
17041             0x0e0f0c0d0a0b0809
17042         );
17043     OUTPUT* output = dwords;
17044     avx512_utf8_checker checker{};
17045     /**
17046      * In the main loop, we consume 64 bytes per iteration,
17047      * but we access 64 + 4 bytes.
17048      * We check for ptr + 64 + 64 <= end because
17049      * we want to be able to do maskless writes without overruns.
17050      */
17051     while (ptr + 64 + 64 <= end) {
17052         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
              // A true return from check_next_input is treated as "this block is ASCII":
              // it is stored directly and the expansion logic is skipped.
17053         if(checker.check_next_input(utf8)) {
17054             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
17055             output += 64;
17056             ptr += 64;
17057             continue;
17058         }
              // Early exit: ptr still points at the start of the 64-byte block in which
              // the checker flagged the error.
17059         if(checker.errors()) {
17060             return {ptr, output, false}; // We found an error.
17061         }
              // Transcode the first two 16-byte lanes. Each expand_and_identify call also
              // receives the following lane, and reports how many code points it found.
17062         const __m512i lane0 = broadcast_epi128<0>(utf8);
17063         const __m512i lane1 = broadcast_epi128<1>(utf8);
17064         int valid_count0;
17065         __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
17066         const __m512i lane2 = broadcast_epi128<2>(utf8);
17067         int valid_count1;
17068         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
              // If both results fit in a single 16-element vector, merge them so that
              // only one write is needed.
17069         if(valid_count0 + valid_count1 <= 16) {
17070             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
17071             valid_count0 += valid_count1;
17072             vec0 = expand_utf8_to_utf32(vec0);
17073             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
17074         } else {
17075             vec0 = expand_utf8_to_utf32(vec0);
17076             vec1 = expand_utf8_to_utf32(vec1);
17077             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
17078             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
17079         }
              // Same treatment for lanes 2 and 3. Lane 3 needs a look-ahead beyond the
              // current 64-byte block: these are the extra 4 bytes read at ptr + 64.
17080         const __m512i lane3 = broadcast_epi128<3>(utf8);
17081         int valid_count2;
17082         __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
17083         uint32_t tmp1;
17084         ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
17085         const __m512i lane4 = _mm512_set1_epi32(tmp1);
17086         int valid_count3;
17087         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
17088         if(valid_count2 + valid_count3 <= 16) {
17089             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
17090             valid_count2 += valid_count3;
17091             vec2 = expand_utf8_to_utf32(vec2);
17092             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
17093         } else {
17094             vec2 = expand_utf8_to_utf32(vec2);
17095             vec3 = expand_utf8_to_utf32(vec3);
17096             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
17097             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
17098         }
17099         ptr += 4*16;
17100     }
17101     const char* validatedptr = ptr; // validated up to ptr
17102 
17103     // For the final pass, we validate 64 bytes, but we only transcode
17104     // 3*16 bytes, so we may end up double-validating 16 bytes.
          // The write macros are invoked with `true` as their last argument here
          // (versus `false` in the main loop); presumably this selects masked
          // stores -- confirm against the macro definitions.
17105     if (ptr + 64 <= end) {
17106         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
17107         if(checker.check_next_input(utf8)) {
17108             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
17109             output += 64;
17110             ptr += 64;
17111         } else if(checker.errors()) {
17112             return {ptr, output, false}; // We found an error.
17113         } else {
17114             const __m512i lane0 = broadcast_epi128<0>(utf8);
17115             const __m512i lane1 = broadcast_epi128<1>(utf8);
17116             int valid_count0;
17117             __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
17118             const __m512i lane2 = broadcast_epi128<2>(utf8);
17119             int valid_count1;
17120             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
17121             if(valid_count0 + valid_count1 <= 16) {
17122                 vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
17123                 valid_count0 += valid_count1;
17124                 vec0 = expand_utf8_to_utf32(vec0);
17125                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
17126             } else {
17127                 vec0 = expand_utf8_to_utf32(vec0);
17128                 vec1 = expand_utf8_to_utf32(vec1);
17129                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
17130                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
17131             }
17132 
17133             const __m512i lane3 = broadcast_epi128<3>(utf8);
17134             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
17135 
                  // Only the first three lanes were transcoded; the fourth lane is left
                  // for whoever handles the input starting at the returned ptr.
17136             ptr += 3*16;
17137         }
              // The checker has consumed the full 64 bytes regardless of how far ptr moved.
17138         validatedptr += 4*16;
17139     }
          // Validate (without transcoding) whatever remains after validatedptr. At this
          // point end - validatedptr is below 64, so the shift in the mask is well defined.
17140     {
17141        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
17142        checker.check_next_input(utf8);
17143     }
17144     checker.check_eof();
17145     if(checker.errors()) {
17146         return {ptr, output, false}; // We found an error.
17147     }
17148     return {ptr, output, true};
17149 }
17150 /* end file src/icelake/icelake_from_utf8.inl.cpp */
17151 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp
17152 /* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
17153 // file included directly
17154 
17155 /*
        Converts UTF-16 to UTF-32 with 512-bit vectors, reading 32 input words per iteration.
        When big_endian is true, every input word is byte-swapped before processing.

17156   Returns a tuple: a pointer to the first unprocessed word of buf, the advanced
        utf32_output pointer, and a flag that is false when an invalid surrogate
        sequence was detected (true otherwise).
17157   A scalar routine should carry on the conversion of the tail.
17158 */
17159 template <endianness big_endian>
convert_utf16_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output)17160 std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
17161   const char16_t* end = buf + len;
17162   const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
17163   const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
17164   const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
        // carry is 1 when the previous iteration stopped right after a high surrogate,
        // i.e. the first word of the next block has to be its low surrogate.
17165   __mmask32 carry{0};
        // Shuffle indices that swap the two bytes of every 16-bit word.
17166   const __m512i byteflip = _mm512_setr_epi64(
17167             0x0607040502030001,
17168             0x0e0f0c0d0a0b0809,
17169             0x0607040502030001,
17170             0x0e0f0c0d0a0b0809,
17171             0x0607040502030001,
17172             0x0e0f0c0d0a0b0809,
17173             0x0607040502030001,
17174             0x0e0f0c0d0a0b0809
17175         );
17176   while (buf + 32 <= end) {
17177     // Always safe because buf + 32 <= end so that end - buf >= 32 words (64 bytes):
17178     __m512i in = _mm512_loadu_si512((__m512i*)buf);
17179     if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
17180 
17181     // H - bitmask for high surrogates
17182     const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
17183     // L - bitmask for low surrogates
17184     const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
17185 
17186     if ((H|L)) {
17187       // surrogate pair(s) in a register
17188       const __mmask32 V = (L ^ (carry | (H << 1)));   // A high surrogate must be followed by low one and a low one must be preceded by a high one.
17189                                                       // If valid, V should be equal to 0
17190 
17191       if(V == 0) {
17192         // valid case
17193         /*
17194             Input surrogate pair:
17195             |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
17196                 low surrogate      high surrogate
17197         */
17198         /*  1. Expand all words to 32-bit words
17199             in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
17200         */
17201         const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
17202         const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1));
17203 
17204         /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
17205             in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
17206             shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
17207         */
17208         const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
17209         const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
17210 
17211         /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
17212             |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
17213         */
17214         const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
17215         const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10);
17216 
17217         /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
17218             in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
17219             shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
17220             constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
17221         */
17222         const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
17223         const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
17224         const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
17225 
17226         const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second);
17227         const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant);
17228 
17229         //  5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
17230         const __mmask32 valid = ~L & 0x7fffffff;
17231         // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
17232         // to ease performance portability to Zen 4.
17233         const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
17234         const size_t howmany1 = count_ones((uint16_t)(valid));
              // Full 64-byte store; only the first howmany1 words are kept, the rest is
              // overwritten by the next store.
17235         _mm512_storeu_si512((__m512i *) utf32_output,  compressed_first);
17236         utf32_output += howmany1;
17237         const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
17238         const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
17239         // The following could be unsafe in some cases?
17240         //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
17241         _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<<howmany2)-1), compressed_second);
17242         utf32_output += howmany2;
17243         // Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
17244         buf += 31;
17245         carry = (H >> 30) & 0x1;
17246       } else {
17247         // invalid case
              // buf + carry skips the low surrogate whose pair was already emitted by
              // the previous iteration.
17248         return std::make_tuple(buf+carry, utf32_output, false);
17249       }
17250     } else {
17251       // no surrogates
17252       // extend all thirty-two 16-bit words to thirty-two 32-bit words
17253       _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
17254       _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)));
17255       utf32_output += 32;
17256       buf += 32;
17257       carry = 0;
17258     }
17259   } // while
        // If the last processed block ended on a high surrogate, its low surrogate has
        // already been consumed, so the unprocessed input starts one word later.
17260   return std::make_tuple(buf+carry, utf32_output, true);
17261 }
17262 /* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
17263 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp
17264 /* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
17265 // file included directly
17266 
17267 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
      //
      // Converts UTF-32 to UTF-8 using 256-bit vectors, 16 input words per iteration.
      //
      // Returns {input pointer, output pointer}:
      //   - on success, the input pointer is the first word that was not converted; the
      //     loop stops once fewer than 16 + safety_margin words remain, so a tail is
      //     always left over (presumably for a scalar routine -- confirm at call site);
      //   - on invalid input (value above 0x10FFFF or a surrogate), the input pointer
      //     is nullptr.
      // In the vectorized branches validation is deferred until after the loop, so
      // output may already have been written when nullptr is returned.
avx512_convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output)17268 std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
17269   const char32_t* end = buf + len;
17270   const __m256i v_0000 = _mm256_setzero_si256();
17271   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
17272   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
17273   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
17274   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
17275   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
        // running_max: lane-wise unsigned maximum of every word loaded so far.
        // forbidden_bytemask: accumulates surrogate hits found in the 1/2/3-byte branch.
        // Both are only inspected after the loop.
17276   __m256i running_max = _mm256_setzero_si256();
17277   __m256i forbidden_bytemask = _mm256_setzero_si256();
17278 
17279   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
17280 
17281   while (buf + 16 + safety_margin <= end) {
17282     __m256i in = _mm256_loadu_si256((__m256i*)buf);
17283     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
17284     running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
17285 
17286     // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
          // (the top bit is cleared first because the pack treats its inputs as signed)
17287     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
          // The pack works per 128-bit lane; this permutation restores the input word order.
17288     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
17289 
17290     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
17291 
17292     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
17293       // 1. pack the bytes
17294       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
17295       // 2. store (16 bytes)
17296       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
17297       // 3. adjust pointers
17298       buf += 16;
17299       utf8_output += 16;
17300       continue; // we are done for this round!
17301     }
17302     // no bits set above 7th bit
17303     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
17304     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
17305 
17306     // no bits set above 11th bit
17307     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
17308     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
17309     if (one_or_two_bytes_bitmask == 0xffffffff) {
17310       // 1. prepare 2-byte values
17311       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
17312       // expected output   : [110a|aaaa|10bb|bbbb] x 16
17313       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
17314       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
17315 
17316       // t0 = [000a|aaaa|bbbb|bb00]
17317       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
17318       // t1 = [000a|aaaa|0000|0000]
17319       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
17320       // t2 = [0000|0000|00bb|bbbb]
17321       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
17322       // t3 = [000a|aaaa|00bb|bbbb]
17323       const __m256i t3 = _mm256_or_si256(t1, t2);
17324       // t4 = [110a|aaaa|10bb|bbbb]
17325       const __m256i t4 = _mm256_or_si256(t3, v_c080);
17326 
17327       // 2. merge ASCII and 2-byte codewords
17328       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
17329 
17330       // 3. prepare bitmask for 8-bit lookup
            // (keep one bit per 16-bit word, then fold the 16 mask bits of each
            // 128-bit half into a single byte used as table index)
17331       const uint32_t M0 = one_byte_bitmask & 0x55555555;
17332       const uint32_t M1 = M0 >> 7;
17333       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
17334       // 4. pack the bytes
17335 
            // Each table row starts with the number of output bytes, followed by the
            // 16-byte shuffle mask that packs them.
17336       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
17337       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
17338 
17339       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
17340       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
17341 
17342       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
17343       // 5. store bytes
17344       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
17345       utf8_output += row[0];
17346       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
17347       utf8_output += row_2[0];
17348 
17349       // 6. adjust pointers
17350       buf += 16;
17351       continue;
17352     }
17353     // Must check for overflow in packing
17354     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
17355     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
17356     if (saturation_bitmask == 0xffffffff) {
17357       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
            // Record surrogate code points (0xD800-0xDFFF); the verdict is read after the loop.
17358       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
17359       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
17360 
17361       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
17362                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
17363                                               0x0000, 0x0202, 0x0404, 0x0606,
17364                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
17365 
17366       /* In this branch we handle three cases:
17367         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
17368         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
17369         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
17370 
17371         We expand the input word (16-bit) into two words (32-bit), thus
17372         we have room for four bytes. However, we need five distinct bit
17373         layouts. Note that the last byte in cases #2 and #3 is the same.
17374 
17375         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
17376         in register t2.
17377 
17378         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
17379         either byte 1 for case #2 or byte 2 for case #3. Note that they
17380         differ by exactly one bit.
17381 
17382         Finally from these two words we build proper UTF-8 sequence, taking
17383         into account the case (i.e, the number of bytes to write).
17384       */
17385       /**
17386        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
17387        * t2 => [0ccc|cccc] [10cc|cccc]
17388        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
17389        */
17390 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
17391       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
17392       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
17393       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
17394       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
17395       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
17396       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
17397 
17398       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
17399       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
17400       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
17401       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
17402       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
17403       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
17404       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
17405       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
            // For words that need three bytes, flip one bit so that the byte becomes
            // the continuation byte [10bb|bbbb] instead of the lead byte [110b|bbbb].
17406       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
17407       const __m256i s4 = _mm256_xor_si256(s3, m0);
17408 #undef simdutf_vec
17409 
17410       // 4. expand words 16-bit => 32-bit
17411       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
17412       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
17413 
17414       // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
17415       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
17416                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
17417       // Due to the wider registers, the following path is less likely to be useful.
17418       /*if(mask == 0) {
17419         // We only have three-byte words. Use fast path.
17420         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
17421         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
17422         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
17423         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
17424         utf8_output += 12;
17425         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
17426         utf8_output += 12;
17427         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
17428         utf8_output += 12;
17429         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
17430         utf8_output += 12;
17431         buf += 16;
17432         continue;
17433       }*/
            // One table lookup per group of four input words: each byte of `mask`
            // selects a row holding the output length followed by the shuffle mask.
17434       const uint8_t mask0 = uint8_t(mask);
17435       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
17436       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
17437       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
17438 
17439       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
17440       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
17441       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
17442       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
17443 
17444       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
17445       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
17446       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
17447       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
17448 
17449 
17450       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
17451       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
17452       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
17453       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
17454 
            // Every store writes a full 16 bytes, but the output pointer only advances
            // by the number of bytes actually produced; the next store overwrites the rest.
17455       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
17456       utf8_output += row0[0];
17457       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
17458       utf8_output += row1[0];
17459       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
17460       utf8_output += row2[0];
17461       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
17462       utf8_output += row3[0];
17463       buf += 16;
17464     } else {
17465       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
17466       // Let us do a scalar fallback.
17467       // It may seem wasteful to use scalar code, but being efficient with SIMD
17468       // may require large, non-trivial tables?
            // Converts 15 words one by one. The loop condition above guarantees that at
            // least 16 + safety_margin words remain, so the clamp on `forward` below
            // does not trigger here.
17469       size_t forward = 15;
17470       size_t k = 0;
17471       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
17472       for(; k < forward; k++) {
17473         uint32_t word = buf[k];
17474         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
17475           *utf8_output++ = char(word);
17476         } else if((word & 0xFFFFF800)==0) { // 2-byte
17477           *utf8_output++ = char((word>>6) | 0b11000000);
17478           *utf8_output++ = char((word & 0b111111) | 0b10000000);
17479         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
                // Surrogates are rejected immediately on this scalar path.
17480           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
17481           *utf8_output++ = char((word>>12) | 0b11100000);
17482           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
17483           *utf8_output++ = char((word & 0b111111) | 0b10000000);
17484         } else {  // 4-byte
                // Values beyond the Unicode range are rejected immediately on this scalar path.
17485           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
17486           *utf8_output++ = char((word>>18) | 0b11110000);
17487           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
17488           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
17489           *utf8_output++ = char((word & 0b111111) | 0b10000000);
17490         }
17491       }
17492       buf += k;
17493     }
17494   } // while
17495 
17496   // check for invalid input
        // (any word ever loaded by the vector loop that exceeds 0x10FFFF)
17497   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
17498   if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
17499     return std::make_pair(nullptr, utf8_output);
17500   }
17501 
        // Any surrogate recorded in the 1/2/3-byte branch also makes the input invalid.
17502   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
17503 
17504   return std::make_pair(buf, utf8_output);
17505 }
17506 
17507 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
avx512_convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output)17508 std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
17509   const char32_t* end = buf + len;
17510   const char32_t* start = buf;
17511 
17512   const __m256i v_0000 = _mm256_setzero_si256();
17513   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
17514   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
17515   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
17516   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
17517   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
17518   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
17519 
17520   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
17521 
17522   while (buf + 16 + safety_margin <= end) {
17523     __m256i in = _mm256_loadu_si256((__m256i*)buf);
17524     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
17525     // Check for too large input
17526     const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
17527     if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
17528       return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
17529     }
17530 
17531     // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
17532     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
17533     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
17534 
17535     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
17536 
17537     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
17538       // 1. pack the bytes
17539       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
17540       // 2. store (16 bytes)
17541       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
17542       // 3. adjust pointers
17543       buf += 16;
17544       utf8_output += 16;
17545       continue; // we are done for this round!
17546     }
17547     // no bits set above 7th bit
17548     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
17549     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
17550 
17551     // no bits set above 11th bit
17552     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
17553     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
17554     if (one_or_two_bytes_bitmask == 0xffffffff) {
17555       // 1. prepare 2-byte values
17556       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
17557       // expected output   : [110a|aaaa|10bb|bbbb] x 8
17558       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
17559       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
17560 
17561       // t0 = [000a|aaaa|bbbb|bb00]
17562       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
17563       // t1 = [000a|aaaa|0000|0000]
17564       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
17565       // t2 = [0000|0000|00bb|bbbb]
17566       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
17567       // t3 = [000a|aaaa|00bb|bbbb]
17568       const __m256i t3 = _mm256_or_si256(t1, t2);
17569       // t4 = [110a|aaaa|10bb|bbbb]
17570       const __m256i t4 = _mm256_or_si256(t3, v_c080);
17571 
17572       // 2. merge ASCII and 2-byte codewords
17573       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
17574 
17575       // 3. prepare bitmask for 8-bit lookup
17576       const uint32_t M0 = one_byte_bitmask & 0x55555555;
17577       const uint32_t M1 = M0 >> 7;
17578       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
17579       // 4. pack the bytes
17580 
17581       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
17582       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
17583 
17584       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
17585       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
17586 
17587       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
17588       // 5. store bytes
17589       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
17590       utf8_output += row[0];
17591       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
17592       utf8_output += row_2[0];
17593 
17594       // 6. adjust pointers
17595       buf += 16;
17596       continue;
17597     }
17598     // Must check for overflow in packing
17599     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
17600     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
17601     if (saturation_bitmask == 0xffffffff) {
17602       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
17603 
17604       // Check for illegal surrogate words
17605       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
17606       const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
17607       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
17608         return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
17609       }
17610 
17611       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
17612                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
17613                                               0x0000, 0x0202, 0x0404, 0x0606,
17614                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
17615 
17616       /* In this branch we handle three cases:
17617         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
17618         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
17619         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
17620 
17621         We expand the input word (16-bit) into two words (32-bit), thus
17622         we have room for four bytes. However, we need five distinct bit
17623         layouts. Note that the last byte in cases #2 and #3 is the same.
17624 
17625         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
17626         in register t2.
17627 
17628         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
17629         either byte 1 for case #2 or byte 2 for case #3. Note that they
17630         differ by exactly one bit.
17631 
17632         Finally from these two words we build proper UTF-8 sequence, taking
17633         into account the case (i.e, the number of bytes to write).
17634       */
17635       /**
17636        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
17637        * t2 => [0ccc|cccc] [10cc|cccc]
17638        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
17639        */
17640 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
17641       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
17642       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
17643       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
17644       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
17645       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
17646       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
17647 
17648       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
17649       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
17650       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
17651       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
17652       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
17653       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
17654       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
17655       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
17656       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
17657       const __m256i s4 = _mm256_xor_si256(s3, m0);
17658 #undef simdutf_vec
17659 
17660       // 4. expand words 16-bit => 32-bit
17661       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
17662       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
17663 
17664       // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
17665       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
17666                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
17667       // Due to the wider registers, the following path is less likely to be useful.
17668       /*if(mask == 0) {
17669         // We only have three-byte words. Use fast path.
17670         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
17671         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
17672         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
17673         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
17674         utf8_output += 12;
17675         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
17676         utf8_output += 12;
17677         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
17678         utf8_output += 12;
17679         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
17680         utf8_output += 12;
17681         buf += 16;
17682         continue;
17683       }*/
17684       const uint8_t mask0 = uint8_t(mask);
17685       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
17686       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
17687       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
17688 
17689       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
17690       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
17691       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
17692       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
17693 
17694       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
17695       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
17696       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
17697       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
17698 
17699 
17700       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
17701       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
17702       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
17703       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
17704 
17705       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
17706       utf8_output += row0[0];
17707       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
17708       utf8_output += row1[0];
17709       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
17710       utf8_output += row2[0];
17711       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
17712       utf8_output += row3[0];
17713       buf += 16;
17714     } else {
17715       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
17716       // Let us do a scalar fallback.
17717       // It may seem wasteful to use scalar code, but being efficient with SIMD
17718       // may require large, non-trivial tables?
17719       size_t forward = 15;
17720       size_t k = 0;
17721       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
17722       for(; k < forward; k++) {
17723         uint32_t word = buf[k];
17724         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
17725           *utf8_output++ = char(word);
17726         } else if((word & 0xFFFFF800)==0) { // 2-byte
17727           *utf8_output++ = char((word>>6) | 0b11000000);
17728           *utf8_output++ = char((word & 0b111111) | 0b10000000);
17729         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
17730           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
17731           *utf8_output++ = char((word>>12) | 0b11100000);
17732           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
17733           *utf8_output++ = char((word & 0b111111) | 0b10000000);
17734         } else {  // 4-byte
17735           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
17736           *utf8_output++ = char((word>>18) | 0b11110000);
17737           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
17738           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
17739           *utf8_output++ = char((word & 0b111111) | 0b10000000);
17740         }
17741       }
17742       buf += k;
17743     }
17744   } // while
17745 
17746   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
17747 }
17748 /* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
17749 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp
17750 /* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
17751 // file included directly
17752 
17753 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
17754 template <endianness big_endian>
avx512_convert_utf32_to_utf16(const char32_t * buf,size_t len,char16_t * utf16_output)17755 std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
17756   const char32_t* end = buf + len;
17757 
17758   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
17759   __m256i forbidden_bytemask = _mm256_setzero_si256();
17760 
17761 
17762   while (buf + 8 + safety_margin <= end) {
17763     __m256i in = _mm256_loadu_si256((__m256i*)buf);
17764 
17765     const __m256i v_00000000 = _mm256_setzero_si256();
17766     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
17767 
17768     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
17769     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
17770     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
17771 
17772     if (saturation_bitmask == 0xffffffff) {
17773       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
17774       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
17775       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
17776 
17777       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
17778       if (big_endian) {
17779         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
17780         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
17781       }
17782       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
17783       utf16_output += 8;
17784       buf += 8;
17785     } else {
17786       size_t forward = 7;
17787       size_t k = 0;
17788       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
17789       for(; k < forward; k++) {
17790         uint32_t word = buf[k];
17791         if((word & 0xFFFF0000)==0) {
17792           // will not generate a surrogate pair
17793           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
17794           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
17795         } else {
17796           // will generate a surrogate pair
17797           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
17798           word -= 0x10000;
17799           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
17800           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
17801           if (big_endian) {
17802             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
17803             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
17804           }
17805           *utf16_output++ = char16_t(high_surrogate);
17806           *utf16_output++ = char16_t(low_surrogate);
17807         }
17808       }
17809       buf += k;
17810     }
17811   }
17812 
17813   // check for invalid input
17814   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
17815 
17816   return std::make_pair(buf, utf16_output);
17817 }
17818 
17819 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
17820 template <endianness big_endian>
avx512_convert_utf32_to_utf16_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output)17821 std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
17822   const char32_t* start = buf;
17823   const char32_t* end = buf + len;
17824 
17825   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
17826 
17827   while (buf + 8 + safety_margin <= end) {
17828     __m256i in = _mm256_loadu_si256((__m256i*)buf);
17829 
17830     const __m256i v_00000000 = _mm256_setzero_si256();
17831     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
17832 
17833     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
17834     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
17835     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
17836 
17837     if (saturation_bitmask == 0xffffffff) {
17838       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
17839       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
17840       const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
17841       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
17842         return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
17843       }
17844 
17845       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
17846       if (big_endian) {
17847         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
17848         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
17849       }
17850       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
17851       utf16_output += 8;
17852       buf += 8;
17853     } else {
17854       size_t forward = 7;
17855       size_t k = 0;
17856       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
17857       for(; k < forward; k++) {
17858         uint32_t word = buf[k];
17859         if((word & 0xFFFF0000)==0) {
17860           // will not generate a surrogate pair
17861           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
17862           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
17863         } else {
17864           // will generate a surrogate pair
17865           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
17866           word -= 0x10000;
17867           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
17868           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
17869           if (big_endian) {
17870             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
17871             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
17872           }
17873           *utf16_output++ = char16_t(high_surrogate);
17874           *utf16_output++ = char16_t(low_surrogate);
17875         }
17876       }
17877       buf += k;
17878     }
17879   }
17880 
17881   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
17882 }
17883 /* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
17884 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp
17885 /* begin file src/icelake/icelake_ascii_validation.inl.cpp */
17886 // file included directly
17887 
validate_ascii(const char * buf,size_t len)17888 bool validate_ascii(const char* buf, size_t len) {
17889   const char* end = buf + len;
17890   const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
17891   __m512i running_or = _mm512_setzero_si512();
17892   for (; buf + 64 <= end; buf += 64) {
17893     const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
17894     running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
17895   }
17896   if(buf < end) {
17897      const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end-buf)) - 1,(const __m512i*)buf);
17898     running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
17899   }
17900   return (_mm512_test_epi8_mask(running_or, running_or) == 0);
17901 }
17902 /* end file src/icelake/icelake_ascii_validation.inl.cpp */
17903 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp
17904 /* begin file src/icelake/icelake_utf32_validation.inl.cpp */
17905 // file included directly
17906 
validate_utf32(const char32_t * buf,size_t len)17907 const char32_t* validate_utf32(const char32_t* buf, size_t len) {
17908     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
17909 
17910     const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
17911     __m512i currentmax = _mm512_setzero_si512();
17912     __m512i currentoffsetmax = _mm512_setzero_si512();
17913 
17914     while (buf <= end) {
17915       __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
17916       buf += 16;
17917       currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
17918       currentmax = _mm512_max_epu32(utf32, currentmax);
17919     }
17920 
17921     const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
17922     const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
17923     __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
17924     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
17925       return nullptr;
17926     }
17927     is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
17928     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
17929       return nullptr;
17930     }
17931 
17932     return buf;
17933 }
17934 /* end file src/icelake/icelake_utf32_validation.inl.cpp */
17935 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp
17936 /* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
17937 // file included directly
17938 
17939 /**
17940  * This function converts the input (inbuf, inlen), assumed to be valid
17941  * UTF16 (little endian) into UTF-8 (to outbuf). The number of words written
17942  * is written to 'outlen' and the function reports the number of input word
17943  * consumed.
17944  */
17945 template <endianness big_endian>
utf16_to_utf8_avx512i(const char16_t * inbuf,size_t inlen,unsigned char * outbuf,size_t * outlen)17946 size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
17947                                unsigned char *outbuf, size_t *outlen) {
17948   __m512i in;
17949   __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
17950   __m512i byteflip = _mm512_setr_epi64(
17951             0x0607040502030001,
17952             0x0e0f0c0d0a0b0809,
17953             0x0607040502030001,
17954             0x0e0f0c0d0a0b0809,
17955             0x0607040502030001,
17956             0x0e0f0c0d0a0b0809,
17957             0x0607040502030001,
17958             0x0e0f0c0d0a0b0809
17959         );
17960   const char16_t * const inbuf_orig = inbuf;
17961   const unsigned char * const outbuf_orig = outbuf;
17962   size_t adjust = 0;
17963   int carry = 0;
17964 
17965   while (inlen >= 32) {
17966     in = _mm512_loadu_si512(inbuf);
17967     if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
17968     inlen -= 31;
17969   lastiteration:
17970     inbuf += 31;
17971 
17972   failiteration:
17973     const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
17974       inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
17975 
17976     if (_ktestz_mask32_u8(inmask, is234byte)) {
17977       // fast path for ASCII only
17978       _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
17979       outbuf += 31;
17980       carry = 0;
17981 
17982       if (inlen < 32) {
17983         goto tail;
17984       } else {
17985         continue;
17986       }
17987     }
17988 
17989     const __mmask32 is12byte =
17990         _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
17991 
17992     if (_ktestc_mask32_u8(is12byte, inmask)) {
17993       // fast path for 1 and 2 byte only
17994 
17995       const __m512i twobytes = _mm512_ternarylogic_epi32(
17996           _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
17997           _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
17998       in = _mm512_mask_add_epi16(in, is234byte, twobytes,
17999                                  _mm512_set1_epi16(int16_t(0x80c0)));
18000       const __m512i cmpmask =
18001           _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
18002                                   _mm512_set1_epi16(0x0800));
18003       const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
18004       const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
18005       _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
18006                               out);
18007       outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
18008       carry = 0;
18009 
18010       if (inlen < 32) {
18011         goto tail;
18012       } else {
18013         continue;
18014       }
18015     }
18016     __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
18017     __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
18018 
18019 
18020     __m512i taglo = _mm512_set1_epi32(0x8080e000);
18021     __m512i taghi = taglo;
18022 
18023     const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
18024     const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
18025         inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
18026     const __mmask32 losurr = _mm512_cmp_epu16_mask(
18027         fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
18028 
18029     int carryout = 0;
18030     if (!_kortestz_mask32_u8(hisurr, losurr)) {
18031       // handle surrogates
18032 
18033       __m512i los = _mm512_alignr_epi32(hi, lo, 1);
18034       __m512i his = _mm512_alignr_epi32(lo, hi, 1);
18035 
18036       const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
18037       taglo =
18038           _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
18039       taghi =
18040           _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
18041 
18042       lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
18043       hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
18044       los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
18045       his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
18046       lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
18047       hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
18048 
18049       carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
18050 
18051       const uint32_t  h = _cvtmask32_u32(hisurr);
18052       const uint32_t  l = _cvtmask32_u32(losurr);
18053       // check for mismatched surrogates
18054       if ((h + h + carry) ^ l) {
18055         const uint32_t lonohi = l & ~(h + h + carry);
18056         const uint32_t hinolo = h & ~(l >> 1);
18057         inlen = _tzcnt_u32(hinolo | lonohi);
18058         inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
18059         in = _mm512_maskz_mov_epi16(inmask, in);
18060         adjust = (int)inlen - 31;
18061         inlen = 0;
18062         goto failiteration;
18063       }
18064     }
18065 
18066     hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi);
18067     carry = carryout;
18068 
18069     __m512i mslo =
18070         _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
18071 
18072     __m512i mshi =
18073         _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
18074 
18075     const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
18076     const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
18077 
18078     const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
18079     const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
18080     const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
18081 
18082     taglo =
18083         _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
18084     taghi =
18085         _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
18086     __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
18087                                       _mm512_set1_epi32(0x00010101));
18088     __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
18089                                       _mm512_set1_epi32(0x00010101));
18090 
18091 
18092     magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
18093                                       _mm512_set1_epi32(0x00010101));
18094     magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
18095                                       _mm512_set1_epi32(0x00010101));
18096 
18097     mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
18098                                      0xea); // A&B|C
18099     mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
18100                                      0xea);
18101     mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
18102 
18103     mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
18104 
18105     const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
18106     const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
18107     const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
18108     const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
18109     const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
18110     const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
18111 
18112     uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
18113     uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
18114 
18115     _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
18116     _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
18117     outbuf += advlo + advhi;
18118   }
18119   outbuf -= adjust;
18120 
18121 tail:
18122   if (inlen != 0) {
18123     // We must have inlen < 31.
18124     inmask = _cvtu32_mask32((1 << inlen) - 1);
18125     in = _mm512_maskz_loadu_epi16(inmask, inbuf);
18126     if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
18127     adjust = inlen - 31;
18128     inlen = 0;
18129     goto lastiteration;
18130   }
18131   *outlen = (outbuf - outbuf_orig) + adjust;
18132   return ((inbuf - inbuf_orig) + adjust);
18133 }
18134 /* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
18135 
18136 } // namespace
18137 } // namespace icelake
18138 } // namespace simdutf
18139 
18140 namespace simdutf {
18141 namespace icelake {
18142 
18143 
18144 simdutf_warn_unused int
detect_encodings(const char * input,size_t length) const18145 implementation::detect_encodings(const char *input,
18146                                  size_t length) const noexcept {
18147   // If there is a BOM, then we trust it.
18148   auto bom_encoding = simdutf::BOM::check_bom(input, length);
18149   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
18150   if (length % 2 == 0) {
18151     const char *buf = input;
18152 
18153     const char *start = buf;
18154     const char *end = input + length;
18155 
18156     bool is_utf8 = true;
18157     bool is_utf16 = true;
18158     bool is_utf32 = true;
18159 
18160     int out = 0;
18161 
18162     avx512_utf8_checker checker{};
18163     __m512i currentmax = _mm512_setzero_si512();
18164     while (buf + 64 <= end) {
18165       __m512i in = _mm512_loadu_si512((__m512i *)buf);
18166       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18167       __mmask32 surrogates =
18168           _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18169       if (surrogates) {
18170         is_utf8 = false;
18171 
18172         // Can still be either UTF-16LE or UTF-32 depending on the positions
18173         // of the surrogates To be valid UTF-32, a surrogate cannot be in the
18174         // two most significant bytes of any 32-bit word. On the other hand, to
18175         // be valid UTF-16LE, at least one surrogate must be in the two most
18176         // significant bytes of a 32-bit word since they always come in pairs in
18177         // UTF-16LE. Note that we always proceed in multiple of 4 before this
18178         // point so there is no offset in 32-bit words.
18179 
18180         if ((surrogates & 0xaaaaaaaa) != 0) {
18181           is_utf32 = false;
18182           __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
18183               diff, _mm512_set1_epi16(uint16_t(0x0400)));
18184           __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18185           // high must be followed by low
18186           if ((highsurrogates << 1) != lowsurrogates) {
18187             return simdutf::encoding_type::unspecified;
18188           }
18189 
18190           bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
18191           if (ends_with_high) {
18192             buf +=
18193                 31 *
18194                 sizeof(char16_t); // advance only by 31 words so that we start
18195                                   // with the high surrogate on the next round.
18196           } else {
18197             buf += 32 * sizeof(char16_t);
18198           }
18199           is_utf16 = validate_utf16le(reinterpret_cast<const char16_t *>(buf),
18200                                       (end - buf) / sizeof(char16_t));
18201           if (!is_utf16) {
18202             return simdutf::encoding_type::unspecified;
18203 
18204           } else {
18205             return simdutf::encoding_type::UTF16_LE;
18206           }
18207 
18208         } else {
18209           is_utf16 = false;
18210           // Check for UTF-32
18211           if (length % 4 == 0) {
18212             const char32_t *input32 = reinterpret_cast<const char32_t *>(buf);
18213             const char32_t *end32 =
18214                 reinterpret_cast<const char32_t *>(start) + length / 4;
18215             if (validate_utf32(input32, end32 - input32)) {
18216               return simdutf::encoding_type::UTF32_LE;
18217             }
18218           }
18219           return simdutf::encoding_type::unspecified;
18220         }
18221         break;
18222       }
18223       // If no surrogate, validate under other encodings as well
18224 
18225       // UTF-32 validation
18226       currentmax = _mm512_max_epu32(in, currentmax);
18227 
18228       // UTF-8 validation
18229       checker.check_next_input(in);
18230 
18231       buf += 64;
18232     }
18233 
18234     // Check which encodings are possible
18235 
18236     if (is_utf8) {
18237       size_t current_length = static_cast<size_t>(buf - start);
18238       if (current_length != length) {
18239         const __m512i utf8 = _mm512_maskz_loadu_epi8(
18240             (1ULL << (length - current_length)) - 1, (const __m512i *)buf);
18241         checker.check_next_input(utf8);
18242       }
18243       checker.check_eof();
18244       if (!checker.errors()) {
18245         out |= simdutf::encoding_type::UTF8;
18246       }
18247     }
18248 
18249     if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(
18250                         reinterpret_cast<const char16_t *>(buf),
18251                         (length - (buf - start)) / 2)) {
18252       out |= simdutf::encoding_type::UTF16_LE;
18253     }
18254 
18255     if (is_utf32 && (length % 4 == 0)) {
18256       currentmax = _mm512_max_epu32(
18257           _mm512_maskz_loadu_epi8(
18258               (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
18259               (const __m512i *)buf),
18260           currentmax);
18261       __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
18262                                 _MM_CMPINT_GT);
18263       if (outside_range == 0) {
18264         out |= simdutf::encoding_type::UTF32_LE;
18265       }
18266     }
18267 
18268     return out;
18269   } else if (implementation::validate_utf8(input, length)) {
18270     return simdutf::encoding_type::UTF8;
18271   } else {
18272     return simdutf::encoding_type::unspecified;
18273   }
18274 }
18275 
validate_utf8(const char * buf,size_t len) const18276 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
18277     avx512_utf8_checker checker{};
18278     const char* ptr = buf;
18279     const char* end = ptr + len;
18280     for (; ptr + 64 <= end; ptr += 64) {
18281         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
18282         checker.check_next_input(utf8);
18283     }
18284     {
18285        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
18286        checker.check_next_input(utf8);
18287     }
18288     checker.check_eof();
18289     return ! checker.errors();
18290 }
18291 
validate_utf8_with_errors(const char * buf,size_t len) const18292 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
18293     avx512_utf8_checker checker{};
18294     const char* ptr = buf;
18295     const char* end = ptr + len;
18296     size_t count{0};
18297     for (; ptr + 64 <= end; ptr += 64) {
18298       const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
18299       checker.check_next_input(utf8);
18300       if(checker.errors()) {
18301         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
18302         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
18303         res.count += count;
18304         return res;
18305       }
18306       count += 64;
18307     }
18308     {
18309       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
18310       checker.check_next_input(utf8);
18311       if(checker.errors()) {
18312         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
18313         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
18314         res.count += count;
18315         return res;
18316       } else {
18317         return result(error_code::SUCCESS, len);
18318       }
18319     }
18320 }
18321 
validate_ascii(const char * buf,size_t len) const18322 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
18323   return icelake::validate_ascii(buf, len);
18324 }
18325 
validate_ascii_with_errors(const char * buf,size_t len) const18326 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
18327   const char* buf_orig = buf;
18328   const char* end = buf + len;
18329   const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
18330   for (; buf + 64 <= end; buf += 64) {
18331     const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
18332     __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
18333     if(notascii) {
18334       return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
18335     }
18336   }
18337   {
18338     const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf);
18339     __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
18340     if(notascii) {
18341       return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
18342     }
18343   }
18344   return result(error_code::SUCCESS, len);
18345 }
18346 
validate_utf16le(const char16_t * buf,size_t len) const18347 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
18348     const char16_t *end = buf + len;
18349 
18350     for(;buf + 32 <= end; ) {
18351       __m512i in = _mm512_loadu_si512((__m512i*)buf);
18352       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18353       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18354       if(surrogates) {
18355         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18356         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18357         // high must be followed by low
18358         if ((highsurrogates << 1) != lowsurrogates) {
18359            return false;
18360         }
18361         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
18362         if(ends_with_high) {
18363           buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
18364         } else {
18365           buf += 32;
18366         }
18367       } else {
18368         buf += 32;
18369       }
18370     }
18371     if(buf < end) {
18372       __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
18373       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18374       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18375       if(surrogates) {
18376         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18377         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18378         // high must be followed by low
18379         if ((highsurrogates << 1) != lowsurrogates) {
18380            return false;
18381         }
18382       }
18383     }
18384     return true;
18385 }
18386 
validate_utf16be(const char16_t * buf,size_t len) const18387 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
18388    const char16_t *end = buf + len;
18389    const __m512i byteflip = _mm512_setr_epi64(
18390             0x0607040502030001,
18391             0x0e0f0c0d0a0b0809,
18392             0x0607040502030001,
18393             0x0e0f0c0d0a0b0809,
18394             0x0607040502030001,
18395             0x0e0f0c0d0a0b0809,
18396             0x0607040502030001,
18397             0x0e0f0c0d0a0b0809
18398         );
18399     for(;buf + 32 <= end; ) {
18400       __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
18401       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18402       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18403       if(surrogates) {
18404         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18405         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18406         // high must be followed by low
18407         if ((highsurrogates << 1) != lowsurrogates) {
18408            return false;
18409         }
18410         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
18411         if(ends_with_high) {
18412           buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
18413         } else {
18414           buf += 32;
18415         }
18416       } else {
18417         buf += 32;
18418       }
18419     }
18420     if(buf < end) {
18421       __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
18422       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18423       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18424       if(surrogates) {
18425         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18426         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18427         // high must be followed by low
18428         if ((highsurrogates << 1) != lowsurrogates) {
18429            return false;
18430         }
18431       }
18432     }
18433     return true;
18434 }
18435 
validate_utf16le_with_errors(const char16_t * buf,size_t len) const18436 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
18437     const char16_t *start_buf = buf;
18438     const char16_t *end = buf + len;
18439     for(;buf + 32 <= end; ) {
18440       __m512i in = _mm512_loadu_si512((__m512i*)buf);
18441       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18442       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18443       if(surrogates) {
18444         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18445         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18446         // high must be followed by low
18447         if ((highsurrogates << 1) != lowsurrogates) {
18448           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
18449           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
18450           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
18451         }
18452         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
18453         if(ends_with_high) {
18454           buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
18455         } else {
18456           buf += 32;
18457         }
18458       } else {
18459         buf += 32;
18460       }
18461     }
18462     if(buf < end) {
18463       __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
18464       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18465       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18466       if(surrogates) {
18467         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18468         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18469         // high must be followed by low
18470         if ((highsurrogates << 1) != lowsurrogates) {
18471           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
18472           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
18473           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
18474         }
18475       }
18476     }
18477     return result(error_code::SUCCESS, len);
18478 }
18479 
validate_utf16be_with_errors(const char16_t * buf,size_t len) const18480 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
18481     const char16_t *start_buf = buf;
18482     const char16_t *end = buf + len;
18483     const __m512i byteflip = _mm512_setr_epi64(
18484             0x0607040502030001,
18485             0x0e0f0c0d0a0b0809,
18486             0x0607040502030001,
18487             0x0e0f0c0d0a0b0809,
18488             0x0607040502030001,
18489             0x0e0f0c0d0a0b0809,
18490             0x0607040502030001,
18491             0x0e0f0c0d0a0b0809
18492         );
18493     for(;buf + 32 <= end; ) {
18494       __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
18495       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18496       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18497       if(surrogates) {
18498         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18499         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18500         // high must be followed by low
18501         if ((highsurrogates << 1) != lowsurrogates) {
18502           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
18503           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
18504           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
18505         }
18506         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
18507         if(ends_with_high) {
18508           buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
18509         } else {
18510           buf += 32;
18511         }
18512       } else {
18513         buf += 32;
18514       }
18515     }
18516     if(buf < end) {
18517       __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
18518       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
18519       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
18520       if(surrogates) {
18521         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
18522         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
18523         // high must be followed by low
18524         if ((highsurrogates << 1) != lowsurrogates) {
18525           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
18526           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
18527           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
18528         }
18529       }
18530     }
18531     return result(error_code::SUCCESS, len);
18532 }
18533 
validate_utf32(const char32_t * buf,size_t len) const18534 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
18535   const char32_t * tail = icelake::validate_utf32(buf, len);
18536   if (tail) {
18537     return scalar::utf32::validate(tail, len - (tail - buf));
18538   } else {
18539     return false;
18540   }
18541 }
18542 
validate_utf32_with_errors(const char32_t * buf,size_t len) const18543 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
18544 
18545     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
18546     const char32_t* buf_orig = buf;
18547     while (buf <= end) {
18548       __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
18549       __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
18550                                 _MM_CMPINT_GT);
18551       if (outside_range) {
18552         return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
18553       }
18554 
18555       __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
18556 
18557       __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
18558                                 _MM_CMPINT_GT);
18559       if (surrogate_range) {
18560         return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
18561       }
18562       buf += 16;
18563     }
18564     if(buf < buf_orig + len) {
18565       __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_orig + len - buf))-1),(const __m512i*)buf);
18566       __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
18567                                 _MM_CMPINT_GT);
18568       if (outside_range) {
18569         return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
18570       }
18571       __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
18572 
18573       __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
18574                                 _MM_CMPINT_GT);
18575       if (surrogate_range) {
18576         return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
18577       }
18578     }
18579 
18580     return result(error_code::SUCCESS, len);
18581 }
18582 
convert_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const18583 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
18584   utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
18585   if (ret.second == nullptr) {
18586     return 0;
18587   }
18588   return ret.second - utf16_output;
18589 }
18590 
convert_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const18591 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
18592   utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
18593   if (ret.second == nullptr) {
18594     return 0;
18595   }
18596   return ret.second - utf16_output;
18597 }
18598 
convert_utf8_to_utf16le_with_errors(const char * buf,size_t len,char16_t * utf16_output) const18599 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
18600    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
18601 }
18602 
convert_utf8_to_utf16be_with_errors(const char * buf,size_t len,char16_t * utf16_output) const18603 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
18604    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
18605 }
18606 
convert_valid_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const18607 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
18608   utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
18609   size_t saved_bytes = ret.second - utf16_output;
18610   const char* end = buf + len;
18611   if (ret.first == end) {
18612     return saved_bytes;
18613   }
18614 
18615   // Note: AVX512 procedure looks up 4 bytes forward, and
18616   //       correctly converts multi-byte chars even if their
18617   //       continuation bytes lie outsiede 16-byte window.
18618   //       It meas, we have to skip continuation bytes from
18619   //       the beginning ret.first, as they were already consumed.
18620   while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
18621       ret.first += 1;
18622   }
18623 
18624   if (ret.first != end) {
18625     const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
18626                                         ret.first, len - (ret.first - buf), ret.second);
18627     if (scalar_saved_bytes == 0) { return 0; }
18628     saved_bytes += scalar_saved_bytes;
18629   }
18630 
18631   return saved_bytes;
18632 }
18633 
convert_valid_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const18634 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
18635   utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
18636   size_t saved_bytes = ret.second - utf16_output;
18637   const char* end = buf + len;
18638   if (ret.first == end) {
18639     return saved_bytes;
18640   }
18641 
18642   // Note: AVX512 procedure looks up 4 bytes forward, and
18643   //       correctly converts multi-byte chars even if their
18644   //       continuation bytes lie outsiede 16-byte window.
18645   //       It meas, we have to skip continuation bytes from
18646   //       the beginning ret.first, as they were already consumed.
18647   while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
18648       ret.first += 1;
18649   }
18650 
18651   if (ret.first != end) {
18652     const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
18653                                         ret.first, len - (ret.first - buf), ret.second);
18654     if (scalar_saved_bytes == 0) { return 0; }
18655     saved_bytes += scalar_saved_bytes;
18656   }
18657 
18658   return saved_bytes;
18659 }
18660 
18661 
convert_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_out) const18662 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
18663   uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
18664   utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
18665   if (ret.second == nullptr)
18666     return 0;
18667 
18668   size_t saved_bytes = ret.second - utf32_output;
18669   const char* end = buf + len;
18670   if (ret.first == end) {
18671     return saved_bytes;
18672   }
18673 
18674   // Note: the AVX512 procedure looks up 4 bytes forward, and
18675   //       correctly converts multi-byte chars even if their
18676   //       continuation bytes lie outside 16-byte window.
18677   //       It means, we have to skip continuation bytes from
18678   //       the beginning ret.first, as they were already consumed.
18679   while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
18680       ret.first += 1;
18681   }
18682 
18683   if (ret.first != end) {
18684     const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
18685                                         ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
18686     if (scalar_saved_bytes == 0) { return 0; }
18687     saved_bytes += scalar_saved_bytes;
18688   }
18689 
18690   return saved_bytes;
18691 }
18692 
convert_utf8_to_utf32_with_errors(const char * buf,size_t len,char32_t * utf32) const18693 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept {
18694   uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32);
18695   auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
18696   if (!std::get<2>(ret)) {
18697     auto new_buf = std::get<0>(ret);
18698     // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
18699     // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
18700     result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t *>(std::get<1>(ret)));
18701     res.count += (std::get<0>(ret) - buf);
18702     return res;
18703   }
18704   size_t saved_bytes = std::get<1>(ret) - utf32_output;
18705   const char* end = buf + len;
18706   if (std::get<0>(ret) == end) {
18707     return {simdutf::SUCCESS, saved_bytes};
18708   }
18709 
18710   // Note: the AVX512 procedure looks up 4 bytes forward, and
18711   //       correctly converts multi-byte chars even if their
18712   //       continuation bytes lie outside 16-byte window.
18713   //       It means, we have to skip continuation bytes from
18714   //       the beginning ret.first, as they were already consumed.
18715   while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
18716       std::get<0>(ret) += 1;
18717   }
18718 
18719   if (std::get<0>(ret) != end) {
18720     auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
18721                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
18722     if (scalar_result.error != simdutf::SUCCESS) {
18723       scalar_result.count +=  (std::get<0>(ret) - buf);
18724     } else {
18725       scalar_result.count += saved_bytes;
18726     }
18727     return scalar_result;
18728   }
18729 
18730   return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
18731 }
18732 
18733 
convert_valid_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_out) const18734 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
18735   uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
18736   utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
18737   size_t saved_bytes = ret.second - utf32_output;
18738   const char* end = buf + len;
18739   if (ret.first == end) {
18740     return saved_bytes;
18741   }
18742 
18743   // Note: AVX512 procedure looks up 4 bytes forward, and
18744   //       correctly converts multi-byte chars even if their
18745   //       continuation bytes lie outsiede 16-byte window.
18746   //       It meas, we have to skip continuation bytes from
18747   //       the beginning ret.first, as they were already consumed.
18748   while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
18749       ret.first += 1;
18750   }
18751 
18752   if (ret.first != end) {
18753     const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
18754                                         ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
18755     if (scalar_saved_bytes == 0) { return 0; }
18756     saved_bytes += scalar_saved_bytes;
18757   }
18758 
18759   return saved_bytes;
18760 }
18761 
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const18762 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
18763   size_t outlen;
18764   size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
18765   if(inlen != len) { return 0; }
18766   return outlen;
18767 }
18768 
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const18769 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
18770   size_t outlen;
18771   size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
18772   if(inlen != len) { return 0; }
18773   return outlen;
18774 }
18775 
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const18776 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
18777   size_t outlen;
18778   size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
18779   if(inlen != len) {
18780     result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - outlen, utf8_output + outlen);
18781     res.count += inlen;
18782     return res;
18783   }
18784   return {simdutf::SUCCESS, outlen};
18785 }
18786 
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const18787 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
18788   size_t outlen;
18789   size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
18790   if(inlen != len) {
18791     result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - outlen, utf8_output + outlen);
18792     res.count += inlen;
18793     return res;
18794   }
18795   return {simdutf::SUCCESS, outlen};
18796 }
18797 
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const18798 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
18799   return convert_utf16le_to_utf8(buf, len, utf8_output);
18800 }
18801 
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const18802 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
18803   return convert_utf16be_to_utf8(buf, len, utf8_output);
18804 }
18805 
18806 
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const18807 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
18808   std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
18809   if (ret.first == nullptr) { return 0; }
18810   size_t saved_bytes = ret.second - utf8_output;
18811   if (ret.first != buf + len) {
18812     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
18813                                         ret.first, len - (ret.first - buf), ret.second);
18814     if (scalar_saved_bytes == 0) { return 0; }
18815     saved_bytes += scalar_saved_bytes;
18816   }
18817   return saved_bytes;
18818 }
18819 
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output) const18820 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
18821   // ret.first.count is always the position in the buffer, not the number of words written even if finished
18822   std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
18823   if (ret.first.count != len) {
18824     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
18825                                         buf + ret.first.count, len - ret.first.count, ret.second);
18826     if (scalar_res.error) {
18827       scalar_res.count += ret.first.count;
18828       return scalar_res;
18829     } else {
18830       ret.second += scalar_res.count;
18831     }
18832   }
18833   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
18834   return ret.first;
18835 }
18836 
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const18837 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
18838   return convert_utf32_to_utf8(buf, len, utf8_output);
18839 }
18840 
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const18841 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
18842   std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
18843   if (ret.first == nullptr) { return 0; }
18844   size_t saved_bytes = ret.second - utf16_output;
18845   if (ret.first != buf + len) {
18846     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
18847                                         ret.first, len - (ret.first - buf), ret.second);
18848     if (scalar_saved_bytes == 0) { return 0; }
18849     saved_bytes += scalar_saved_bytes;
18850   }
18851   return saved_bytes;
18852 }
18853 
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const18854 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
18855   std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
18856   if (ret.first == nullptr) { return 0; }
18857   size_t saved_bytes = ret.second - utf16_output;
18858   if (ret.first != buf + len) {
18859     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
18860                                         ret.first, len - (ret.first - buf), ret.second);
18861     if (scalar_saved_bytes == 0) { return 0; }
18862     saved_bytes += scalar_saved_bytes;
18863   }
18864   return saved_bytes;
18865 }
18866 
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const18867 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
18868   // ret.first.count is always the position in the buffer, not the number of words written even if finished
18869   std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
18870   if (ret.first.count != len) {
18871     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
18872                                         buf + ret.first.count, len - ret.first.count, ret.second);
18873     if (scalar_res.error) {
18874       scalar_res.count += ret.first.count;
18875       return scalar_res;
18876     } else {
18877       ret.second += scalar_res.count;
18878     }
18879   }
18880   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
18881   return ret.first;
18882 }
18883 
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const18884 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
18885   // ret.first.count is always the position in the buffer, not the number of words written even if finished
18886   std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
18887   if (ret.first.count != len) {
18888     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
18889                                         buf + ret.first.count, len - ret.first.count, ret.second);
18890     if (scalar_res.error) {
18891       scalar_res.count += ret.first.count;
18892       return scalar_res;
18893     } else {
18894       ret.second += scalar_res.count;
18895     }
18896   }
18897   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
18898   return ret.first;
18899 }
18900 
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const18901 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
18902   return convert_utf32_to_utf16le(buf, len, utf16_output);
18903 }
18904 
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const18905 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
18906   return convert_utf32_to_utf16be(buf, len, utf16_output);
18907 }
18908 
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const18909 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
18910   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
18911   if (!std::get<2>(ret)) { return 0; }
18912   size_t saved_bytes = std::get<1>(ret) - utf32_output;
18913   if (std::get<0>(ret) != buf + len) {
18914     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
18915                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
18916     if (scalar_saved_bytes == 0) { return 0; }
18917     saved_bytes += scalar_saved_bytes;
18918   }
18919   return saved_bytes;
18920 }
18921 
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const18922 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
18923   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
18924   if (!std::get<2>(ret)) { return 0; }
18925   size_t saved_bytes = std::get<1>(ret) - utf32_output;
18926   if (std::get<0>(ret) != buf + len) {
18927     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
18928                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
18929     if (scalar_saved_bytes == 0) { return 0; }
18930     saved_bytes += scalar_saved_bytes;
18931   }
18932   return saved_bytes;
18933 }
18934 
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const18935 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
18936   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
18937   if (!std::get<2>(ret)) {
18938     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
18939                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
18940     scalar_res.count += (std::get<0>(ret) - buf);
18941     return scalar_res;
18942   }
18943   size_t saved_bytes = std::get<1>(ret) - utf32_output;
18944   if (std::get<0>(ret) != buf + len) {
18945     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
18946                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
18947     if (scalar_res.error) {
18948       scalar_res.count += (std::get<0>(ret) - buf);
18949       return scalar_res;
18950     } else {
18951       scalar_res.count += saved_bytes;
18952       return scalar_res;
18953     }
18954   }
18955   return simdutf::result(simdutf::SUCCESS, saved_bytes);
18956 }
18957 
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const18958 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
18959   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
18960   if (!std::get<2>(ret)) {
18961     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
18962                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
18963     scalar_res.count += (std::get<0>(ret) - buf);
18964     return scalar_res;
18965   }
18966   size_t saved_bytes = std::get<1>(ret) - utf32_output;
18967   if (std::get<0>(ret) != buf + len) {
18968     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
18969                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
18970     if (scalar_res.error) {
18971       scalar_res.count += (std::get<0>(ret) - buf);
18972       return scalar_res;
18973     } else {
18974       scalar_res.count += saved_bytes;
18975       return scalar_res;
18976     }
18977   }
18978   return simdutf::result(simdutf::SUCCESS, saved_bytes);
18979 }
18980 
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const18981 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
18982   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
18983   if (!std::get<2>(ret)) { return 0; }
18984   size_t saved_bytes = std::get<1>(ret) - utf32_output;
18985   if (std::get<0>(ret) != buf + len) {
18986     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
18987                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
18988     if (scalar_saved_bytes == 0) { return 0; }
18989     saved_bytes += scalar_saved_bytes;
18990   }
18991   return saved_bytes;
18992 }
18993 
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const18994 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
18995   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
18996   if (!std::get<2>(ret)) { return 0; }
18997   size_t saved_bytes = std::get<1>(ret) - utf32_output;
18998   if (std::get<0>(ret) != buf + len) {
18999     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
19000                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
19001     if (scalar_saved_bytes == 0) { return 0; }
19002     saved_bytes += scalar_saved_bytes;
19003   }
19004   return saved_bytes;
19005 }
19006 
change_endianness_utf16(const char16_t * input,size_t length,char16_t * output) const19007 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
19008   size_t pos = 0;
19009   const __m512i byteflip = _mm512_setr_epi64(
19010             0x0607040502030001,
19011             0x0e0f0c0d0a0b0809,
19012             0x0607040502030001,
19013             0x0e0f0c0d0a0b0809,
19014             0x0607040502030001,
19015             0x0e0f0c0d0a0b0809,
19016             0x0607040502030001,
19017             0x0e0f0c0d0a0b0809
19018         );
19019   while (pos + 32 <= length) {
19020     __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
19021     utf16 = _mm512_shuffle_epi8(utf16, byteflip);
19022     _mm512_storeu_si512(output + pos, utf16);
19023     pos += 32;
19024   }
19025   if(pos < length) {
19026     __mmask32 m((1<< (length - pos))-1);
19027     __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
19028     utf16 = _mm512_shuffle_epi8(utf16, byteflip);
19029     _mm512_mask_storeu_epi16(output + pos, m, utf16);
19030   }
19031 }
19032 
19033 
count_utf16le(const char16_t * input,size_t length) const19034 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
19035   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
19036   const char16_t* ptr = input;
19037 
19038   const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
19039   const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
19040 
19041   size_t count{0};
19042 
19043   while (ptr <= end) {
19044     __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
19045     ptr += 32;
19046     uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
19047     count += count_ones(not_high_surrogate);
19048   }
19049 
19050   return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
19051 }
19052 
count_utf16be(const char16_t * input,size_t length) const19053 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
19054   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
19055   const char16_t* ptr = input;
19056 
19057   const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
19058   const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
19059 
19060   size_t count{0};
19061   const __m512i byteflip = _mm512_setr_epi64(
19062             0x0607040502030001,
19063             0x0e0f0c0d0a0b0809,
19064             0x0607040502030001,
19065             0x0e0f0c0d0a0b0809,
19066             0x0607040502030001,
19067             0x0e0f0c0d0a0b0809,
19068             0x0607040502030001,
19069             0x0e0f0c0d0a0b0809
19070         );
19071   while (ptr <= end) {
19072     __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
19073     ptr += 32;
19074     uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
19075     count += count_ones(not_high_surrogate);
19076   }
19077 
19078   return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
19079 }
19080 
19081 
count_utf8(const char * input,size_t length) const19082 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
19083   const char* end = length >= 64 ? input + length - 64 : nullptr;
19084   const char* ptr = input;
19085 
19086   const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
19087 
19088   size_t count{0};
19089 
19090   while (ptr <= end) {
19091     __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
19092     ptr += 64;
19093     uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(utf8, continuation));
19094     count += 64 - count_ones(continuation_bitmask);
19095   }
19096 
19097   return count + scalar::utf8::count_code_points(ptr, length - (ptr - input));
19098 }
19099 
19100 
utf8_length_from_utf16le(const char16_t * input,size_t length) const19101 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
19102   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
19103   const char16_t* ptr = input;
19104 
19105   const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
19106   const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
19107   const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
19108   const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
19109 
19110   size_t count{0};
19111 
19112   while (ptr <= end) {
19113     __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
19114     ptr += 32;
19115     __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
19116     __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
19117     __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
19118     __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
19119 
19120     size_t ascii_count = count_ones(ascii_bitmask);
19121     size_t two_bytes_count = count_ones(two_bytes_bitmask);
19122     size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
19123     size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
19124 
19125     count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
19126   }
19127 
19128   return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
19129 }
19130 
utf8_length_from_utf16be(const char16_t * input,size_t length) const19131 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
19132   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
19133   const char16_t* ptr = input;
19134 
19135   const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
19136   const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
19137   const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
19138   const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
19139 
19140   size_t count{0};
19141   const __m512i byteflip = _mm512_setr_epi64(
19142             0x0607040502030001,
19143             0x0e0f0c0d0a0b0809,
19144             0x0607040502030001,
19145             0x0e0f0c0d0a0b0809,
19146             0x0607040502030001,
19147             0x0e0f0c0d0a0b0809,
19148             0x0607040502030001,
19149             0x0e0f0c0d0a0b0809
19150         );
19151   while (ptr <= end) {
19152     __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
19153     utf16 = _mm512_shuffle_epi8(utf16, byteflip);
19154     ptr += 32;
19155     __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
19156     __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
19157     __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
19158     __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
19159 
19160     size_t ascii_count = count_ones(ascii_bitmask);
19161     size_t two_bytes_count = count_ones(two_bytes_bitmask);
19162     size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
19163     size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
19164     count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
19165   }
19166 
19167   return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
19168 }
19169 
utf32_length_from_utf16le(const char16_t * input,size_t length) const19170 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
19171   return implementation::count_utf16le(input, length);
19172 }
19173 
utf32_length_from_utf16be(const char16_t * input,size_t length) const19174 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
19175   return implementation::count_utf16be(input, length);
19176 }
19177 
utf16_length_from_utf8(const char * input,size_t length) const19178 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
19179     size_t pos = 0;
19180     size_t count = 0;
19181     // This algorithm could no doubt be improved!
19182     for(;pos + 64 <= length; pos += 64) {
19183       __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
19184       uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65+1));
19185       // We count one word for anything that is not a continuation (so
19186       // leading bytes).
19187       count += 64 - count_ones(utf8_continuation_mask);
19188       uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
19189       count += count_ones(utf8_4byte);
19190     }
19191     return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
19192 }
19193 
utf8_length_from_utf32(const char32_t * input,size_t length) const19194 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
19195   const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
19196   const char32_t* ptr = input;
19197 
19198   const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
19199   const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
19200   const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
19201 
19202   size_t count{0};
19203 
19204   while (ptr <= end) {
19205     __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
19206     ptr += 16;
19207     __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
19208     __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
19209     __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
19210 
19211     size_t ascii_count = count_ones(ascii_bitmask);
19212     size_t two_bytes_count = count_ones(two_bytes_bitmask);
19213     size_t three_bytes_count = count_ones(three_bytes_bitmask);
19214     size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
19215     count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
19216   }
19217 
19218   return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
19219 }
19220 
utf16_length_from_utf32(const char32_t * input,size_t length) const19221 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
19222   const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
19223   const char32_t* ptr = input;
19224 
19225   const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
19226 
19227   size_t count{0};
19228 
19229   while (ptr <= end) {
19230     __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
19231     ptr += 16;
19232     __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
19233 
19234     count += 16 + count_ones(surrogates_bitmask);
19235   }
19236 
19237   return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
19238 }
19239 
utf32_length_from_utf8(const char * input,size_t length) const19240 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
19241   return implementation::count_utf8(input, length);
19242 }
19243 
19244 } // namespace icelake
19245 } // namespace simdutf
19246 
19247 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
19248 /* begin file src/simdutf/icelake/end.h */
19249 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
19250 // nothing needed.
19251 #else
19252 SIMDUTF_UNTARGET_REGION
19253 #endif
19254 
19255 
19256 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
19257 SIMDUTF_POP_DISABLE_WARNINGS
19258 #endif // end of workaround
19259 /* end file src/simdutf/icelake/end.h */
19260 /* end file src/icelake/implementation.cpp */
19261 #endif
19262 #if SIMDUTF_IMPLEMENTATION_HASWELL
19263 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp
19264 /* begin file src/haswell/implementation.cpp */
19265 
19266 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
19267 /* begin file src/simdutf/haswell/begin.h */
19268 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
19269 // #define SIMDUTF_IMPLEMENTATION haswell
19270 
19271 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
19272 // nothing needed.
19273 #else
19274 SIMDUTF_TARGET_HASWELL
19275 #endif
19276 
19277 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
19278 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
19279 #endif // end of workaround
19280 /* end file src/simdutf/haswell/begin.h */
19281 namespace simdutf {
19282 namespace haswell {
19283 namespace {
19284 #ifndef SIMDUTF_HASWELL_H
19285 #error "haswell.h must be included"
19286 #endif
19287 using namespace simd;
19288 
19289 
is_ascii(const simd8x64<uint8_t> & input)19290 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
19291   return input.reduce_or().is_ascii();
19292 }
19293 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)19294 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
19295   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
19296   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
19297   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
19298   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
19299   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
19300 }
19301 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)19302 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
19303   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
19304   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
19305   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
19306   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
19307 }
19308 
19309 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_detect_encodings.cpp
19310 /* begin file src/haswell/avx2_detect_encodings.cpp */
19311 template<class checker>
19312 // len is known to be a multiple of 2 when this is called
avx2_detect_encodings(const char * buf,size_t len)19313 int avx2_detect_encodings(const char * buf, size_t len) {
19314     const char* start = buf;
19315     const char* end = buf + len;
19316 
19317     bool is_utf8 = true;
19318     bool is_utf16 = true;
19319     bool is_utf32 = true;
19320 
19321     int out = 0;
19322 
19323     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
19324     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
19325 
19326     __m256i currentmax = _mm256_setzero_si256();
19327 
19328     checker check{};
19329 
19330     while(buf + 64 <= end) {
19331         __m256i in = _mm256_loadu_si256((__m256i*)buf);
19332         __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
19333 
19334         const auto u0 = simd16<uint16_t>(in);
19335         const auto u1 = simd16<uint16_t>(nextin);
19336 
19337         const auto v0 = u0.shr<8>();
19338         const auto v1 = u1.shr<8>();
19339 
19340         const auto in16 = simd16<uint16_t>::pack(v0, v1);
19341 
19342         const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
19343         uint32_t surrogates_bitmask0 = surrogates_wordmask0.to_bitmask();
19344 
19345         // Check for surrogates
19346         if (surrogates_bitmask0 != 0x0) {
19347             // Cannot be UTF8
19348             is_utf8 = false;
19349             // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
19350             // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
19351             // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
19352             // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
19353             // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words.
19354 
19355             if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) {
19356                 is_utf32 = false;
19357                 // Code from avx2_validate_utf16le.cpp
19358                 const char16_t * input = reinterpret_cast<const char16_t*>(buf);
19359                 const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
19360 
19361                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
19362                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
19363 
19364                 const uint32_t V0 = ~surrogates_bitmask0;
19365 
19366                 const auto    vH0 = (in16 & v_fc) == v_dc;
19367                 const uint32_t H0 = vH0.to_bitmask();
19368 
19369                 const uint32_t L0 = ~H0 & surrogates_bitmask0;
19370 
19371                 const uint32_t a0 = L0 & (H0 >> 1);
19372                 const uint32_t b0 = a0 << 1;
19373                 const uint32_t c0 = V0 | a0 | b0;
19374 
19375                 if (c0 == 0xffffffff) {
19376                     input += simd16<uint16_t>::ELEMENTS * 2;
19377                 } else if (c0 == 0x7fffffff) {
19378                     input += simd16<uint16_t>::ELEMENTS * 2 - 1;
19379                 } else {
19380                     return simdutf::encoding_type::unspecified;
19381                 }
19382 
19383                 while (input + simd16<uint16_t>::ELEMENTS * 2 < end16) {
19384                     const auto in0 = simd16<uint16_t>(input);
19385                     const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
19386 
19387                     const auto t0 = in0.shr<8>();
19388                     const auto t1 = in1.shr<8>();
19389 
19390                     const auto in_16 = simd16<uint16_t>::pack(t0, t1);
19391 
19392                     const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
19393                     const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
19394                     if (surrogates_bitmask == 0x0) {
19395                         input += simd16<uint16_t>::ELEMENTS * 2;
19396                     } else {
19397                         const uint32_t V = ~surrogates_bitmask;
19398 
19399                         const auto    vH = (in_16 & v_fc) == v_dc;
19400                         const uint32_t H = vH.to_bitmask();
19401 
19402                         const uint32_t L = ~H & surrogates_bitmask;
19403 
19404                         const uint32_t a = L & (H >> 1);
19405 
19406                         const uint32_t b = a << 1;
19407 
19408                         const uint32_t c = V | a | b;
19409 
19410                         if (c == 0xffffffff) {
19411                             input += simd16<uint16_t>::ELEMENTS * 2;
19412                         } else if (c == 0x7fffffff) {
19413                             input += simd16<uint16_t>::ELEMENTS * 2 - 1;
19414                         } else {
19415                             return simdutf::encoding_type::unspecified;
19416                         }
19417                     }
19418                 }
19419             } else {
19420                 is_utf16 = false;
19421                 // Check for UTF-32
19422                 if (len % 4 == 0) {
19423                     const char32_t * input = reinterpret_cast<const char32_t*>(buf);
19424                     const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
19425 
19426                     // Must start checking for surrogates
19427                     __m256i currentoffsetmax = _mm256_setzero_si256();
19428                     const __m256i offset = _mm256_set1_epi32(0xffff2000);
19429                     const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
19430 
19431                     currentmax = _mm256_max_epu32(in, currentmax);
19432                     currentmax = _mm256_max_epu32(nextin, currentmax);
19433 
19434                     currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
19435                     currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax);
19436 
19437                     while (input + 8 < end32) {
19438                         const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
19439                         currentmax = _mm256_max_epu32(in32,currentmax);
19440                         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax);
19441                         input += 8;
19442                     }
19443 
19444                     __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
19445                     if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
19446                         return simdutf::encoding_type::unspecified;
19447                     }
19448                 } else {
19449                     return simdutf::encoding_type::unspecified;
19450                 }
19451             }
19452             break;
19453         }
19454         // If no surrogate, validate under other encodings as well
19455 
19456         // UTF-32 validation
19457         currentmax = _mm256_max_epu32(in, currentmax);
19458         currentmax = _mm256_max_epu32(nextin, currentmax);
19459 
19460         // UTF-8 validation
19461         // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
19462         simd::simd8x64<uint8_t> in8(in, nextin);
19463         check.check_next_input(in8);
19464 
19465         buf += 64;
19466     }
19467 
19468     // Check which encodings are possible
19469 
19470     if (is_utf8) {
19471         if (static_cast<size_t>(buf - start) != len) {
19472             uint8_t block[64]{};
19473             std::memset(block, 0x20, 64);
19474             std::memcpy(block, buf, len - (buf - start));
19475             simd::simd8x64<uint8_t> in(block);
19476             check.check_next_input(in);
19477         }
19478         if (!check.errors()) {
19479             out |= simdutf::encoding_type::UTF8;
19480         }
19481     }
19482 
19483     if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
19484         out |= simdutf::encoding_type::UTF16_LE;
19485     }
19486 
19487     if (is_utf32 && (len % 4 == 0)) {
19488         const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
19489         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
19490         if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
19491             out |= simdutf::encoding_type::UTF32_LE;
19492         }
19493     }
19494 
19495     return out;
19496 }
19497 /* end file src/haswell/avx2_detect_encodings.cpp */
19498 
19499 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16.cpp
19500 /* begin file src/haswell/avx2_validate_utf16.cpp */
19501 /*
19502     In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
19503 
19504     In a vectorized algorithm we want to examine the most significant
19505     nibble in order to select a fast path. If none of the highest nibbles
19506     is 0xD (13), then we are sure that the UTF-16 chunk in a vector
19507     register is valid.
19508 
19509     Let us analyze what we need to check if the nibble is 0xD. The
19510     value of the preceding nibble determines what we have:
19511 
19512     0xd000 .. 0xd7ff - a valid word
19513     0xd800 .. 0xdbff - low surrogate
19514     0xdc00 .. 0xdfff - high surrogate
19515 
19516     Other constraints we have to consider:
19517     - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
19518     - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
19519     - there must not be sole low surrogate nor high surrogate
19520 
19521     We're going to build three bitmasks based on the 3rd nibble:
19522     - V = valid word,
19523     - L = low surrogate (0xd800 .. 0xdbff)
19524     - H = high surrogate (0xdc00 .. 0xdfff)
19525 
19526       0   1   2   3   4   5   6   7    <--- word index
19527     [ V | L | H | L | H | V | V | L ]
19528       1   0   0   0   0   1   1   0     - V = valid masks
19529       0   1   0   1   0   0   0   1     - L = low surrogate
19530       0   0   1   0   1   0   0   0     - H high surrogate
19531 
19532 
19533       1   0   0   0   0   1   1   0   V = valid masks
19534       0   1   0   1   0   0   0   0   a = L & (H >> 1)
19535       0   0   1   0   1   0   0   0   b = a << 1
19536       1   1   1   1   1   1   1   0   c = V | a | b
19537                                   ^
19538                                   the last bit can be zero, we just consume 7 words
19539                                   and recheck this word in the next iteration
19540 */
19541 
19542 /* Returns:
19543    - pointer to the last unprocessed character (a scalar fallback should check the rest);
19544    - nullptr if an error was detected.
19545 */
19546 template <endianness big_endian>
avx2_validate_utf16(const char16_t * input,size_t size)19547 const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
19548     const char16_t* end = input + size;
19549 
19550     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
19551     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
19552     const auto v_fc = simd8<uint8_t>::splat(0xfc);
19553     const auto v_dc = simd8<uint8_t>::splat(0xdc);
19554 
19555     while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
19556         // 0. Load data: since the validation takes into account only higher
19557         //    byte of each word, we compress the two vectors into one which
19558         //    consists only the higher bytes.
19559         auto in0 = simd16<uint16_t>(input);
19560         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
19561 
19562         if (big_endian) {
19563             in0 = in0.swap_bytes();
19564             in1 = in1.swap_bytes();
19565         }
19566 
19567         const auto t0 = in0.shr<8>();
19568         const auto t1 = in1.shr<8>();
19569 
19570         const auto in = simd16<uint16_t>::pack(t0, t1);
19571 
19572         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
19573         const auto surrogates_wordmask = (in & v_f8) == v_d8;
19574         const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
19575         if (surrogates_bitmask == 0x0) {
19576             input += simd16<uint16_t>::ELEMENTS * 2;
19577         } else {
19578             // 2. We have some surrogates that have to be distinguished:
19579             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
19580             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
19581             //
19582             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
19583 
19584             // V - non-surrogate words
19585             //     V = not surrogates_wordmask
19586             const uint32_t V = ~surrogates_bitmask;
19587 
19588             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
19589             const auto    vH = (in & v_fc) == v_dc;
19590             const uint32_t H = vH.to_bitmask();
19591 
19592             // L - word mask for low surrogates
19593             //     L = not H and surrogates_wordmask
19594             const uint32_t L = ~H & surrogates_bitmask;
19595 
19596             const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
19597                                               // (A low surrogate placed in the 7th register's word
19598                                               // is an exception we handle.)
19599             const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
19600                                               // thanks to that we have only two masks for valid case.
19601             const uint32_t c = V | a | b;     // Combine all the masks into the final one.
19602 
19603             if (c == 0xffffffff) {
19604                 // The whole input register contains valid UTF-16, i.e.,
19605                 // either single words or proper surrogate pairs.
19606                 input += simd16<uint16_t>::ELEMENTS * 2;
19607             } else if (c == 0x7fffffff) {
19608                 // The 31 lower words of the input register contains valid UTF-16.
19609                 // The 31 word may be either a low or high surrogate. It the next
19610                 // iteration we 1) check if the low surrogate is followed by a high
19611                 // one, 2) reject sole high surrogate.
19612                 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
19613             } else {
19614                 return nullptr;
19615             }
19616         }
19617     }
19618 
19619     return input;
19620 }
19621 
19622 
19623 template <endianness big_endian>
avx2_validate_utf16_with_errors(const char16_t * input,size_t size)19624 const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
19625     const char16_t* start = input;
19626     const char16_t* end = input + size;
19627 
19628     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
19629     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
19630     const auto v_fc = simd8<uint8_t>::splat(0xfc);
19631     const auto v_dc = simd8<uint8_t>::splat(0xdc);
19632 
19633     while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
19634         // 0. Load data: since the validation takes into account only higher
19635         //    byte of each word, we compress the two vectors into one which
19636         //    consists only the higher bytes.
19637         auto in0 = simd16<uint16_t>(input);
19638         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
19639 
19640         if (big_endian) {
19641             in0 = in0.swap_bytes();
19642             in1 = in1.swap_bytes();
19643         }
19644 
19645         const auto t0 = in0.shr<8>();
19646         const auto t1 = in1.shr<8>();
19647 
19648         const auto in = simd16<uint16_t>::pack(t0, t1);
19649 
19650         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
19651         const auto surrogates_wordmask = (in & v_f8) == v_d8;
19652         const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
19653         if (surrogates_bitmask == 0x0) {
19654             input += simd16<uint16_t>::ELEMENTS * 2;
19655         } else {
19656             // 2. We have some surrogates that have to be distinguished:
19657             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
19658             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
19659             //
19660             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
19661 
19662             // V - non-surrogate words
19663             //     V = not surrogates_wordmask
19664             const uint32_t V = ~surrogates_bitmask;
19665 
19666             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
19667             const auto    vH = (in & v_fc) == v_dc;
19668             const uint32_t H = vH.to_bitmask();
19669 
19670             // L - word mask for low surrogates
19671             //     L = not H and surrogates_wordmask
19672             const uint32_t L = ~H & surrogates_bitmask;
19673 
19674             const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
19675                                               // (A low surrogate placed in the 7th register's word
19676                                               // is an exception we handle.)
19677             const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
19678                                               // thanks to that we have only two masks for valid case.
19679             const uint32_t c = V | a | b;     // Combine all the masks into the final one.
19680 
19681             if (c == 0xffffffff) {
19682                 // The whole input register contains valid UTF-16, i.e.,
19683                 // either single words or proper surrogate pairs.
19684                 input += simd16<uint16_t>::ELEMENTS * 2;
19685             } else if (c == 0x7fffffff) {
19686                 // The 31 lower words of the input register contains valid UTF-16.
19687                 // The 31 word may be either a low or high surrogate. It the next
19688                 // iteration we 1) check if the low surrogate is followed by a high
19689                 // one, 2) reject sole high surrogate.
19690                 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
19691             } else {
19692                 return result(error_code::SURROGATE, input - start);
19693             }
19694         }
19695     }
19696 
19697     return result(error_code::SUCCESS, input - start);
19698 }
19699 /* end file src/haswell/avx2_validate_utf16.cpp */
19700 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp
19701 /* begin file src/haswell/avx2_validate_utf32le.cpp */
19702 /* Returns:
19703    - pointer to the last unprocessed character (a scalar fallback should check the rest);
19704    - nullptr if an error was detected.
19705 */
avx2_validate_utf32le(const char32_t * input,size_t size)19706 const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
19707     const char32_t* end = input + size;
19708 
19709     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
19710     const __m256i offset = _mm256_set1_epi32(0xffff2000);
19711     const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
19712     __m256i currentmax = _mm256_setzero_si256();
19713     __m256i currentoffsetmax = _mm256_setzero_si256();
19714 
19715     while (input + 8 < end) {
19716         const __m256i in = _mm256_loadu_si256((__m256i *)input);
19717         currentmax = _mm256_max_epu32(in,currentmax);
19718         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
19719         input += 8;
19720     }
19721     __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
19722     if(_mm256_testz_si256(is_zero, is_zero) == 0) {
19723         return nullptr;
19724     }
19725 
19726     is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
19727     if(_mm256_testz_si256(is_zero, is_zero) == 0) {
19728         return nullptr;
19729     }
19730 
19731     return input;
19732 }
19733 
19734 
avx2_validate_utf32le_with_errors(const char32_t * input,size_t size)19735 const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
19736     const char32_t* start = input;
19737     const char32_t* end = input + size;
19738 
19739     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
19740     const __m256i offset = _mm256_set1_epi32(0xffff2000);
19741     const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
19742     __m256i currentmax = _mm256_setzero_si256();
19743     __m256i currentoffsetmax = _mm256_setzero_si256();
19744 
19745     while (input + 8 < end) {
19746         const __m256i in = _mm256_loadu_si256((__m256i *)input);
19747         currentmax = _mm256_max_epu32(in,currentmax);
19748         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
19749 
19750         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
19751         if(_mm256_testz_si256(is_zero, is_zero) == 0) {
19752             return result(error_code::TOO_LARGE, input - start);
19753         }
19754 
19755         is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
19756         if(_mm256_testz_si256(is_zero, is_zero) == 0) {
19757             return result(error_code::SURROGATE, input - start);
19758         }
19759         input += 8;
19760     }
19761 
19762     return result(error_code::SUCCESS, input - start);
19763 }
19764 /* end file src/haswell/avx2_validate_utf32le.cpp */
19765 
19766 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp
19767 /* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
19768 // depends on "tables/utf8_to_utf16_tables.h"
19769 
19770 
19771 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
19772 // end of the code points. Only the least significant 12 bits of the mask
19773 // are accessed.
19774 // It returns how many bytes were consumed (up to 12).
19775 template <endianness big_endian>
convert_masked_utf8_to_utf16(const char * input,uint64_t utf8_end_of_code_point_mask,char16_t * & utf16_output)19776 size_t convert_masked_utf8_to_utf16(const char *input,
19777                            uint64_t utf8_end_of_code_point_mask,
19778                            char16_t *&utf16_output) {
19779   // we use an approach where we try to process up to 12 input bytes.
19780   // Why 12 input bytes and not 16? Because we are concerned with the size of
19781   // the lookup tables. Also 12 is nicely divisible by two and three.
19782   //
19783   //
19784   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
19785   // beneficial to have fast paths that depend on branch prediction but have less latency.
19786   // This results in more instructions but, potentially, also higher speeds.
19787   //
19788   // We first try a few fast paths.
19789   const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
19790   const __m128i in = _mm_loadu_si128((__m128i *)input);
19791   const uint16_t input_utf8_end_of_code_point_mask =
19792       utf8_end_of_code_point_mask & 0xfff;
19793   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
19794     // We process the data in chunks of 16 bytes.
19795     __m256i ascii = _mm256_cvtepu8_epi16(in);
19796     if (big_endian) {
19797       const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
19798                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
19799       ascii = _mm256_shuffle_epi8(ascii, swap256);
19800     }
19801     _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
19802     utf16_output += 16; // We wrote 16 16-bit characters.
19803     return 16; // We consumed 16 bytes.
19804   }
19805   if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
19806     // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
19807     // There is probably a more efficient sequence, but the following might do.
19808     const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
19809     const __m128i perm = _mm_shuffle_epi8(in, sh);
19810     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
19811     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
19812     __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
19813     if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
19814     _mm_storeu_si128((__m128i *)utf16_output, composed);
19815     utf16_output += 8; // We wrote 16 bytes, 8 code points.
19816     return 16;
19817   }
19818   if(input_utf8_end_of_code_point_mask == 0x924) {
19819     // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
19820     // There is probably a more efficient sequence, but the following might do.
19821     const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
19822     const __m128i perm = _mm_shuffle_epi8(in, sh);
19823     const __m128i ascii =
19824         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
19825     const __m128i middlebyte =
19826         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
19827     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
19828     const __m128i highbyte =
19829         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
19830     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
19831     const __m128i composed =
19832         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
19833     __m128i composed_repacked = _mm_packus_epi32(composed, composed);
19834     if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
19835     _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
19836     utf16_output += 4;
19837     return 12;
19838   }
19839 
19840   const uint8_t idx =
19841       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
19842   const uint8_t consumed =
19843       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
19844   if (idx < 64) {
19845     // SIX (6) input code-words
19846     // this is a relatively easy scenario
19847     // we process SIX (6) input code-words. The max length in bytes of six code
19848     // words spanning between 1 and 2 bytes each is 12 bytes. On processors
19849     // where pdep/pext is fast, we might be able to use a small lookup table.
19850     const __m128i sh =
19851         _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
19852     const __m128i perm = _mm_shuffle_epi8(in, sh);
19853     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
19854     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
19855     __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
19856     if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
19857     _mm_storeu_si128((__m128i *)utf16_output, composed);
19858     utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes.
19859   } else if (idx < 145) {
19860     // FOUR (4) input code-words
19861     const __m128i sh =
19862         _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
19863     const __m128i perm = _mm_shuffle_epi8(in, sh);
19864     const __m128i ascii =
19865         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
19866     const __m128i middlebyte =
19867         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
19868     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
19869     const __m128i highbyte =
19870         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
19871     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
19872     const __m128i composed =
19873         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
19874     __m128i composed_repacked = _mm_packus_epi32(composed, composed);
19875     if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
19876     _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
19877     utf16_output += 4; // Here we overflow by 8 bytes.
19878   } else if (idx < 209) {
19879     // TWO (2) input code-words
19880     //////////////
19881     // There might be garbage inputs where a leading byte mascarades as a four-byte
19882     // leading byte (by being followed by 3 continuation byte), but is not greater than
19883     // 0xf0. This could trigger a buffer overflow if we only counted leading
19884     // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
19885     // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
19886     // We do as at the cost of an extra mask.
19887     /////////////
19888     const __m128i sh =
19889         _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
19890     const __m128i perm = _mm_shuffle_epi8(in, sh);
19891     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
19892     const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
19893     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
19894     __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
19895     // correct for spurious high bit
19896     const __m128i correct =
19897         _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
19898     middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
19899     const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
19900     // We deliberately carry the leading four bits in highbyte if they are present,
19901     // we remove them later when computing hightenbits.
19902     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
19903     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
19904     // When we need to generate a surrogate pair (leading byte > 0xF0), then
19905     // the corresponding 32-bit value in 'composed'  will be greater than
19906     // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
19907     // location of the surrogate pairs.
19908     const __m128i composed =
19909         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
19910                      _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
19911     const __m128i composedminus =
19912         _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
19913     const __m128i lowtenbits =
19914         _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
19915     // Notice the 0x3ff mask:
19916     const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
19917     const __m128i lowtenbitsadd =
19918         _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
19919     const __m128i hightenbitsadd =
19920         _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
19921     const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
19922     __m128i surrogates =
19923         _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
19924     uint32_t basic_buffer[4];
19925     uint32_t basic_buffer_swap[4];
19926     if (big_endian) {
19927       _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
19928       surrogates = _mm_shuffle_epi8(surrogates, swap);
19929     }
19930     _mm_storeu_si128((__m128i *)basic_buffer, composed);
19931     uint32_t surrogate_buffer[4];
19932     _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
19933     for (size_t i = 0; i < 3; i++) {
19934       if(basic_buffer[i] > 0x3c00000) {
19935         utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
19936         utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
19937         utf16_output += 2;
19938       } else  {
19939         utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
19940         utf16_output++;
19941       }
19942     }
19943   } else {
19944     // here we know that there is an error but we do not handle errors
19945   }
19946   return consumed;
19947 }
19948 /* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
19949 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp
19950 /* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
19951 // depends on "tables/utf8_to_utf16_tables.h"
19952 
19953 
19954 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
19955 // end of the code points. Only the least significant 12 bits of the mask
19956 // are accessed.
19957 // It returns how many bytes were consumed (up to 12).
convert_masked_utf8_to_utf32(const char * input,uint64_t utf8_end_of_code_point_mask,char32_t * & utf32_output)19958 size_t convert_masked_utf8_to_utf32(const char *input,
19959                            uint64_t utf8_end_of_code_point_mask,
19960                            char32_t *&utf32_output) {
19961   // we use an approach where we try to process up to 12 input bytes.
19962   // Why 12 input bytes and not 16? Because we are concerned with the size of
19963   // the lookup tables. Also 12 is nicely divisible by two and three.
19964   //
19965   //
19966   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
19967   // beneficial to have fast paths that depend on branch prediction but have less latency.
19968   // This results in more instructions but, potentially, also higher speeds.
19969   //
19970   // We first try a few fast paths.
19971   const __m128i in = _mm_loadu_si128((__m128i *)input);
19972   const uint16_t input_utf8_end_of_code_point_mask =
19973       utf8_end_of_code_point_mask & 0xfff;
19974   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
19975     // We process the data in chunks of 16 bytes.
19976     _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
19977     _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
19978     utf32_output += 16; // We wrote 16 32-bit characters.
19979     return 16; // We consumed 16 bytes.
19980   }
19981   if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
19982     // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
19983     // There is probably a more efficient sequence, but the following might do.
19984     const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
19985     const __m128i perm = _mm_shuffle_epi8(in, sh);
19986     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
19987     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
19988     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
19989     _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
19990     utf32_output += 8; // We wrote 16 bytes, 8 code points.
19991     return 16;
19992   }
19993   if(input_utf8_end_of_code_point_mask == 0x924) {
19994     // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
19995     // There is probably a more efficient sequence, but the following might do.
19996     const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
19997     const __m128i perm = _mm_shuffle_epi8(in, sh);
19998     const __m128i ascii =
19999         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
20000     const __m128i middlebyte =
20001         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
20002     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
20003     const __m128i highbyte =
20004         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
20005     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
20006     const __m128i composed =
20007         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
20008     _mm_storeu_si128((__m128i *)utf32_output, composed);
20009     utf32_output += 4;
20010     return 12;
20011   }
20012   /// We do not have a fast path available, so we fallback.
20013 
20014   const uint8_t idx =
20015       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
20016   const uint8_t consumed =
20017       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
20018   if (idx < 64) {
20019     // SIX (6) input code-words
20020     // this is a relatively easy scenario
20021     // we process SIX (6) input code-words. The max length in bytes of six code
20022     // words spanning between 1 and 2 bytes each is 12 bytes. On processors
20023     // where pdep/pext is fast, we might be able to use a small lookup table.
20024     const __m128i sh =
20025         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
20026     const __m128i perm = _mm_shuffle_epi8(in, sh);
20027     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
20028     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
20029     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
20030     _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
20031     utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
20032     // overflow of 32 - 24 = 8 bytes.
20033   } else if (idx < 145) {
20034     // FOUR (4) input code-words
20035     const __m128i sh =
20036         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
20037     const __m128i perm = _mm_shuffle_epi8(in, sh);
20038     const __m128i ascii =
20039         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
20040     const __m128i middlebyte =
20041         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
20042     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
20043     const __m128i highbyte =
20044         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
20045     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
20046     const __m128i composed =
20047         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
20048     _mm_storeu_si128((__m128i *)utf32_output, composed);
20049     utf32_output += 4;
20050   } else if (idx < 209) {
20051     // TWO (2) input code-words
20052     const __m128i sh =
20053         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
20054     const __m128i perm = _mm_shuffle_epi8(in, sh);
20055     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
20056     const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
20057     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
20058     __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
20059     // correct for spurious high bit
20060     const __m128i correct =
20061         _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
20062     middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
20063     const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
20064     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
20065     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
20066     const __m128i composed =
20067         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
20068                      _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
20069     _mm_storeu_si128((__m128i *)utf32_output, composed);
20070     utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
20071   } else {
20072     // here we know that there is an error but we do not handle errors
20073   }
20074   return consumed;
20075 }
20076 /* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
20077 
20078 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
20079 /* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
20080 /*
20081     The vectorized algorithm works on a single 256-bit AVX2 register, i.e.,
20082     it loads sixteen 16-bit words (handled below in 128-bit SSE-sized pieces).
20083 
20084     We consider three cases:
20085     1. an input register contains no surrogates and each value
20086        is in range 0x0000 .. 0x07ff.
20087     2. an input register contains no surrogates and values are
20088        in range 0x0000 .. 0xffff.
20089     3. an input register contains surrogates --- i.e. codepoints
20090        can have 16 or 32 bits.
20091 
20092     Ad 1.
20093 
20094     When values are less than 0x0800, it means that each 16-bit word
20095     can be converted into: 1) a single UTF8 byte (when it's an ASCII
20096     char) or 2) two UTF8 bytes.
20097 
20098     For this case we do only some shuffle to obtain these 2-byte
20099     codes and finally compress the whole SSE register with a single
20100     shuffle.
20101 
20102     We need 256-entry lookup table to get a compression pattern
20103     and the number of output bytes in the compressed vector register.
20104     Each entry occupies 17 bytes.
20105 
20106     Ad 2.
20107 
20108     When values fit in 16-bit words, but are above 0x07ff, then
20109     a single word may produce one, two or three UTF8 bytes.
20110 
20111     We prepare data for all these three cases in two registers.
20112     The first register contains lower two UTF8 bytes (used in all
20113     cases), while the second one contains just the third byte for
20114     the three-UTF8-bytes case.
20115 
20116     Finally these two registers are interleaved forming eight-element
20117     array of 32-bit values. The array spans two SSE registers.
20118     The bytes from the registers are compressed using two shuffles.
20119 
20120     We need 256-entry lookup table to get a compression pattern
20121     and the number of output bytes in the compressed vector register.
20122     Each entry occupies 17 bytes.
20123 
20124 
20125     To summarize:
20126     - We need two 256-entry tables that have 8704 bytes in total.
20127 */
20128 
20129 
20130 /*
        Converts UTF-16 code units to UTF-8 with AVX2, sixteen code units (one
        256-bit load) per loop iteration.

        Template parameter:
          big_endian  - when set, the two bytes of every code unit are exchanged
                        right after loading (vector shuffle in the main loop,
                        scalar::utf16::swap_bytes in the scalar fallback).

        Parameters:
          buf         - first UTF-16 code unit to convert.
          len         - number of 16-bit code units available at buf.
          utf8_output - where the UTF-8 bytes are written. The vector paths store
                        whole 16-byte blocks and then advance by the number of
                        bytes actually produced, so bytes past the returned
                        output position may have been written.
                        NOTE(review): the slack the caller must provide is not
                        visible here -- confirm against the callers.

20131   Returns a pair: the first unprocessed code unit of buf and the next free
        position in utf8_output. If a surrogate code unit is not part of a valid
        high/low pair, the first member is nullptr instead.
20132   The loop stops as soon as fewer than 16 + safety_margin code units remain;
        a scalar routine should carry on the conversion of the tail.
20133 */
20134 template <endianness big_endian>
avx2_convert_utf16_to_utf8(const char16_t * buf,size_t len,char * utf8_output)20135 std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
20136   const char16_t* end = buf + len;
        // Constants replicated into all sixteen 16-bit lanes.
20137   const __m256i v_0000 = _mm256_setzero_si256();
20138   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
20139   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
20140   const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
20141   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
20142 
        // Iterate only while a full 16-unit block plus the margin is still available.
20143   while (buf + 16 + safety_margin <= end) {
20144     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20145     if (big_endian) {
            // Exchange the two bytes of every 16-bit unit. _mm256_shuffle_epi8 works on
            // each 128-bit half separately, so indices 16..31 address the upper half.
20146       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
20147                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
20148       in = _mm256_shuffle_epi8(in, swap);
20149     }
20150     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
20151     const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
          // testz is nonzero when (in & v_ff80) has no bit set, i.e. every unit is below 0x80.
20152     if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
20153         // 1. pack the bytes
              //    (lower 128-bit half first, then the upper half, so input order is kept)
20154         const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
20155         // 2. store (16 bytes)
20156         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
20157         // 3. adjust pointers
20158         buf += 16;
20159         utf8_output += 16;
20160         continue; // we are done for this round!
20161     }
20162     // no bits set above 7th bit
          // cmpeq_epi16 fills whole 16-bit lanes, so the byte movemask below carries
          // two identical bits per code unit (32 bits for 16 units).
20163     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
20164     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
20165 
20166     // no bits set above 11th bit
20167     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
20168     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
          // All 32 mask bits set: every unit is below 0x0800 and needs at most two bytes.
20169     if (one_or_two_bytes_bitmask == 0xffffffff) {
20170 
20171           // 1. prepare 2-byte values
20172           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
20173           // expected output   : [110a|aaaa|10bb|bbbb] x 16
20174           const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
20175           const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
20176 
20177           // t0 = [000a|aaaa|bbbb|bb00]
20178           const __m256i t0 = _mm256_slli_epi16(in, 2);
20179           // t1 = [000a|aaaa|0000|0000]
20180           const __m256i t1 = _mm256_and_si256(t0, v_1f00);
20181           // t2 = [0000|0000|00bb|bbbb]
20182           const __m256i t2 = _mm256_and_si256(in, v_003f);
20183           // t3 = [000a|aaaa|00bb|bbbb]
20184           const __m256i t3 = _mm256_or_si256(t1, t2);
20185           // t4 = [110a|aaaa|10bb|bbbb]
20186           const __m256i t4 = _mm256_or_si256(t3, v_c080);
20187 
20188           // 2. merge ASCII and 2-byte codewords
                //    (lanes flagged in one_byte_bytemask keep the original ASCII value)
20189           const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
20190 
20191           // 3. prepare bitmask for 8-bit lookup
                //    M0 keeps one bit per unit (the even bit positions).
                //    M1 moves the bits of units 4..7 and 12..15 onto the odd positions of
                //    the bytes that hold units 0..3 and 8..11.
                //    M2 thus has the key for the lower 128-bit half in bits 0..7 and the
                //    key for the upper half in bits 16..23.
20192           const uint32_t M0 = one_byte_bitmask & 0x55555555;
20193           const uint32_t M1 = M0 >> 7;
20194           const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
20195           // 4. pack the bytes
20196 
                // Table row layout as used below: row[0] is the number of output bytes,
                // row + 1 is the 16-byte shuffle pattern.
20197           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
20198           const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
20199 
20200           const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
20201           const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
20202 
                // One pattern per 128-bit half, since the 256-bit byte shuffle never crosses halves.
20203           const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
20204           // 5. store bytes
                //    Each store writes a full 16 bytes, but the output pointer only moves
                //    by the byte count from the table, so the second store starts right
                //    after the valid bytes of the first one.
20205           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
20206           utf8_output += row[0];
20207           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
20208           utf8_output += row_2[0];
20209 
20210           // 6. adjust pointers
20211           buf += 16;
20212           continue;
20213     }
20214     // 1. Check if there is any surrogate word in the input chunk.
20215     //    We also have to deal with the situation where there is a surrogate
20216     //    word at the end of a chunk.
          //    (in & 0xf800) == 0xd800 matches every unit in 0xD800..0xDFFF.
20217     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
20218 
20219     // bitmask = 0x00000000 if there are no surrogates
20220     //         = 0xc0000000 if only the last word is a surrogate
20221     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
20222     // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
20223     // it is likely an uncommon occurrence.
20224     if (surrogates_bitmask == 0x00000000) {
20225       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
              // Shuffle pattern that copies the low byte of each 16-bit unit into both of
              // its bytes (byte indices 0,0, 2,2, ... within each 128-bit half).
20226         const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
20227                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
20228                                                 0x0000, 0x0202, 0x0404, 0x0606,
20229                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
20230 
20231         /* In this branch we handle three cases:
20232            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
20233            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
20234            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
20235 
20236           We expand the input word (16-bit) into two words (32-bit), thus
20237           we have room for four bytes. However, we need five distinct bit
20238           layouts. Note that the last byte in cases #2 and #3 is the same.
20239 
20240           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
20241           in register t2.
20242 
20243           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
20244           either byte 1 for case #2 or byte 2 for case #3. Note that they
20245           differ by exactly one bit.
20246 
20247           Finally from these two words we build proper UTF-8 sequence, taking
20248           into account the case (i.e, the number of bytes to write).
20249         */
20250         /**
20251          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
20252          * t2 => [0ccc|cccc] [10cc|cccc]
20253          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
20254          */
20255 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
20256         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
20257         const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
20258         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
20259         const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
20260         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
20261         const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
20262 
20263         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
20264         const __m256i s0 = _mm256_srli_epi16(in, 4);
20265         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
20266         const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
20267         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
              // (per 16-bit lane maddubs computes low_byte * 0x40 + high_byte * 0x01, which
              //  shifts bbbbbb00 into the high byte and drops aaaa into the low byte)
20268         const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
20269         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
20270         const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
              // m0 has bit 14 set only in lanes that need three bytes; the xor turns their
              // [11bb|bbbb] into the continuation byte [10bb|bbbb] and leaves the others alone.
20271         const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
20272         const __m256i s4 = _mm256_xor_si256(s3, m0);
20273 #undef simdutf_vec
20274 
20275         // 4. expand words 16-bit => 32-bit
              //    The unpack works per 128-bit half: out0 receives units 0..3 and 8..11,
              //    out1 receives units 4..7 and 12..15.
20276         const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
20277         const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
20278 
20279         // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
              //    Two bits per unit: the even bit says "one byte", the odd bit says
              //    "one or two bytes". Each table key below covers four units (8 bits).
20280         const uint32_t mask = (one_byte_bitmask & 0x55555555) |
20281                               (one_or_two_bytes_bitmask & 0xaaaaaaaa);
20282         // Due to the wider registers, the following path is less likely to be useful.
20283         /*if(mask == 0) {
20284           // We only have three-byte words. Use fast path.
20285           const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
20286           const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
20287           const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
20288           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
20289           utf8_output += 12;
20290           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
20291           utf8_output += 12;
20292           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
20293           utf8_output += 12;
20294           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
20295           utf8_output += 12;
20296           buf += 16;
20297           continue;
20298         }*/
              // Units 0..3: lower half of out0, key = mask bits 0..7.
20299         const uint8_t mask0 = uint8_t(mask);
20300         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
20301         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
20302         const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
20303 
              // Units 4..7: lower half of out1, key = mask bits 8..15.
20304         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
20305         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
20306         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
20307         const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
20308 
              // Units 8..11: upper half of out0, key = mask bits 16..23.
20309         const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
20310         const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
20311         const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
20312         const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
20313 
20314 
              // Units 12..15: upper half of out1, key = mask bits 24..31.
20315         const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
20316         const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
20317         const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
20318         const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
20319 
              // Four 16-byte stores in input order; after each one the output pointer moves
              // only by that row's byte count (rowN[0]), so the next store begins right
              // after the valid bytes.
20320         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
20321         utf8_output += row0[0];
20322         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
20323         utf8_output += row1[0];
20324         _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
20325         utf8_output += row2[0];
20326         _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
20327         utf8_output += row3[0];
20328         buf += 16;
20329     // surrogate pair(s) in a register
20330     } else {
20331       // Let us do a scalar fallback.
20332       // It may seem wasteful to use scalar code, but being efficient with SIMD
20333       // in the presence of surrogate pairs may require non-trivial tables.
            // The loop handles 15 code units; a surrogate pair that starts at index 14
            // also consumes index 15, so buf advances by 15 or 16.
20334       size_t forward = 15;
20335       size_t k = 0;
            // NOTE(review): the while condition guarantees at least 16 + safety_margin
            // remaining code units, so this clamp does not appear to trigger here -- confirm.
20336       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20337       for(; k < forward; k++) {
20338         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
20339         if((word & 0xFF80)==0) {
                // below 0x80: one byte
20340           *utf8_output++ = char(word);
20341         } else if((word & 0xF800)==0) {
                // below 0x0800: two bytes
20342           *utf8_output++ = char((word>>6) | 0b11000000);
20343           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20344         } else if((word &0xF800 ) != 0xD800) {
                // not in 0xD800..0xDFFF: three bytes
20345           *utf8_output++ = char((word>>12) | 0b11100000);
20346           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20347           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20348         } else {
20349           // must be a surrogate pair
20350           uint16_t diff = uint16_t(word - 0xD800);
20351           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
20352           k++;
20353           uint16_t diff2 = uint16_t(next_word - 0xDC00);
                // Valid only if word is in 0xD800..0xDBFF (diff <= 0x3FF) and next_word is in
                // 0xDC00..0xDFFF (diff2 <= 0x3FF); otherwise signal the error with nullptr.
20354           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
                // Combine the two 10-bit halves into the code point and emit four bytes.
20355           uint32_t value = (diff << 10) + diff2 + 0x10000;
20356           *utf8_output++ = char((value>>18) | 0b11110000);
20357           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
20358           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
20359           *utf8_output++ = char((value & 0b111111) | 0b10000000);
20360         }
20361       }
20362       buf += k;
20363     }
20364   } // while
20365   return std::make_pair(buf, utf8_output);
20366 }
20367 
20368 
20369 /*
        Converts UTF-16 code units to UTF-8 with AVX2, sixteen code units (one
        256-bit load) per loop iteration, and reports where an invalid surrogate
        was found.

        Template parameter:
          big_endian  - when set, the two bytes of every code unit are exchanged
                        right after loading (vector shuffle in the main loop,
                        scalar::utf16::swap_bytes in the scalar fallback).

        Parameters:
          buf         - first UTF-16 code unit to convert.
          len         - number of 16-bit code units available at buf.
          utf8_output - where the UTF-8 bytes are written. The vector paths store
                        whole 16-byte blocks and then advance by the number of
                        bytes actually produced, so bytes past the returned
                        output position may have been written.
                        NOTE(review): the slack the caller must provide is not
                        visible here -- confirm against the callers.

20370   Returns a pair: a result struct and utf8_output.
20371   If there is an error, the count field of the result is the position of the error
        (the index, relative to the initial buf, of the first code unit of the
        offending surrogate pair); utf8_output then points just past the bytes
        written for the code units before it.
20372   Otherwise, it is the position of the first unprocessed code unit in buf (even if finished).
20373   A scalar routine should carry on the conversion of the tail if needed.
20374 */
20375 template <endianness big_endian>
avx2_convert_utf16_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output)20376 std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
        // start is kept so that positions can be reported relative to the initial buf.
20377   const char16_t* start = buf;
20378   const char16_t* end = buf + len;
20379 
        // Constants replicated into all sixteen 16-bit lanes.
20380   const __m256i v_0000 = _mm256_setzero_si256();
20381   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
20382   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
20383   const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
20384   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
20385 
        // Iterate only while a full 16-unit block plus the margin is still available.
20386   while (buf + 16 + safety_margin <= end) {
20387     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20388     if (big_endian) {
            // Exchange the two bytes of every 16-bit unit. _mm256_shuffle_epi8 works on
            // each 128-bit half separately, so indices 16..31 address the upper half.
20389       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
20390                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
20391       in = _mm256_shuffle_epi8(in, swap);
20392     }
20393     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
20394     const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
          // testz is nonzero when (in & v_ff80) has no bit set, i.e. every unit is below 0x80.
20395     if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
20396         // 1. pack the bytes
              //    (lower 128-bit half first, then the upper half, so input order is kept)
20397         const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
20398         // 2. store (16 bytes)
20399         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
20400         // 3. adjust pointers
20401         buf += 16;
20402         utf8_output += 16;
20403         continue; // we are done for this round!
20404     }
20405     // no bits set above 7th bit
          // cmpeq_epi16 fills whole 16-bit lanes, so the byte movemask below carries
          // two identical bits per code unit (32 bits for 16 units).
20406     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
20407     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
20408 
20409     // no bits set above 11th bit
20410     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
20411     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
          // All 32 mask bits set: every unit is below 0x0800 and needs at most two bytes.
20412     if (one_or_two_bytes_bitmask == 0xffffffff) {
20413 
20414           // 1. prepare 2-byte values
20415           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
20416           // expected output   : [110a|aaaa|10bb|bbbb] x 16
20417           const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
20418           const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
20419 
20420           // t0 = [000a|aaaa|bbbb|bb00]
20421           const __m256i t0 = _mm256_slli_epi16(in, 2);
20422           // t1 = [000a|aaaa|0000|0000]
20423           const __m256i t1 = _mm256_and_si256(t0, v_1f00);
20424           // t2 = [0000|0000|00bb|bbbb]
20425           const __m256i t2 = _mm256_and_si256(in, v_003f);
20426           // t3 = [000a|aaaa|00bb|bbbb]
20427           const __m256i t3 = _mm256_or_si256(t1, t2);
20428           // t4 = [110a|aaaa|10bb|bbbb]
20429           const __m256i t4 = _mm256_or_si256(t3, v_c080);
20430 
20431           // 2. merge ASCII and 2-byte codewords
                //    (lanes flagged in one_byte_bytemask keep the original ASCII value)
20432           const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
20433 
20434           // 3. prepare bitmask for 8-bit lookup
                //    M0 keeps one bit per unit (the even bit positions).
                //    M1 moves the bits of units 4..7 and 12..15 onto the odd positions of
                //    the bytes that hold units 0..3 and 8..11.
                //    M2 thus has the key for the lower 128-bit half in bits 0..7 and the
                //    key for the upper half in bits 16..23.
20435           const uint32_t M0 = one_byte_bitmask & 0x55555555;
20436           const uint32_t M1 = M0 >> 7;
20437           const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
20438           // 4. pack the bytes
20439 
                // Table row layout as used below: row[0] is the number of output bytes,
                // row + 1 is the 16-byte shuffle pattern.
20440           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
20441           const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
20442 
20443           const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
20444           const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
20445 
                // One pattern per 128-bit half, since the 256-bit byte shuffle never crosses halves.
20446           const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
20447           // 5. store bytes
                //    Each store writes a full 16 bytes, but the output pointer only moves
                //    by the byte count from the table, so the second store starts right
                //    after the valid bytes of the first one.
20448           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
20449           utf8_output += row[0];
20450           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
20451           utf8_output += row_2[0];
20452 
20453           // 6. adjust pointers
20454           buf += 16;
20455           continue;
20456     }
20457     // 1. Check if there is any surrogate word in the input chunk.
20458     //    We also have to deal with the situation where there is a surrogate
20459     //    word at the end of a chunk.
          //    (in & 0xf800) == 0xd800 matches every unit in 0xD800..0xDFFF.
20460     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
20461 
20462     // bitmask = 0x00000000 if there are no surrogates
20463     //         = 0xc0000000 if only the last word is a surrogate
20464     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
20465     // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
20466     // it is likely an uncommon occurrence.
20467     if (surrogates_bitmask == 0x00000000) {
20468       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
              // Shuffle pattern that copies the low byte of each 16-bit unit into both of
              // its bytes (byte indices 0,0, 2,2, ... within each 128-bit half).
20469         const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
20470                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
20471                                                 0x0000, 0x0202, 0x0404, 0x0606,
20472                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
20473 
20474         /* In this branch we handle three cases:
20475            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
20476            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
20477            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
20478 
20479           We expand the input word (16-bit) into two words (32-bit), thus
20480           we have room for four bytes. However, we need five distinct bit
20481           layouts. Note that the last byte in cases #2 and #3 is the same.
20482 
20483           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
20484           in register t2.
20485 
20486           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
20487           either byte 1 for case #2 or byte 2 for case #3. Note that they
20488           differ by exactly one bit.
20489 
20490           Finally from these two words we build proper UTF-8 sequence, taking
20491           into account the case (i.e, the number of bytes to write).
20492         */
20493         /**
20494          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
20495          * t2 => [0ccc|cccc] [10cc|cccc]
20496          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
20497          */
20498 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
20499         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
20500         const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
20501         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
20502         const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
20503         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
20504         const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
20505 
20506         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
20507         const __m256i s0 = _mm256_srli_epi16(in, 4);
20508         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
20509         const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
20510         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
              // (per 16-bit lane maddubs computes low_byte * 0x40 + high_byte * 0x01, which
              //  shifts bbbbbb00 into the high byte and drops aaaa into the low byte)
20511         const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
20512         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
20513         const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
              // m0 has bit 14 set only in lanes that need three bytes; the xor turns their
              // [11bb|bbbb] into the continuation byte [10bb|bbbb] and leaves the others alone.
20514         const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
20515         const __m256i s4 = _mm256_xor_si256(s3, m0);
20516 #undef simdutf_vec
20517 
20518         // 4. expand words 16-bit => 32-bit
              //    The unpack works per 128-bit half: out0 receives units 0..3 and 8..11,
              //    out1 receives units 4..7 and 12..15.
20519         const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
20520         const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
20521 
20522         // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
              //    Two bits per unit: the even bit says "one byte", the odd bit says
              //    "one or two bytes". Each table key below covers four units (8 bits).
20523         const uint32_t mask = (one_byte_bitmask & 0x55555555) |
20524                               (one_or_two_bytes_bitmask & 0xaaaaaaaa);
20525         // Due to the wider registers, the following path is less likely to be useful.
20526         /*if(mask == 0) {
20527           // We only have three-byte words. Use fast path.
20528           const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
20529           const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
20530           const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
20531           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
20532           utf8_output += 12;
20533           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
20534           utf8_output += 12;
20535           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
20536           utf8_output += 12;
20537           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
20538           utf8_output += 12;
20539           buf += 16;
20540           continue;
20541         }*/
              // Units 0..3: lower half of out0, key = mask bits 0..7.
20542         const uint8_t mask0 = uint8_t(mask);
20543         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
20544         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
20545         const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
20546 
              // Units 4..7: lower half of out1, key = mask bits 8..15.
20547         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
20548         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
20549         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
20550         const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
20551 
              // Units 8..11: upper half of out0, key = mask bits 16..23.
20552         const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
20553         const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
20554         const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
20555         const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
20556 
20557 
              // Units 12..15: upper half of out1, key = mask bits 24..31.
20558         const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
20559         const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
20560         const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
20561         const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
20562 
              // Four 16-byte stores in input order; after each one the output pointer moves
              // only by that row's byte count (rowN[0]), so the next store begins right
              // after the valid bytes.
20563         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
20564         utf8_output += row0[0];
20565         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
20566         utf8_output += row1[0];
20567         _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
20568         utf8_output += row2[0];
20569         _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
20570         utf8_output += row3[0];
20571         buf += 16;
20572     // surrogate pair(s) in a register
20573     } else {
20574       // Let us do a scalar fallback.
20575       // It may seem wasteful to use scalar code, but being efficient with SIMD
20576       // in the presence of surrogate pairs may require non-trivial tables.
            // The loop handles 15 code units; a surrogate pair that starts at index 14
            // also consumes index 15, so buf advances by 15 or 16.
20577       size_t forward = 15;
20578       size_t k = 0;
            // NOTE(review): the while condition guarantees at least 16 + safety_margin
            // remaining code units, so this clamp does not appear to trigger here -- confirm.
20579       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20580       for(; k < forward; k++) {
20581         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
20582         if((word & 0xFF80)==0) {
                // below 0x80: one byte
20583           *utf8_output++ = char(word);
20584         } else if((word & 0xF800)==0) {
                // below 0x0800: two bytes
20585           *utf8_output++ = char((word>>6) | 0b11000000);
20586           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20587         } else if((word &0xF800 ) != 0xD800) {
                // not in 0xD800..0xDFFF: three bytes
20588           *utf8_output++ = char((word>>12) | 0b11100000);
20589           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20590           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20591         } else {
20592           // must be a surrogate pair
20593           uint16_t diff = uint16_t(word - 0xD800);
20594           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
20595           k++;
20596           uint16_t diff2 = uint16_t(next_word - 0xDC00);
                // Valid only if word is in 0xD800..0xDBFF (diff <= 0x3FF) and next_word is in
                // 0xDC00..0xDFFF (diff2 <= 0x3FF). k was already incremented, so k - 1 is the
                // index of the first unit of the pair; the position is relative to start.
20597           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
                // Combine the two 10-bit halves into the code point and emit four bytes.
20598           uint32_t value = (diff << 10) + diff2 + 0x10000;
20599           *utf8_output++ = char((value>>18) | 0b11110000);
20600           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
20601           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
20602           *utf8_output++ = char((value & 0b111111) | 0b10000000);
20603         }
20604       }
20605       buf += k;
20606     }
20607   } // while
        // No error found: report how many code units were consumed by the vector loop.
20608   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
20609 }
20610 /* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
20611 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp
20612 /* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
20613 /*
20614     The vectorized algorithm works on a single SSE register, i.e., it
20615     loads eight 16-bit words.
20616 
20617     We consider three cases:
20618     1. an input register contains no surrogates and each value
20619        is in range 0x0000 .. 0x07ff.
20620     2. an input register contains no surrogates and values are
20621        in range 0x0000 .. 0xffff.
20622     3. an input register contains surrogates --- i.e. codepoints
20623        can have 16 or 32 bits.
20624 
20625     Ad 1.
20626 
20627     When values are less than 0x0800, it means that each 16-bit word
20628     can be converted into: 1) a single UTF8 byte (when it's an ASCII
20629     char) or 2) two UTF8 bytes.
20630 
20631     For this case we only need a few shuffles to obtain these 2-byte
20632     codes, and finally we compress the whole SSE register with a single
20633     shuffle.
20634 
20635     We need 256-entry lookup table to get a compression pattern
20636     and the number of output bytes in the compressed vector register.
20637     Each entry occupies 17 bytes.
20638 
20639     Ad 2.
20640 
20641     When values fit in 16-bit words, but are above 0x07ff, then
20642     a single word may produce one, two or three UTF8 bytes.
20643 
20644     We prepare data for all these three cases in two registers.
20645     The first register contains lower two UTF8 bytes (used in all
20646     cases), while the second one contains just the third byte for
20647     the three-UTF8-bytes case.
20648 
20649     Finally these two registers are interleaved forming eight-element
20650     array of 32-bit values. The array spans two SSE registers.
20651     The bytes from the registers are compressed using two shuffles.
20652 
20653     We need 256-entry lookup table to get a compression pattern
20654     and the number of output bytes in the compressed vector register.
20655     Each entry occupies 17 bytes.
20656 
20657 
20658     To summarize:
20659     - We need two 256-entry tables that have 8704 bytes in total.
20660 */
20661 
20662 
20663 /*
20664   Returns a pair: the first unprocessed byte from buf and utf32_output
20665   A scalar routing should carry on the conversion of the tail.
20666 */
20667 template <endianness big_endian>
avx2_convert_utf16_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output)20668 std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
20669   const char16_t* end = buf + len;
20670   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
20671   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
20672 
20673   while (buf + 16 <= end) {
20674     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20675     if (big_endian) {
20676       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
20677                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
20678       in = _mm256_shuffle_epi8(in, swap);
20679     }
20680 
20681     // 1. Check if there are any surrogate word in the input chunk.
20682     //    We have also deal with situation when there is a surrogate word
20683     //    at the end of a chunk.
20684     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
20685 
20686     // bitmask = 0x0000 if there are no surrogates
20687     //         = 0xc000 if the last word is a surrogate
20688     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
20689     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
20690     // it is likely an uncommon occurrence.
20691     if (surrogates_bitmask == 0x00000000) {
20692       // case: we extend all sixteen 16-bit words to sixteen 32-bit words
20693         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
20694         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
20695         utf32_output += 16;
20696         buf += 16;
20697     // surrogate pair(s) in a register
20698     } else {
20699       // Let us do a scalar fallback.
20700       // It may seem wasteful to use scalar code, but being efficient with SIMD
20701       // in the presence of surrogate pairs may require non-trivial tables.
20702       size_t forward = 15;
20703       size_t k = 0;
20704       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20705       for(; k < forward; k++) {
20706         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
20707         if((word &0xF800 ) != 0xD800) {
20708           // No surrogate pair
20709           *utf32_output++ = char32_t(word);
20710         } else {
20711           // must be a surrogate pair
20712           uint16_t diff = uint16_t(word - 0xD800);
20713           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
20714           k++;
20715           uint16_t diff2 = uint16_t(next_word - 0xDC00);
20716           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
20717           uint32_t value = (diff << 10) + diff2 + 0x10000;
20718           *utf32_output++ = char32_t(value);
20719         }
20720       }
20721       buf += k;
20722     }
20723   } // while
20724   return std::make_pair(buf, utf32_output);
20725 }
20726 
20727 
20728 /*
20729   Returns a pair: a result struct and utf8_output.
20730   If there is an error, the count field of the result is the position of the error.
20731   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
20732   A scalar routing should carry on the conversion of the tail if needed.
20733 */
20734 template <endianness big_endian>
avx2_convert_utf16_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output)20735 std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
20736   const char16_t* start = buf;
20737   const char16_t* end = buf + len;
20738   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
20739   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
20740 
20741   while (buf + 16 <= end) {
20742     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20743     if (big_endian) {
20744       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
20745                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
20746       in = _mm256_shuffle_epi8(in, swap);
20747     }
20748 
20749     // 1. Check if there are any surrogate word in the input chunk.
20750     //    We have also deal with situation when there is a surrogate word
20751     //    at the end of a chunk.
20752     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
20753 
20754     // bitmask = 0x0000 if there are no surrogates
20755     //         = 0xc000 if the last word is a surrogate
20756     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
20757     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
20758     // it is likely an uncommon occurrence.
20759     if (surrogates_bitmask == 0x00000000) {
20760       // case: we extend all sixteen 16-bit words to sixteen 32-bit words
20761         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
20762         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
20763         utf32_output += 16;
20764         buf += 16;
20765     // surrogate pair(s) in a register
20766     } else {
20767       // Let us do a scalar fallback.
20768       // It may seem wasteful to use scalar code, but being efficient with SIMD
20769       // in the presence of surrogate pairs may require non-trivial tables.
20770       size_t forward = 15;
20771       size_t k = 0;
20772       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20773       for(; k < forward; k++) {
20774         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
20775         if((word &0xF800 ) != 0xD800) {
20776           // No surrogate pair
20777           *utf32_output++ = char32_t(word);
20778         } else {
20779           // must be a surrogate pair
20780           uint16_t diff = uint16_t(word - 0xD800);
20781           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
20782           k++;
20783           uint16_t diff2 = uint16_t(next_word - 0xDC00);
20784           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
20785           uint32_t value = (diff << 10) + diff2 + 0x10000;
20786           *utf32_output++ = char32_t(value);
20787         }
20788       }
20789       buf += k;
20790     }
20791   } // while
20792   return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
20793 }
20794 /* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
20795 
20796 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp
20797 /* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
avx2_convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output)20798 std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
20799   const char32_t* end = buf + len;
20800   const __m256i v_0000 = _mm256_setzero_si256();
20801   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
20802   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
20803   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
20804   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
20805   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
20806   __m256i running_max = _mm256_setzero_si256();
20807   __m256i forbidden_bytemask = _mm256_setzero_si256();
20808 
20809   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
20810 
20811   while (buf + 16 + safety_margin <= end) {
20812     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20813     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
20814     running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
20815 
20816     // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
20817     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
20818     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
20819 
20820     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
20821 
20822     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
20823       // 1. pack the bytes
20824       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
20825       // 2. store (16 bytes)
20826       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
20827       // 3. adjust pointers
20828       buf += 16;
20829       utf8_output += 16;
20830       continue; // we are done for this round!
20831     }
20832     // no bits set above 7th bit
20833     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
20834     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
20835 
20836     // no bits set above 11th bit
20837     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
20838     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
20839     if (one_or_two_bytes_bitmask == 0xffffffff) {
20840       // 1. prepare 2-byte values
20841       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
20842       // expected output   : [110a|aaaa|10bb|bbbb] x 8
20843       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
20844       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
20845 
20846       // t0 = [000a|aaaa|bbbb|bb00]
20847       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
20848       // t1 = [000a|aaaa|0000|0000]
20849       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
20850       // t2 = [0000|0000|00bb|bbbb]
20851       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
20852       // t3 = [000a|aaaa|00bb|bbbb]
20853       const __m256i t3 = _mm256_or_si256(t1, t2);
20854       // t4 = [110a|aaaa|10bb|bbbb]
20855       const __m256i t4 = _mm256_or_si256(t3, v_c080);
20856 
20857       // 2. merge ASCII and 2-byte codewords
20858       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
20859 
20860       // 3. prepare bitmask for 8-bit lookup
20861       const uint32_t M0 = one_byte_bitmask & 0x55555555;
20862       const uint32_t M1 = M0 >> 7;
20863       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
20864       // 4. pack the bytes
20865 
20866       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
20867       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
20868 
20869       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
20870       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
20871 
20872       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
20873       // 5. store bytes
20874       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
20875       utf8_output += row[0];
20876       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
20877       utf8_output += row_2[0];
20878 
20879       // 6. adjust pointers
20880       buf += 16;
20881       continue;
20882     }
20883     // Must check for overflow in packing
20884     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
20885     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
20886     if (saturation_bitmask == 0xffffffff) {
20887       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
20888       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
20889       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
20890 
20891       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
20892                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
20893                                               0x0000, 0x0202, 0x0404, 0x0606,
20894                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
20895 
20896       /* In this branch we handle three cases:
20897         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
20898         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
20899         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
20900 
20901         We expand the input word (16-bit) into two words (32-bit), thus
20902         we have room for four bytes. However, we need five distinct bit
20903         layouts. Note that the last byte in cases #2 and #3 is the same.
20904 
20905         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
20906         in register t2.
20907 
20908         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
20909         either byte 1 for case #2 or byte 2 for case #3. Note that they
20910         differ by exactly one bit.
20911 
20912         Finally from these two words we build proper UTF-8 sequence, taking
20913         into account the case (i.e, the number of bytes to write).
20914       */
20915       /**
20916        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
20917        * t2 => [0ccc|cccc] [10cc|cccc]
20918        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
20919        */
20920 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
20921       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
20922       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
20923       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
20924       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
20925       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
20926       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
20927 
20928       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
20929       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
20930       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
20931       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
20932       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
20933       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
20934       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
20935       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
20936       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
20937       const __m256i s4 = _mm256_xor_si256(s3, m0);
20938 #undef simdutf_vec
20939 
20940       // 4. expand words 16-bit => 32-bit
20941       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
20942       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
20943 
20944       // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
20945       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
20946                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
20947       // Due to the wider registers, the following path is less likely to be useful.
20948       /*if(mask == 0) {
20949         // We only have three-byte words. Use fast path.
20950         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
20951         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
20952         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
20953         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
20954         utf8_output += 12;
20955         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
20956         utf8_output += 12;
20957         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
20958         utf8_output += 12;
20959         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
20960         utf8_output += 12;
20961         buf += 16;
20962         continue;
20963       }*/
20964       const uint8_t mask0 = uint8_t(mask);
20965       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
20966       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
20967       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
20968 
20969       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
20970       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
20971       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
20972       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
20973 
20974       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
20975       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
20976       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
20977       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
20978 
20979 
20980       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
20981       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
20982       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
20983       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
20984 
20985       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
20986       utf8_output += row0[0];
20987       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
20988       utf8_output += row1[0];
20989       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
20990       utf8_output += row2[0];
20991       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
20992       utf8_output += row3[0];
20993       buf += 16;
20994     } else {
20995       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
20996       // Let us do a scalar fallback.
20997       // It may seem wasteful to use scalar code, but being efficient with SIMD
20998       // may require large, non-trivial tables?
20999       size_t forward = 15;
21000       size_t k = 0;
21001       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
21002       for(; k < forward; k++) {
21003         uint32_t word = buf[k];
21004         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
21005           *utf8_output++ = char(word);
21006         } else if((word & 0xFFFFF800)==0) { // 2-byte
21007           *utf8_output++ = char((word>>6) | 0b11000000);
21008           *utf8_output++ = char((word & 0b111111) | 0b10000000);
21009         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
21010           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
21011           *utf8_output++ = char((word>>12) | 0b11100000);
21012           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
21013           *utf8_output++ = char((word & 0b111111) | 0b10000000);
21014         } else {  // 4-byte
21015           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
21016           *utf8_output++ = char((word>>18) | 0b11110000);
21017           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
21018           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
21019           *utf8_output++ = char((word & 0b111111) | 0b10000000);
21020         }
21021       }
21022       buf += k;
21023     }
21024   } // while
21025 
21026   // check for invalid input
21027   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
21028   if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
21029     return std::make_pair(nullptr, utf8_output);
21030   }
21031 
21032   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
21033 
21034   return std::make_pair(buf, utf8_output);
21035 }
21036 
21037 
avx2_convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output)21038 std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
21039   const char32_t* end = buf + len;
21040   const char32_t* start = buf;
21041 
21042   const __m256i v_0000 = _mm256_setzero_si256();
21043   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
21044   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
21045   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
21046   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
21047   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
21048   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
21049 
21050   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
21051 
21052   while (buf + 16 + safety_margin <= end) {
21053     __m256i in = _mm256_loadu_si256((__m256i*)buf);
21054     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
21055     // Check for too large input
21056     const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
21057     if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
21058       return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
21059     }
21060 
21061     // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
21062     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
21063     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
21064 
21065     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
21066 
21067     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
21068       // 1. pack the bytes
21069       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
21070       // 2. store (16 bytes)
21071       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
21072       // 3. adjust pointers
21073       buf += 16;
21074       utf8_output += 16;
21075       continue; // we are done for this round!
21076     }
21077     // no bits set above 7th bit
21078     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
21079     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
21080 
21081     // no bits set above 11th bit
21082     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
21083     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
21084     if (one_or_two_bytes_bitmask == 0xffffffff) {
21085       // 1. prepare 2-byte values
21086       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
21087       // expected output   : [110a|aaaa|10bb|bbbb] x 8
21088       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
21089       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
21090 
21091       // t0 = [000a|aaaa|bbbb|bb00]
21092       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
21093       // t1 = [000a|aaaa|0000|0000]
21094       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
21095       // t2 = [0000|0000|00bb|bbbb]
21096       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
21097       // t3 = [000a|aaaa|00bb|bbbb]
21098       const __m256i t3 = _mm256_or_si256(t1, t2);
21099       // t4 = [110a|aaaa|10bb|bbbb]
21100       const __m256i t4 = _mm256_or_si256(t3, v_c080);
21101 
21102       // 2. merge ASCII and 2-byte codewords
21103       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
21104 
21105       // 3. prepare bitmask for 8-bit lookup
21106       const uint32_t M0 = one_byte_bitmask & 0x55555555;
21107       const uint32_t M1 = M0 >> 7;
21108       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
21109       // 4. pack the bytes
21110 
21111       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
21112       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
21113 
21114       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
21115       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
21116 
21117       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
21118       // 5. store bytes
21119       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
21120       utf8_output += row[0];
21121       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
21122       utf8_output += row_2[0];
21123 
21124       // 6. adjust pointers
21125       buf += 16;
21126       continue;
21127     }
21128     // Must check for overflow in packing
21129     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
21130     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
21131     if (saturation_bitmask == 0xffffffff) {
21132       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
21133 
21134       // Check for illegal surrogate words
21135       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
21136       const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
21137       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
21138         return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
21139       }
21140 
21141       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
21142                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
21143                                               0x0000, 0x0202, 0x0404, 0x0606,
21144                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
21145 
21146       /* In this branch we handle three cases:
21147         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
21148         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
21149         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
21150 
21151         We expand the input word (16-bit) into two words (32-bit), thus
21152         we have room for four bytes. However, we need five distinct bit
21153         layouts. Note that the last byte in cases #2 and #3 is the same.
21154 
21155         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
21156         in register t2.
21157 
21158         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
21159         either byte 1 for case #2 or byte 2 for case #3. Note that they
21160         differ by exactly one bit.
21161 
21162         Finally from these two words we build proper UTF-8 sequence, taking
21163         into account the case (i.e, the number of bytes to write).
21164       */
21165       /**
21166        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
21167        * t2 => [0ccc|cccc] [10cc|cccc]
21168        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
21169        */
21170 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
21171       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
21172       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
21173       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
21174       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
21175       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
21176       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
21177 
21178       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
21179       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
21180       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
21181       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
21182       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
21183       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
21184       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
21185       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
21186       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
21187       const __m256i s4 = _mm256_xor_si256(s3, m0);
21188 #undef simdutf_vec
21189 
21190       // 4. expand words 16-bit => 32-bit
21191       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
21192       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
21193 
21194       // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
21195       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
21196                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
21197       // Due to the wider registers, the following path is less likely to be useful.
21198       /*if(mask == 0) {
21199         // We only have three-byte words. Use fast path.
21200         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
21201         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
21202         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
21203         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
21204         utf8_output += 12;
21205         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
21206         utf8_output += 12;
21207         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
21208         utf8_output += 12;
21209         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
21210         utf8_output += 12;
21211         buf += 16;
21212         continue;
21213       }*/
21214       const uint8_t mask0 = uint8_t(mask);
21215       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
21216       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
21217       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
21218 
21219       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
21220       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
21221       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
21222       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
21223 
21224       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
21225       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
21226       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
21227       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
21228 
21229 
21230       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
21231       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
21232       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
21233       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
21234 
21235       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
21236       utf8_output += row0[0];
21237       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
21238       utf8_output += row1[0];
21239       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
21240       utf8_output += row2[0];
21241       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
21242       utf8_output += row3[0];
21243       buf += 16;
21244     } else {
21245       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
21246       // Let us do a scalar fallback.
21247       // It may seem wasteful to use scalar code, but being efficient with SIMD
21248       // may require large, non-trivial tables?
21249       size_t forward = 15;
21250       size_t k = 0;
21251       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
21252       for(; k < forward; k++) {
21253         uint32_t word = buf[k];
21254         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
21255           *utf8_output++ = char(word);
21256         } else if((word & 0xFFFFF800)==0) { // 2-byte
21257           *utf8_output++ = char((word>>6) | 0b11000000);
21258           *utf8_output++ = char((word & 0b111111) | 0b10000000);
21259         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
21260           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
21261           *utf8_output++ = char((word>>12) | 0b11100000);
21262           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
21263           *utf8_output++ = char((word & 0b111111) | 0b10000000);
21264         } else {  // 4-byte
21265           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
21266           *utf8_output++ = char((word>>18) | 0b11110000);
21267           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
21268           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
21269           *utf8_output++ = char((word & 0b111111) | 0b10000000);
21270         }
21271       }
21272       buf += k;
21273     }
21274   } // while
21275 
21276   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
21277 }
21278 /* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
21279 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp
21280 /* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
21281 template <endianness big_endian>
avx2_convert_utf32_to_utf16(const char32_t * buf,size_t len,char16_t * utf16_output)21282 std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
21283   const char32_t* end = buf + len;
21284 
21285   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
21286   __m256i forbidden_bytemask = _mm256_setzero_si256();
21287 
21288 
21289   while (buf + 8 + safety_margin <= end) {
21290     __m256i in = _mm256_loadu_si256((__m256i*)buf);
21291 
21292     const __m256i v_00000000 = _mm256_setzero_si256();
21293     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
21294 
21295     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
21296     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
21297     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
21298 
21299     if (saturation_bitmask == 0xffffffff) {
21300       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
21301       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
21302       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
21303 
21304       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
21305       if (big_endian) {
21306         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
21307         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
21308       }
21309       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
21310       utf16_output += 8;
21311       buf += 8;
21312     } else {
21313       size_t forward = 7;
21314       size_t k = 0;
21315       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
21316       for(; k < forward; k++) {
21317         uint32_t word = buf[k];
21318         if((word & 0xFFFF0000)==0) {
21319           // will not generate a surrogate pair
21320           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
21321           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
21322         } else {
21323           // will generate a surrogate pair
21324           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
21325           word -= 0x10000;
21326           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
21327           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
21328           if (big_endian) {
21329             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
21330             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
21331           }
21332           *utf16_output++ = char16_t(high_surrogate);
21333           *utf16_output++ = char16_t(low_surrogate);
21334         }
21335       }
21336       buf += k;
21337     }
21338   }
21339 
21340   // check for invalid input
21341   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
21342 
21343   return std::make_pair(buf, utf16_output);
21344 }
21345 
21346 
21347 template <endianness big_endian>
avx2_convert_utf32_to_utf16_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output)21348 std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
21349   const char32_t* start = buf;
21350   const char32_t* end = buf + len;
21351 
21352   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
21353 
21354   while (buf + 8 + safety_margin <= end) {
21355     __m256i in = _mm256_loadu_si256((__m256i*)buf);
21356 
21357     const __m256i v_00000000 = _mm256_setzero_si256();
21358     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
21359 
21360     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
21361     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
21362     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
21363 
21364     if (saturation_bitmask == 0xffffffff) {
21365       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
21366       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
21367       const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
21368       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
21369         return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
21370       }
21371 
21372       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
21373       if (big_endian) {
21374         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
21375         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
21376       }
21377       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
21378       utf16_output += 8;
21379       buf += 8;
21380     } else {
21381       size_t forward = 7;
21382       size_t k = 0;
21383       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
21384       for(; k < forward; k++) {
21385         uint32_t word = buf[k];
21386         if((word & 0xFFFF0000)==0) {
21387           // will not generate a surrogate pair
21388           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
21389           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
21390         } else {
21391           // will generate a surrogate pair
21392           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
21393           word -= 0x10000;
21394           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
21395           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
21396           if (big_endian) {
21397             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
21398             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
21399           }
21400           *utf16_output++ = char16_t(high_surrogate);
21401           *utf16_output++ = char16_t(low_surrogate);
21402         }
21403       }
21404       buf += k;
21405     }
21406   }
21407 
21408   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
21409 }
21410 /* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
21411 } // unnamed namespace
21412 } // namespace haswell
21413 } // namespace simdutf
21414 
21415 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
21416 /* begin file src/generic/buf_block_reader.h */
21417 namespace simdutf {
21418 namespace haswell {
21419 namespace {
21420 
21421 // Walks through a buffer in block-sized increments, loading the last part with spaces
21422 template<size_t STEP_SIZE>
21423 struct buf_block_reader {
21424 public:
21425   simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
21426   simdutf_really_inline size_t block_index();
21427   simdutf_really_inline bool has_full_block() const;
21428   simdutf_really_inline const uint8_t *full_block() const;
21429   /**
21430    * Get the last block, padded with spaces.
21431    *
21432    * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
21433    * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
21434    * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
21435    *
21436    * @return the number of effective characters in the last block.
21437    */
21438   simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
21439   simdutf_really_inline void advance();
21440 private:
21441   const uint8_t *buf;
21442   const size_t len;
21443   const size_t lenminusstep;
21444   size_t idx;
21445 };
21446 
21447 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)21448 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
21449   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
21450   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
21451     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
21452   }
21453   buf[sizeof(simd8x64<uint8_t>)] = '\0';
21454   return buf;
21455 }
21456 
21457 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)21458 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
21459   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
21460   in.store(reinterpret_cast<uint8_t*>(buf));
21461   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
21462     if (buf[i] < ' ') { buf[i] = '_'; }
21463   }
21464   buf[sizeof(simd8x64<uint8_t>)] = '\0';
21465   return buf;
21466 }
21467 
/**
 * Debugging helper: renders a 64-bit mask as 64 characters.
 *
 * Character i of the result is 'X' when bit i of mask is set and ' ' otherwise
 * (bit 0 comes first).
 *
 * @return pointer to a NUL-terminated static buffer that is overwritten by the
 *         next call; the caller must not free it.
 */
simdutf_unused static char * format_mask(uint64_t mask) {
  // Static storage instead of a one-time malloc: nothing to leak and no
  // allocation failure to dereference.
  static char buf[64 + 1];
  for (size_t i = 0; i < 64; i++) {
    // The probe bit must be 64 bits wide: shifting a 32-bit size_t by 32 or
    // more is undefined behaviour.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
21476 
21477 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)21478 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
21479 
21480 template<size_t STEP_SIZE>
block_index()21481 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
21482 
21483 template<size_t STEP_SIZE>
has_full_block() const21484 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
21485   return idx < lenminusstep;
21486 }
21487 
21488 template<size_t STEP_SIZE>
full_block() const21489 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
21490   return &buf[idx];
21491 }
21492 
21493 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const21494 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
21495   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
21496   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
21497   std::memcpy(dst, buf + idx, len - idx);
21498   return len - idx;
21499 }
21500 
21501 template<size_t STEP_SIZE>
advance()21502 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
21503   idx += STEP_SIZE;
21504 }
21505 
21506 } // unnamed namespace
21507 } // namespace haswell
21508 } // namespace simdutf
21509 /* end file src/generic/buf_block_reader.h */
21510 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
21511 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
21512 namespace simdutf {
21513 namespace haswell {
21514 namespace {
21515 namespace utf8_validation {
21516 
21517 using namespace simd;
21518 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)21519   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
21520 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
21521 // Bit 1 = Too Long (ASCII followed by continuation)
21522 // Bit 2 = Overlong 3-byte
21523 // Bit 4 = Surrogate
21524 // Bit 5 = Overlong 2-byte
21525 // Bit 7 = Two Continuations
21526     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
21527                                                 // 11______ 11______
21528     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
21529     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
21530     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
21531     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
21532     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
21533     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
21534                                                 // 11110100 101_____
21535                                                 // 11110101 1001____
21536                                                 // 11110101 101_____
21537                                                 // 1111011_ 1001____
21538                                                 // 1111011_ 101_____
21539                                                 // 11111___ 1001____
21540                                                 // 11111___ 101_____
21541     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
21542                                                 // 11110101 1000____
21543                                                 // 1111011_ 1000____
21544                                                 // 11111___ 1000____
21545     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
21546 
21547     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
21548       // 0_______ ________ <ASCII in byte 1>
21549       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
21550       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
21551       // 10______ ________ <continuation in byte 1>
21552       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
21553       // 1100____ ________ <two byte lead in byte 1>
21554       TOO_SHORT | OVERLONG_2,
21555       // 1101____ ________ <two byte lead in byte 1>
21556       TOO_SHORT,
21557       // 1110____ ________ <three byte lead in byte 1>
21558       TOO_SHORT | OVERLONG_3 | SURROGATE,
21559       // 1111____ ________ <four+ byte lead in byte 1>
21560       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
21561     );
21562     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
21563     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
21564       // ____0000 ________
21565       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
21566       // ____0001 ________
21567       CARRY | OVERLONG_2,
21568       // ____001_ ________
21569       CARRY,
21570       CARRY,
21571 
21572       // ____0100 ________
21573       CARRY | TOO_LARGE,
21574       // ____0101 ________
21575       CARRY | TOO_LARGE | TOO_LARGE_1000,
21576       // ____011_ ________
21577       CARRY | TOO_LARGE | TOO_LARGE_1000,
21578       CARRY | TOO_LARGE | TOO_LARGE_1000,
21579 
21580       // ____1___ ________
21581       CARRY | TOO_LARGE | TOO_LARGE_1000,
21582       CARRY | TOO_LARGE | TOO_LARGE_1000,
21583       CARRY | TOO_LARGE | TOO_LARGE_1000,
21584       CARRY | TOO_LARGE | TOO_LARGE_1000,
21585       CARRY | TOO_LARGE | TOO_LARGE_1000,
21586       // ____1101 ________
21587       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
21588       CARRY | TOO_LARGE | TOO_LARGE_1000,
21589       CARRY | TOO_LARGE | TOO_LARGE_1000
21590     );
21591     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
21592       // ________ 0_______ <ASCII in byte 2>
21593       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
21594       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
21595 
21596       // ________ 1000____
21597       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
21598       // ________ 1001____
21599       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
21600       // ________ 101_____
21601       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
21602       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
21603 
21604       // ________ 11______
21605       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
21606     );
21607     return (byte_1_high & byte_1_low & byte_2_high);
21608   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)21609   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
21610       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
21611     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
21612     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
21613     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
21614     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
21615     return must23_80 ^ sc;
21616   }
21617 
21618   //
21619   // Return nonzero if there are incomplete multibyte characters at the end of the block:
21620   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
21621   //
is_incomplete(const simd8<uint8_t> input)21622   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
21623     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
21624     // ... 1111____ 111_____ 11______
21625     static const uint8_t max_array[32] = {
21626       255, 255, 255, 255, 255, 255, 255, 255,
21627       255, 255, 255, 255, 255, 255, 255, 255,
21628       255, 255, 255, 255, 255, 255, 255, 255,
21629       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
21630     };
21631     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
21632     return input.gt_bits(max_value);
21633   }
21634 
21635   struct utf8_checker {
21636     // If this is nonzero, there has been a UTF-8 error.
21637     simd8<uint8_t> error;
21638     // The last input we received
21639     simd8<uint8_t> prev_input_block;
21640     // Whether the last input we received was incomplete (used for ASCII fast path)
21641     simd8<uint8_t> prev_incomplete;
21642 
21643     //
21644     // Check whether the current bytes are valid UTF-8.
21645     //
check_utf8_bytessimdutf::haswell::__anone55652eb3111::utf8_validation::utf8_checker21646     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
21647       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
21648       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
21649       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
21650       simd8<uint8_t> sc = check_special_cases(input, prev1);
21651       this->error |= check_multibyte_lengths(input, prev_input, sc);
21652     }
21653 
21654     // The only problem that can happen at EOF is that a multibyte character is too short
21655     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
21656     // too large in the first of two bytes.
check_eofsimdutf::haswell::__anone55652eb3111::utf8_validation::utf8_checker21657     simdutf_really_inline void check_eof() {
21658       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
21659       // possibly finish them.
21660       this->error |= this->prev_incomplete;
21661     }
21662 
check_next_inputsimdutf::haswell::__anone55652eb3111::utf8_validation::utf8_checker21663     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
21664       if(simdutf_likely(is_ascii(input))) {
21665         this->error |= this->prev_incomplete;
21666       } else {
21667         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
21668         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
21669             "We support either two or four chunks per 64-byte block.");
21670         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
21671           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
21672           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
21673         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
21674           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
21675           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
21676           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
21677           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
21678         }
21679         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
21680         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
21681 
21682       }
21683     }
21684 
21685     // do not forget to call check_eof!
errorssimdutf::haswell::__anone55652eb3111::utf8_validation::utf8_checker21686     simdutf_really_inline bool errors() const {
21687       return this->error.any_bits_set_anywhere();
21688     }
21689 
21690   }; // struct utf8_checker
21691 } // namespace utf8_validation
21692 
21693 using utf8_validation::utf8_checker;
21694 
21695 } // unnamed namespace
21696 } // namespace haswell
21697 } // namespace simdutf
21698 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
21699 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
21700 /* begin file src/generic/utf8_validation/utf8_validator.h */
21701 namespace simdutf {
21702 namespace haswell {
21703 namespace {
21704 namespace utf8_validation {
21705 
21706 /**
21707  * Validates that the string is actual UTF-8.
21708  */
21709 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)21710 bool generic_validate_utf8(const uint8_t * input, size_t length) {
21711     checker c{};
21712     buf_block_reader<64> reader(input, length);
21713     while (reader.has_full_block()) {
21714       simd::simd8x64<uint8_t> in(reader.full_block());
21715       c.check_next_input(in);
21716       reader.advance();
21717     }
21718     uint8_t block[64]{};
21719     reader.get_remainder(block);
21720     simd::simd8x64<uint8_t> in(block);
21721     c.check_next_input(in);
21722     reader.advance();
21723     c.check_eof();
21724     return !c.errors();
21725 }
21726 
generic_validate_utf8(const char * input,size_t length)21727 bool generic_validate_utf8(const char * input, size_t length) {
21728   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
21729 }
21730 
21731 /**
21732  * Validates that the string is actual UTF-8 and stops on errors.
21733  */
21734 template<class checker>
generic_validate_utf8_with_errors(const uint8_t * input,size_t length)21735 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
21736     checker c{};
21737     buf_block_reader<64> reader(input, length);
21738     size_t count{0};
21739     while (reader.has_full_block()) {
21740       simd::simd8x64<uint8_t> in(reader.full_block());
21741       c.check_next_input(in);
21742       if(c.errors()) {
21743         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
21744         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
21745         res.count += count;
21746         return res;
21747       }
21748       reader.advance();
21749       count += 64;
21750     }
21751     uint8_t block[64]{};
21752     reader.get_remainder(block);
21753     simd::simd8x64<uint8_t> in(block);
21754     c.check_next_input(in);
21755     reader.advance();
21756     c.check_eof();
21757     if (c.errors()) {
21758       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
21759       res.count += count;
21760       return res;
21761     } else {
21762       return result(error_code::SUCCESS, length);
21763     }
21764 }
21765 
generic_validate_utf8_with_errors(const char * input,size_t length)21766 result generic_validate_utf8_with_errors(const char * input, size_t length) {
21767   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
21768 }
21769 
21770 template<class checker>
generic_validate_ascii(const uint8_t * input,size_t length)21771 bool generic_validate_ascii(const uint8_t * input, size_t length) {
21772     buf_block_reader<64> reader(input, length);
21773     uint8_t blocks[64]{};
21774     simd::simd8x64<uint8_t> running_or(blocks);
21775     while (reader.has_full_block()) {
21776       simd::simd8x64<uint8_t> in(reader.full_block());
21777       running_or |= in;
21778       reader.advance();
21779     }
21780     uint8_t block[64]{};
21781     reader.get_remainder(block);
21782     simd::simd8x64<uint8_t> in(block);
21783     running_or |= in;
21784     return running_or.is_ascii();
21785 }
21786 
generic_validate_ascii(const char * input,size_t length)21787 bool generic_validate_ascii(const char * input, size_t length) {
21788   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
21789 }
21790 
21791 template<class checker>
generic_validate_ascii_with_errors(const uint8_t * input,size_t length)21792 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
21793   buf_block_reader<64> reader(input, length);
21794   size_t count{0};
21795   while (reader.has_full_block()) {
21796     simd::simd8x64<uint8_t> in(reader.full_block());
21797     if (!in.is_ascii()) {
21798       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
21799       return result(res.error, count + res.count);
21800     }
21801     reader.advance();
21802 
21803     count += 64;
21804   }
21805   uint8_t block[64]{};
21806   reader.get_remainder(block);
21807   simd::simd8x64<uint8_t> in(block);
21808   if (!in.is_ascii()) {
21809     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
21810     return result(res.error, count + res.count);
21811   } else {
21812     return result(error_code::SUCCESS, length);
21813   }
21814 }
21815 
generic_validate_ascii_with_errors(const char * input,size_t length)21816 result generic_validate_ascii_with_errors(const char * input, size_t length) {
21817   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
21818 }
21819 
21820 } // namespace utf8_validation
21821 } // unnamed namespace
21822 } // namespace haswell
21823 } // namespace simdutf
21824 /* end file src/generic/utf8_validation/utf8_validator.h */
21825 // transcoding from UTF-8 to UTF-16
21826 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
21827 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
21828 
21829 
21830 namespace simdutf {
21831 namespace haswell {
21832 namespace {
21833 namespace utf8_to_utf16 {
21834 
21835 using namespace simd;
21836 
21837 template <endianness endian>
convert_valid(const char * input,size_t size,char16_t * utf16_output)21838 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
21839     char16_t* utf16_output) noexcept {
21840   // The implementation is not specific to haswell and should be moved to the generic directory.
21841   size_t pos = 0;
21842   char16_t* start{utf16_output};
21843   const size_t safety_margin = 16; // to avoid overruns!
21844   while(pos + 64 + safety_margin <= size) {
21845     // this loop could be unrolled further. For example, we could process the mask
21846     // far more than 64 bytes.
21847     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
21848     if(in.is_ascii()) {
21849       in.store_ascii_as_utf16<endian>(utf16_output);
21850       utf16_output += 64;
21851       pos += 64;
21852     } else {
21853       // Slow path. We hope that the compiler will recognize that this is a slow path.
21854       // Anything that is not a continuation mask is a 'leading byte', that is, the
21855       // start of a new code point.
21856       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
21857       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
21858       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
21859       // The *start* of code points is not so useful, rather, we want the *end* of code points.
21860       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
21861       // We process in blocks of up to 12 bytes except possibly
21862       // for fast paths which may process up to 16 bytes. For the
21863       // slow path to work, we should have at least 12 input bytes left.
21864       size_t max_starting_point = (pos + 64) - 12;
21865       // Next loop is going to run at least five times when using solely
21866       // the slow/regular path, and at least four times if there are fast paths.
21867       while(pos < max_starting_point) {
21868         // Performance note: our ability to compute 'consumed' and
21869         // then shift and recompute is critical. If there is a
21870         // latency of, say, 4 cycles on getting 'consumed', then
21871         // the inner loop might have a total latency of about 6 cycles.
21872         // Yet we process between 6 to 12 inputs bytes, thus we get
21873         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
21874         // for this section of the code. Hence, there is a limit
21875         // to how much we can further increase this latency before
21876         // it seriously harms performance.
21877         //
21878         // Thus we may allow convert_masked_utf8_to_utf16 to process
21879         // more bytes at a time under a fast-path mode where 16 bytes
21880         // are consumed at once (e.g., when encountering ASCII).
21881         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
21882                             utf8_end_of_code_point_mask, utf16_output);
21883         pos += consumed;
21884         utf8_end_of_code_point_mask >>= consumed;
21885       }
21886       // At this point there may remain between 0 and 12 bytes in the
21887       // 64-byte block. These bytes will be processed again. So we have an
21888       // 80% efficiency (in the worst case). In practice we expect an
21889       // 85% to 90% efficiency.
21890     }
21891   }
21892   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
21893   return utf16_output - start;
21894 }
21895 
21896 } // namespace utf8_to_utf16
21897 } // unnamed namespace
21898 } // namespace haswell
21899 } // namespace simdutf
21900 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
21901 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
21902 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
21903 
21904 
21905 namespace simdutf {
21906 namespace haswell {
21907 namespace {
21908 namespace utf8_to_utf16 {
21909 using namespace simd;
21910 
21911 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)21912   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
21913 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
21914 // Bit 1 = Too Long (ASCII followed by continuation)
21915 // Bit 2 = Overlong 3-byte
21916 // Bit 4 = Surrogate
21917 // Bit 5 = Overlong 2-byte
21918 // Bit 7 = Two Continuations
21919     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
21920                                                 // 11______ 11______
21921     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
21922     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
21923     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
21924     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
21925     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
21926     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
21927                                                 // 11110100 101_____
21928                                                 // 11110101 1001____
21929                                                 // 11110101 101_____
21930                                                 // 1111011_ 1001____
21931                                                 // 1111011_ 101_____
21932                                                 // 11111___ 1001____
21933                                                 // 11111___ 101_____
21934     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
21935                                                 // 11110101 1000____
21936                                                 // 1111011_ 1000____
21937                                                 // 11111___ 1000____
21938     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
21939 
21940     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
21941       // 0_______ ________ <ASCII in byte 1>
21942       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
21943       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
21944       // 10______ ________ <continuation in byte 1>
21945       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
21946       // 1100____ ________ <two byte lead in byte 1>
21947       TOO_SHORT | OVERLONG_2,
21948       // 1101____ ________ <two byte lead in byte 1>
21949       TOO_SHORT,
21950       // 1110____ ________ <three byte lead in byte 1>
21951       TOO_SHORT | OVERLONG_3 | SURROGATE,
21952       // 1111____ ________ <four+ byte lead in byte 1>
21953       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
21954     );
21955     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
21956     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
21957       // ____0000 ________
21958       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
21959       // ____0001 ________
21960       CARRY | OVERLONG_2,
21961       // ____001_ ________
21962       CARRY,
21963       CARRY,
21964 
21965       // ____0100 ________
21966       CARRY | TOO_LARGE,
21967       // ____0101 ________
21968       CARRY | TOO_LARGE | TOO_LARGE_1000,
21969       // ____011_ ________
21970       CARRY | TOO_LARGE | TOO_LARGE_1000,
21971       CARRY | TOO_LARGE | TOO_LARGE_1000,
21972 
21973       // ____1___ ________
21974       CARRY | TOO_LARGE | TOO_LARGE_1000,
21975       CARRY | TOO_LARGE | TOO_LARGE_1000,
21976       CARRY | TOO_LARGE | TOO_LARGE_1000,
21977       CARRY | TOO_LARGE | TOO_LARGE_1000,
21978       CARRY | TOO_LARGE | TOO_LARGE_1000,
21979       // ____1101 ________
21980       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
21981       CARRY | TOO_LARGE | TOO_LARGE_1000,
21982       CARRY | TOO_LARGE | TOO_LARGE_1000
21983     );
21984     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
21985       // ________ 0_______ <ASCII in byte 2>
21986       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
21987       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
21988 
21989       // ________ 1000____
21990       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
21991       // ________ 1001____
21992       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
21993       // ________ 101_____
21994       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
21995       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
21996 
21997       // ________ 11______
21998       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
21999     );
22000     return (byte_1_high & byte_1_low & byte_2_high);
22001   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)22002   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
22003       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
22004     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
22005     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
22006     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
22007     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
22008     return must23_80 ^ sc;
22009   }
22010 
22011 
22012   struct validating_transcoder {
22013     // If this is nonzero, there has been a UTF-8 error.
22014     simd8<uint8_t> error;
22015 
validating_transcodersimdutf::haswell::__anone55652eb3411::utf8_to_utf16::validating_transcoder22016     validating_transcoder() : error(uint8_t(0)) {}
22017     //
22018     // Check whether the current bytes are valid UTF-8.
22019     //
check_utf8_bytessimdutf::haswell::__anone55652eb3411::utf8_to_utf16::validating_transcoder22020     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
22021       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
22022       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
22023       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
22024       simd8<uint8_t> sc = check_special_cases(input, prev1);
22025       this->error |= check_multibyte_lengths(input, prev_input, sc);
22026     }
22027 
22028 
22029     template <endianness endian>
convertsimdutf::haswell::__anone55652eb3411::utf8_to_utf16::validating_transcoder22030     simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
22031       size_t pos = 0;
22032       char16_t* start{utf16_output};
22033       // In the worst case, we have the haswell kernel which can cause an overflow of
22034       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
22035       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
22036       // much more than 8 bytes. However, you cannot generally assume that you have valid
22037       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
22038       // to give us a good margin.
22039       size_t leading_byte = 0;
22040       size_t margin = size;
22041       for(; margin > 0 && leading_byte < 8; margin--) {
22042         leading_byte += (int8_t(in[margin-1]) > -65);
22043       }
22044       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
22045       const size_t safety_margin = size - margin + 1; // to avoid overruns!
22046       while(pos + 64 + safety_margin <= size) {
22047         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
22048         if(input.is_ascii()) {
22049           input.store_ascii_as_utf16<endian>(utf16_output);
22050           utf16_output += 64;
22051           pos += 64;
22052         } else {
22053           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
22054           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
22055               "We support either two or four chunks per 64-byte block.");
22056           auto zero = simd8<uint8_t>{uint8_t(0)};
22057           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
22058             this->check_utf8_bytes(input.chunks[0], zero);
22059             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22060           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
22061             this->check_utf8_bytes(input.chunks[0], zero);
22062             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22063             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
22064             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
22065           }
22066           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
22067           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
22068           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
22069           // We process in blocks of up to 12 bytes except possibly
22070           // for fast paths which may process up to 16 bytes. For the
22071           // slow path to work, we should have at least 12 input bytes left.
22072           size_t max_starting_point = (pos + 64) - 12;
22073           // Next loop is going to run at least five times.
22074           while(pos < max_starting_point) {
22075             // Performance note: our ability to compute 'consumed' and
22076             // then shift and recompute is critical. If there is a
22077             // latency of, say, 4 cycles on getting 'consumed', then
22078             // the inner loop might have a total latency of about 6 cycles.
22079             // Yet we process between 6 to 12 inputs bytes, thus we get
22080             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
22081             // for this section of the code. Hence, there is a limit
22082             // to how much we can further increase this latency before
22083             // it seriously harms performance.
22084             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
22085                             utf8_end_of_code_point_mask, utf16_output);
22086             pos += consumed;
22087             utf8_end_of_code_point_mask >>= consumed;
22088           }
22089           // At this point there may remain between 0 and 12 bytes in the
22090           // 64-byte block. These bytes will be processed again. So we have an
22091           // 80% efficiency (in the worst case). In practice we expect an
22092           // 85% to 90% efficiency.
22093         }
22094       }
22095       if(errors()) { return 0; }
22096       if(pos < size) {
22097         size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
22098         if(howmany == 0) { return 0; }
22099         utf16_output += howmany;
22100       }
22101       return utf16_output - start;
22102     }
22103 
22104     template <endianness endian>
convert_with_errorssimdutf::haswell::__anone55652eb3411::utf8_to_utf16::validating_transcoder22105     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
22106       size_t pos = 0;
22107       char16_t* start{utf16_output};
22108       // In the worst case, we have the haswell kernel which can cause an overflow of
22109       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
22110       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
22111       // much more than 8 bytes. However, you cannot generally assume that you have valid
22112       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
22113       // to give us a good margin.
22114       size_t leading_byte = 0;
22115       size_t margin = size;
22116       for(; margin > 0 && leading_byte < 8; margin--) {
22117         leading_byte += (int8_t(in[margin-1]) > -65);
22118       }
22119       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
22120       const size_t safety_margin = size - margin + 1; // to avoid overruns!
22121       while(pos + 64 + safety_margin <= size) {
22122         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
22123         if(input.is_ascii()) {
22124           input.store_ascii_as_utf16<endian>(utf16_output);
22125           utf16_output += 64;
22126           pos += 64;
22127         } else {
22128           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
22129           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
22130               "We support either two or four chunks per 64-byte block.");
22131           auto zero = simd8<uint8_t>{uint8_t(0)};
22132           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
22133             this->check_utf8_bytes(input.chunks[0], zero);
22134             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22135           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
22136             this->check_utf8_bytes(input.chunks[0], zero);
22137             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22138             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
22139             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
22140           }
22141           if (errors()) {
22142             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
22143             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
22144             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
22145             res.count += pos;
22146             return res;
22147           }
22148           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
22149           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
22150           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
22151           // We process in blocks of up to 12 bytes except possibly
22152           // for fast paths which may process up to 16 bytes. For the
22153           // slow path to work, we should have at least 12 input bytes left.
22154           size_t max_starting_point = (pos + 64) - 12;
22155           // Next loop is going to run at least five times.
22156           while(pos < max_starting_point) {
22157             // Performance note: our ability to compute 'consumed' and
22158             // then shift and recompute is critical. If there is a
22159             // latency of, say, 4 cycles on getting 'consumed', then
22160             // the inner loop might have a total latency of about 6 cycles.
22161             // Yet we process between 6 to 12 inputs bytes, thus we get
22162             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
22163             // for this section of the code. Hence, there is a limit
22164             // to how much we can further increase this latency before
22165             // it seriously harms performance.
22166             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
22167                             utf8_end_of_code_point_mask, utf16_output);
22168             pos += consumed;
22169             utf8_end_of_code_point_mask >>= consumed;
22170           }
22171           // At this point there may remain between 0 and 12 bytes in the
22172           // 64-byte block. These bytes will be processed again. So we have an
22173           // 80% efficiency (in the worst case). In practice we expect an
22174           // 85% to 90% efficiency.
22175         }
22176       }
22177       if(errors()) {
22178         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
22179         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
22180         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
22181         res.count += pos;
22182         return res;
22183       }
22184       if(pos < size) {
22185         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
22186         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
22187         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
22188         if (res.error) {    // In case of error, we want the error position
22189           res.count += pos;
22190           return res;
22191         } else {    // In case of success, we want the number of word written
22192           utf16_output += res.count;
22193         }
22194       }
22195       return result(error_code::SUCCESS, utf16_output - start);
22196     }
22197 
errorssimdutf::haswell::__anone55652eb3411::utf8_to_utf16::validating_transcoder22198     simdutf_really_inline bool errors() const {
22199       return this->error.any_bits_set_anywhere();
22200     }
22201 
22202   }; // struct utf8_checker
22203 } // utf8_to_utf16 namespace
22204 } // unnamed namespace
22205 } // namespace haswell
22206 } // namespace simdutf
22207 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
22208 // transcoding from UTF-8 to UTF-32
22209 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
22210 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
22211 
22212 namespace simdutf {
22213 namespace haswell {
22214 namespace {
22215 namespace utf8_to_utf32 {
22216 
22217 using namespace simd;
22218 
22219 
convert_valid(const char * input,size_t size,char32_t * utf32_output)22220 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
22221     char32_t* utf32_output) noexcept {
22222   size_t pos = 0;
22223   char32_t* start{utf32_output};
22224   const size_t safety_margin = 16; // to avoid overruns!
22225   while(pos + 64 + safety_margin <= size) {
22226     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
22227     if(in.is_ascii()) {
22228       in.store_ascii_as_utf32(utf32_output);
22229       utf32_output += 64;
22230       pos += 64;
22231     } else {
22232     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
22233     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
22234     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
22235     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
22236     size_t max_starting_point = (pos + 64) - 12;
22237     while(pos < max_starting_point) {
22238       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
22239                           utf8_end_of_code_point_mask, utf32_output);
22240       pos += consumed;
22241       utf8_end_of_code_point_mask >>= consumed;
22242       }
22243     }
22244   }
22245   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
22246   return utf32_output - start;
22247 }
22248 
22249 
22250 } // namespace utf8_to_utf32
22251 } // unnamed namespace
22252 } // namespace haswell
22253 } // namespace simdutf
22254 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
22255 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
22256 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
22257 
22258 
22259 namespace simdutf {
22260 namespace haswell {
22261 namespace {
22262 namespace utf8_to_utf32 {
22263 using namespace simd;
22264 
22265 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)22266   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
22267 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
22268 // Bit 1 = Too Long (ASCII followed by continuation)
22269 // Bit 2 = Overlong 3-byte
22270 // Bit 4 = Surrogate
22271 // Bit 5 = Overlong 2-byte
22272 // Bit 7 = Two Continuations
22273     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
22274                                                 // 11______ 11______
22275     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
22276     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
22277     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
22278     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
22279     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
22280     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
22281                                                 // 11110100 101_____
22282                                                 // 11110101 1001____
22283                                                 // 11110101 101_____
22284                                                 // 1111011_ 1001____
22285                                                 // 1111011_ 101_____
22286                                                 // 11111___ 1001____
22287                                                 // 11111___ 101_____
22288     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
22289                                                 // 11110101 1000____
22290                                                 // 1111011_ 1000____
22291                                                 // 11111___ 1000____
22292     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
22293 
22294     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
22295       // 0_______ ________ <ASCII in byte 1>
22296       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
22297       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
22298       // 10______ ________ <continuation in byte 1>
22299       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
22300       // 1100____ ________ <two byte lead in byte 1>
22301       TOO_SHORT | OVERLONG_2,
22302       // 1101____ ________ <two byte lead in byte 1>
22303       TOO_SHORT,
22304       // 1110____ ________ <three byte lead in byte 1>
22305       TOO_SHORT | OVERLONG_3 | SURROGATE,
22306       // 1111____ ________ <four+ byte lead in byte 1>
22307       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
22308     );
22309     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
22310     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
22311       // ____0000 ________
22312       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
22313       // ____0001 ________
22314       CARRY | OVERLONG_2,
22315       // ____001_ ________
22316       CARRY,
22317       CARRY,
22318 
22319       // ____0100 ________
22320       CARRY | TOO_LARGE,
22321       // ____0101 ________
22322       CARRY | TOO_LARGE | TOO_LARGE_1000,
22323       // ____011_ ________
22324       CARRY | TOO_LARGE | TOO_LARGE_1000,
22325       CARRY | TOO_LARGE | TOO_LARGE_1000,
22326 
22327       // ____1___ ________
22328       CARRY | TOO_LARGE | TOO_LARGE_1000,
22329       CARRY | TOO_LARGE | TOO_LARGE_1000,
22330       CARRY | TOO_LARGE | TOO_LARGE_1000,
22331       CARRY | TOO_LARGE | TOO_LARGE_1000,
22332       CARRY | TOO_LARGE | TOO_LARGE_1000,
22333       // ____1101 ________
22334       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
22335       CARRY | TOO_LARGE | TOO_LARGE_1000,
22336       CARRY | TOO_LARGE | TOO_LARGE_1000
22337     );
22338     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
22339       // ________ 0_______ <ASCII in byte 2>
22340       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
22341       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
22342 
22343       // ________ 1000____
22344       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
22345       // ________ 1001____
22346       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
22347       // ________ 101_____
22348       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
22349       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
22350 
22351       // ________ 11______
22352       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
22353     );
22354     return (byte_1_high & byte_1_low & byte_2_high);
22355   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)22356   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
22357       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
22358     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
22359     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
22360     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
22361     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
22362     return must23_80 ^ sc;
22363   }
22364 
22365 
22366   struct validating_transcoder {
22367     // If this is nonzero, there has been a UTF-8 error.
22368     simd8<uint8_t> error;
22369 
    // Start with an all-zero error accumulator: no UTF-8 error seen yet.
    validating_transcoder() : error(uint8_t(0)) {}
22371     //
22372     // Check whether the current bytes are valid UTF-8.
22373     //
check_utf8_bytessimdutf::haswell::__anone55652eb3611::utf8_to_utf32::validating_transcoder22374     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
22375       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
22376       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
22377       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
22378       simd8<uint8_t> sc = check_special_cases(input, prev1);
22379       this->error |= check_multibyte_lengths(input, prev_input, sc);
22380     }
22381 
22382 
22383 
convertsimdutf::haswell::__anone55652eb3611::utf8_to_utf32::validating_transcoder22384     simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
22385       size_t pos = 0;
22386       char32_t* start{utf32_output};
22387       // In the worst case, we have the haswell kernel which can cause an overflow of
22388       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
22389       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
22390       // much more than 8 bytes. However, you cannot generally assume that you have valid
22391       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
22392       // to give us a good margin.
22393       size_t leading_byte = 0;
22394       size_t margin = size;
22395       for(; margin > 0 && leading_byte < 4; margin--) {
22396         leading_byte += (int8_t(in[margin-1]) > -65);
22397       }
22398       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
22399       const size_t safety_margin = size - margin + 1; // to avoid overruns!
22400       while(pos + 64 + safety_margin <= size) {
22401         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
22402         if(input.is_ascii()) {
22403           input.store_ascii_as_utf32(utf32_output);
22404           utf32_output += 64;
22405           pos += 64;
22406         } else {
22407           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
22408           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
22409               "We support either two or four chunks per 64-byte block.");
22410           auto zero = simd8<uint8_t>{uint8_t(0)};
22411           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
22412             this->check_utf8_bytes(input.chunks[0], zero);
22413             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22414           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
22415             this->check_utf8_bytes(input.chunks[0], zero);
22416             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22417             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
22418             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
22419           }
22420           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
22421           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
22422           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
22423           // We process in blocks of up to 12 bytes except possibly
22424           // for fast paths which may process up to 16 bytes. For the
22425           // slow path to work, we should have at least 12 input bytes left.
22426           size_t max_starting_point = (pos + 64) - 12;
22427           // Next loop is going to run at least five times.
22428           while(pos < max_starting_point) {
22429             // Performance note: our ability to compute 'consumed' and
22430             // then shift and recompute is critical. If there is a
22431             // latency of, say, 4 cycles on getting 'consumed', then
22432             // the inner loop might have a total latency of about 6 cycles.
22433             // Yet we process between 6 to 12 inputs bytes, thus we get
22434             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
22435             // for this section of the code. Hence, there is a limit
22436             // to how much we can further increase this latency before
22437             // it seriously harms performance.
22438             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
22439                             utf8_end_of_code_point_mask, utf32_output);
22440             pos += consumed;
22441             utf8_end_of_code_point_mask >>= consumed;
22442           }
22443           // At this point there may remain between 0 and 12 bytes in the
22444           // 64-byte block. These bytes will be processed again. So we have an
22445           // 80% efficiency (in the worst case). In practice we expect an
22446           // 85% to 90% efficiency.
22447         }
22448       }
22449       if(errors()) { return 0; }
22450       if(pos < size) {
22451         size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
22452         if(howmany == 0) { return 0; }
22453         utf32_output += howmany;
22454       }
22455       return utf32_output - start;
22456     }
22457 
convert_with_errorssimdutf::haswell::__anone55652eb3611::utf8_to_utf32::validating_transcoder22458     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
22459       size_t pos = 0;
22460       char32_t* start{utf32_output};
22461       // In the worst case, we have the haswell kernel which can cause an overflow of
22462       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
22463       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
22464       // much more than 8 bytes. However, you cannot generally assume that you have valid
22465       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
22466       // to give us a good margin.
22467       size_t leading_byte = 0;
22468       size_t margin = size;
22469       for(; margin > 0 && leading_byte < 4; margin--) {
22470         leading_byte += (int8_t(in[margin-1]) > -65);
22471       }
22472       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
22473       const size_t safety_margin = size - margin + 1; // to avoid overruns!
22474       while(pos + 64 + safety_margin <= size) {
22475         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
22476         if(input.is_ascii()) {
22477           input.store_ascii_as_utf32(utf32_output);
22478           utf32_output += 64;
22479           pos += 64;
22480         } else {
22481           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
22482           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
22483               "We support either two or four chunks per 64-byte block.");
22484           auto zero = simd8<uint8_t>{uint8_t(0)};
22485           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
22486             this->check_utf8_bytes(input.chunks[0], zero);
22487             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22488           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
22489             this->check_utf8_bytes(input.chunks[0], zero);
22490             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
22491             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
22492             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
22493           }
22494           if (errors()) {
22495             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
22496             res.count += pos;
22497             return res;
22498           }
22499           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
22500           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
22501           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
22502           // We process in blocks of up to 12 bytes except possibly
22503           // for fast paths which may process up to 16 bytes. For the
22504           // slow path to work, we should have at least 12 input bytes left.
22505           size_t max_starting_point = (pos + 64) - 12;
22506           // Next loop is going to run at least five times.
22507           while(pos < max_starting_point) {
22508             // Performance note: our ability to compute 'consumed' and
22509             // then shift and recompute is critical. If there is a
22510             // latency of, say, 4 cycles on getting 'consumed', then
22511             // the inner loop might have a total latency of about 6 cycles.
22512             // Yet we process between 6 to 12 inputs bytes, thus we get
22513             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
22514             // for this section of the code. Hence, there is a limit
22515             // to how much we can further increase this latency before
22516             // it seriously harms performance.
22517             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
22518                             utf8_end_of_code_point_mask, utf32_output);
22519             pos += consumed;
22520             utf8_end_of_code_point_mask >>= consumed;
22521           }
22522           // At this point there may remain between 0 and 12 bytes in the
22523           // 64-byte block. These bytes will be processed again. So we have an
22524           // 80% efficiency (in the worst case). In practice we expect an
22525           // 85% to 90% efficiency.
22526         }
22527       }
22528       if(errors()) {
22529         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
22530         res.count += pos;
22531         return res;
22532       }
22533       if(pos < size) {
22534         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
22535         if (res.error) {    // In case of error, we want the error position
22536           res.count += pos;
22537           return res;
22538         } else {    // In case of success, we want the number of word written
22539           utf32_output += res.count;
22540         }
22541       }
22542       return result(error_code::SUCCESS, utf32_output - start);
22543     }
22544 
errorssimdutf::haswell::__anone55652eb3611::utf8_to_utf32::validating_transcoder22545     simdutf_really_inline bool errors() const {
22546       return this->error.any_bits_set_anywhere();
22547     }
22548 
22549   }; // struct utf8_checker
22550 } // utf8_to_utf32 namespace
22551 } // unnamed namespace
22552 } // namespace haswell
22553 } // namespace simdutf
22554 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
22555 // other functions
22556 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h
22557 /* begin file src/generic/utf8.h */
22558 
22559 namespace simdutf {
22560 namespace haswell {
22561 namespace {
22562 namespace utf8 {
22563 
22564 using namespace simd;
22565 
count_code_points(const char * in,size_t size)22566 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
22567     size_t pos = 0;
22568     size_t count = 0;
22569     for(;pos + 64 <= size; pos += 64) {
22570       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
22571       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
22572       count += 64 - count_ones(utf8_continuation_mask);
22573     }
22574     return count + scalar::utf8::count_code_points(in + pos, size - pos);
22575 }
22576 
22577 
utf16_length_from_utf8(const char * in,size_t size)22578 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
22579     size_t pos = 0;
22580     size_t count = 0;
22581     // This algorithm could no doubt be improved!
22582     for(;pos + 64 <= size; pos += 64) {
22583       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
22584       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
22585       // We count one word for anything that is not a continuation (so
22586       // leading bytes).
22587       count += 64 - count_ones(utf8_continuation_mask);
22588       int64_t utf8_4byte = input.gteq_unsigned(240);
22589       count += count_ones(utf8_4byte);
22590     }
22591     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
22592 }
22593 
22594 
utf32_length_from_utf8(const char * in,size_t size)22595 simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
22596     return count_code_points(in, size);
22597 }
22598 } // utf8 namespace
22599 } // unnamed namespace
22600 } // namespace haswell
22601 } // namespace simdutf
22602 /* end file src/generic/utf8.h */
22603 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h
22604 /* begin file src/generic/utf16.h */
22605 namespace simdutf {
22606 namespace haswell {
22607 namespace {
22608 namespace utf16 {
22609 
22610 template <endianness big_endian>
count_code_points(const char16_t * in,size_t size)22611 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
22612     size_t pos = 0;
22613     size_t count = 0;
22614     for(;pos + 32 <= size; pos += 32) {
22615       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
22616       if (!match_system(big_endian)) input.swap_bytes();
22617       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
22618       count += count_ones(not_pair) / 2;
22619     }
22620     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
22621 }
22622 
22623 template <endianness big_endian>
utf8_length_from_utf16(const char16_t * in,size_t size)22624 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
22625     size_t pos = 0;
22626     size_t count = 0;
22627     // This algorithm could no doubt be improved!
22628     for(;pos + 32 <= size; pos += 32) {
22629       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
22630       if (!match_system(big_endian)) input.swap_bytes();
22631       uint64_t ascii_mask = input.lteq(0x7F);
22632       uint64_t twobyte_mask = input.lteq(0x7FF);
22633       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
22634 
22635       size_t ascii_count = count_ones(ascii_mask) / 2;
22636       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
22637       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
22638       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
22639       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
22640     }
22641     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
22642 }
22643 
22644 template <endianness big_endian>
utf32_length_from_utf16(const char16_t * in,size_t size)22645 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
22646     return count_code_points<big_endian>(in, size);
22647 }
22648 
change_endianness_utf16(const char16_t * in,size_t size,char16_t * output)22649 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
22650   size_t pos = 0;
22651 
22652   while (pos + 32 <= size) {
22653     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
22654     input.swap_bytes();
22655     input.store(reinterpret_cast<uint16_t *>(output));
22656     pos += 32;
22657     output += 32;
22658   }
22659 
22660   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
22661 }
22662 
22663 } // utf16
22664 } // unnamed namespace
22665 } // namespace haswell
22666 } // namespace simdutf
22667 /* end file src/generic/utf16.h */
22668 
22669 namespace simdutf {
22670 namespace haswell {
22671 
detect_encodings(const char * input,size_t length) const22672 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
22673   // If there is a BOM, then we trust it.
22674   auto bom_encoding = simdutf::BOM::check_bom(input, length);
22675   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
22676   if (length % 2 == 0) {
22677     return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
22678   } else {
22679     if (implementation::validate_utf8(input, length)) {
22680       return simdutf::encoding_type::UTF8;
22681     } else {
22682       return simdutf::encoding_type::unspecified;
22683     }
22684   }
22685 }
22686 
validate_utf8(const char * buf,size_t len) const22687 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
22688   return haswell::utf8_validation::generic_validate_utf8(buf,len);
22689 }
22690 
validate_utf8_with_errors(const char * buf,size_t len) const22691 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
22692   return haswell::utf8_validation::generic_validate_utf8_with_errors(buf,len);
22693 }
22694 
validate_ascii(const char * buf,size_t len) const22695 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
22696   return haswell::utf8_validation::generic_validate_ascii(buf,len);
22697 }
22698 
validate_ascii_with_errors(const char * buf,size_t len) const22699 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
22700   return haswell::utf8_validation::generic_validate_ascii_with_errors(buf,len);
22701 }
22702 
validate_utf16le(const char16_t * buf,size_t len) const22703 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
22704   const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
22705   if (tail) {
22706     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
22707   } else {
22708     return false;
22709   }
22710 }
22711 
validate_utf16be(const char16_t * buf,size_t len) const22712 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
22713   const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
22714   if (tail) {
22715     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
22716   } else {
22717     return false;
22718   }
22719 }
22720 
validate_utf16le_with_errors(const char16_t * buf,size_t len) const22721 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
22722   result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
22723   if (res.count != len) {
22724     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
22725     return result(scalar_res.error, res.count + scalar_res.count);
22726   } else {
22727     return res;
22728   }
22729 }
22730 
validate_utf16be_with_errors(const char16_t * buf,size_t len) const22731 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
22732   result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
22733   if (res.count != len) {
22734     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
22735     return result(scalar_res.error, res.count + scalar_res.count);
22736   } else {
22737     return res;
22738   }
22739 }
22740 
validate_utf32(const char32_t * buf,size_t len) const22741 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
22742   const char32_t* tail = avx2_validate_utf32le(buf, len);
22743   if (tail) {
22744     return scalar::utf32::validate(tail, len - (tail - buf));
22745   } else {
22746     return false;
22747   }
22748 }
22749 
validate_utf32_with_errors(const char32_t * buf,size_t len) const22750 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
22751   result res = avx2_validate_utf32le_with_errors(buf, len);
22752   if (res.count != len) {
22753     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
22754     return result(scalar_res.error, res.count + scalar_res.count);
22755   } else {
22756     return res;
22757   }
22758 }
22759 
convert_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const22760 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
22761   utf8_to_utf16::validating_transcoder converter;
22762   return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
22763 }
22764 
convert_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const22765 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
22766   utf8_to_utf16::validating_transcoder converter;
22767   return converter.convert<endianness::BIG>(buf, len, utf16_output);
22768 }
22769 
convert_utf8_to_utf16le_with_errors(const char * buf,size_t len,char16_t * utf16_output) const22770 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
22771   utf8_to_utf16::validating_transcoder converter;
22772   return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
22773 }
22774 
convert_utf8_to_utf16be_with_errors(const char * buf,size_t len,char16_t * utf16_output) const22775 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
22776   utf8_to_utf16::validating_transcoder converter;
22777   return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
22778 }
22779 
convert_valid_utf8_to_utf16le(const char * input,size_t size,char16_t * utf16_output) const22780 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
22781     char16_t* utf16_output) const noexcept {
22782    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
22783 }
22784 
convert_valid_utf8_to_utf16be(const char * input,size_t size,char16_t * utf16_output) const22785 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
22786     char16_t* utf16_output) const noexcept {
22787    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
22788 }
22789 
convert_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_output) const22790 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
22791   utf8_to_utf32::validating_transcoder converter;
22792   return converter.convert(buf, len, utf32_output);
22793 }
22794 
convert_utf8_to_utf32_with_errors(const char * buf,size_t len,char32_t * utf32_output) const22795 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
22796   utf8_to_utf32::validating_transcoder converter;
22797   return converter.convert_with_errors(buf, len, utf32_output);
22798 }
22799 
convert_valid_utf8_to_utf32(const char * input,size_t size,char32_t * utf32_output) const22800 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
22801     char32_t* utf32_output) const noexcept {
22802   return utf8_to_utf32::convert_valid(input, size,  utf32_output);
22803 }
22804 
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const22805 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
22806   std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
22807   if (ret.first == nullptr) { return 0; }
22808   size_t saved_bytes = ret.second - utf8_output;
22809   if (ret.first != buf + len) {
22810     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
22811                                         ret.first, len - (ret.first - buf), ret.second);
22812     if (scalar_saved_bytes == 0) { return 0; }
22813     saved_bytes += scalar_saved_bytes;
22814   }
22815   return saved_bytes;
22816 }
22817 
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const22818 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
22819   std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
22820   if (ret.first == nullptr) { return 0; }
22821   size_t saved_bytes = ret.second - utf8_output;
22822   if (ret.first != buf + len) {
22823     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
22824                                         ret.first, len - (ret.first - buf), ret.second);
22825     if (scalar_saved_bytes == 0) { return 0; }
22826     saved_bytes += scalar_saved_bytes;
22827   }
22828   return saved_bytes;
22829 }
22830 
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const22831 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
22832   // ret.first.count is always the position in the buffer, not the number of words written even if finished
22833   std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
22834   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
22835   if (ret.first.count != len) { // All good so far, but not finished
22836     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
22837                                         buf + ret.first.count, len - ret.first.count, ret.second);
22838     if (scalar_res.error) {
22839       scalar_res.count += ret.first.count;
22840       return scalar_res;
22841     } else {
22842       ret.second += scalar_res.count;
22843     }
22844   }
22845   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
22846   return ret.first;
22847 }
22848 
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const22849 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
22850   // ret.first.count is always the position in the buffer, not the number of words written even if finished
22851   std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
22852   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
22853   if (ret.first.count != len) { // All good so far, but not finished
22854     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
22855                                         buf + ret.first.count, len - ret.first.count, ret.second);
22856     if (scalar_res.error) {
22857       scalar_res.count += ret.first.count;
22858       return scalar_res;
22859     } else {
22860       ret.second += scalar_res.count;
22861     }
22862   }
22863   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
22864   return ret.first;
22865 }
22866 
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const22867 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
22868   return convert_utf16le_to_utf8(buf, len, utf8_output);
22869 }
22870 
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const22871 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
22872   return convert_utf16be_to_utf8(buf, len, utf8_output);
22873 }
22874 
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const22875 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
22876   std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
22877   if (ret.first == nullptr) { return 0; }
22878   size_t saved_bytes = ret.second - utf8_output;
22879   if (ret.first != buf + len) {
22880     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
22881                                         ret.first, len - (ret.first - buf), ret.second);
22882     if (scalar_saved_bytes == 0) { return 0; }
22883     saved_bytes += scalar_saved_bytes;
22884   }
22885   return saved_bytes;
22886 }
22887 
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output) const22888 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
22889   // ret.first.count is always the position in the buffer, not the number of words written even if finished
22890   std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
22891   if (ret.first.count != len) {
22892     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
22893                                         buf + ret.first.count, len - ret.first.count, ret.second);
22894     if (scalar_res.error) {
22895       scalar_res.count += ret.first.count;
22896       return scalar_res;
22897     } else {
22898       ret.second += scalar_res.count;
22899     }
22900   }
22901   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
22902   return ret.first;
22903 }
22904 
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const22905 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
22906   std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
22907   if (ret.first == nullptr) { return 0; }
22908   size_t saved_bytes = ret.second - utf32_output;
22909   if (ret.first != buf + len) {
22910     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
22911                                         ret.first, len - (ret.first - buf), ret.second);
22912     if (scalar_saved_bytes == 0) { return 0; }
22913     saved_bytes += scalar_saved_bytes;
22914   }
22915   return saved_bytes;
22916 }
22917 
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const22918 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
22919   std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
22920   if (ret.first == nullptr) { return 0; }
22921   size_t saved_bytes = ret.second - utf32_output;
22922   if (ret.first != buf + len) {
22923     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
22924                                         ret.first, len - (ret.first - buf), ret.second);
22925     if (scalar_saved_bytes == 0) { return 0; }
22926     saved_bytes += scalar_saved_bytes;
22927   }
22928   return saved_bytes;
22929 }
22930 
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const22931 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
22932   // ret.first.count is always the position in the buffer, not the number of words written even if finished
22933   std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
22934   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
22935   if (ret.first.count != len) { // All good so far, but not finished
22936     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
22937                                         buf + ret.first.count, len - ret.first.count, ret.second);
22938     if (scalar_res.error) {
22939       scalar_res.count += ret.first.count;
22940       return scalar_res;
22941     } else {
22942       ret.second += scalar_res.count;
22943     }
22944   }
22945   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
22946   return ret.first;
22947 }
22948 
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const22949 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
22950   // ret.first.count is always the position in the buffer, not the number of words written even if finished
22951   std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
22952   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
22953   if (ret.first.count != len) { // All good so far, but not finished
22954     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
22955                                         buf + ret.first.count, len - ret.first.count, ret.second);
22956     if (scalar_res.error) {
22957       scalar_res.count += ret.first.count;
22958       return scalar_res;
22959     } else {
22960       ret.second += scalar_res.count;
22961     }
22962   }
22963   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
22964   return ret.first;
22965 }
22966 
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const22967 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
22968   return convert_utf32_to_utf8(buf, len, utf8_output);
22969 }
22970 
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const22971 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
22972   std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
22973   if (ret.first == nullptr) { return 0; }
22974   size_t saved_bytes = ret.second - utf16_output;
22975   if (ret.first != buf + len) {
22976     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
22977                                         ret.first, len - (ret.first - buf), ret.second);
22978     if (scalar_saved_bytes == 0) { return 0; }
22979     saved_bytes += scalar_saved_bytes;
22980   }
22981   return saved_bytes;
22982 }
22983 
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const22984 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
22985   std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
22986   if (ret.first == nullptr) { return 0; }
22987   size_t saved_bytes = ret.second - utf16_output;
22988   if (ret.first != buf + len) {
22989     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
22990                                         ret.first, len - (ret.first - buf), ret.second);
22991     if (scalar_saved_bytes == 0) { return 0; }
22992     saved_bytes += scalar_saved_bytes;
22993   }
22994   return saved_bytes;
22995 }
22996 
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const22997 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
22998   // ret.first.count is always the position in the buffer, not the number of words written even if finished
22999   std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
23000   if (ret.first.count != len) {
23001     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
23002                                         buf + ret.first.count, len - ret.first.count, ret.second);
23003     if (scalar_res.error) {
23004       scalar_res.count += ret.first.count;
23005       return scalar_res;
23006     } else {
23007       ret.second += scalar_res.count;
23008     }
23009   }
23010   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
23011   return ret.first;
23012 }
23013 
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const23014 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
23015   // ret.first.count is always the position in the buffer, not the number of words written even if finished
23016   std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
23017   if (ret.first.count != len) {
23018     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
23019                                         buf + ret.first.count, len - ret.first.count, ret.second);
23020     if (scalar_res.error) {
23021       scalar_res.count += ret.first.count;
23022       return scalar_res;
23023     } else {
23024       ret.second += scalar_res.count;
23025     }
23026   }
23027   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
23028   return ret.first;
23029 }
23030 
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const23031 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
23032   return convert_utf32_to_utf16le(buf, len, utf16_output);
23033 }
23034 
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const23035 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
23036   return convert_utf32_to_utf16be(buf, len, utf16_output);
23037 }
23038 
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const23039 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
23040   return convert_utf16le_to_utf32(buf, len, utf32_output);
23041 }
23042 
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const23043 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
23044   return convert_utf16be_to_utf32(buf, len, utf32_output);
23045 }
23046 
change_endianness_utf16(const char16_t * input,size_t length,char16_t * output) const23047 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
23048   utf16::change_endianness_utf16(input, length, output);
23049 }
23050 
count_utf16le(const char16_t * input,size_t length) const23051 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
23052   return utf16::count_code_points<endianness::LITTLE>(input, length);
23053 }
23054 
count_utf16be(const char16_t * input,size_t length) const23055 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
23056   return utf16::count_code_points<endianness::BIG>(input, length);
23057 }
23058 
count_utf8(const char * input,size_t length) const23059 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
23060   return utf8::count_code_points(input, length);
23061 }
23062 
utf8_length_from_utf16le(const char16_t * input,size_t length) const23063 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
23064   return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
23065 }
23066 
utf8_length_from_utf16be(const char16_t * input,size_t length) const23067 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
23068   return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
23069 }
23070 
utf32_length_from_utf16le(const char16_t * input,size_t length) const23071 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
23072   return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
23073 }
23074 
utf32_length_from_utf16be(const char16_t * input,size_t length) const23075 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
23076   return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
23077 }
23078 
utf16_length_from_utf8(const char * input,size_t length) const23079 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
23080   return utf8::utf16_length_from_utf8(input, length);
23081 }
23082 
utf8_length_from_utf32(const char32_t * input,size_t length) const23083 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
23084   const __m256i v_00000000 = _mm256_setzero_si256();
23085   const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
23086   const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
23087   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
23088   size_t pos = 0;
23089   size_t count = 0;
23090   for(;pos + 8 <= length; pos += 8) {
23091     __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
23092     const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
23093     const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
23094     const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
23095     const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
23096     const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
23097     const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
23098     const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
23099     const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
23100 
23101     size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
23102     size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
23103     size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
23104     count += 32 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
23105   }
23106   return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
23107 }
23108 
utf16_length_from_utf32(const char32_t * input,size_t length) const23109 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
23110   const __m256i v_00000000 = _mm256_setzero_si256();
23111   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
23112   size_t pos = 0;
23113   size_t count = 0;
23114   for(;pos + 8 <= length; pos += 8) {
23115     __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
23116     const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
23117     const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
23118     size_t surrogate_count = (32-count_ones(surrogate_bitmask))/4;
23119     count += 8 + surrogate_count;
23120   }
23121   return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
23122 }
23123 
utf32_length_from_utf8(const char * input,size_t length) const23124 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
23125   return scalar::utf8::count_code_points(input, length);
23126 }
23127 
23128 } // namespace haswell
23129 } // namespace simdutf
23130 
23131 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
23132 /* begin file src/simdutf/haswell/end.h */
23133 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
23134 // nothing needed.
23135 #else
23136 SIMDUTF_UNTARGET_REGION
23137 #endif
23138 
23139 
23140 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
23141 SIMDUTF_POP_DISABLE_WARNINGS
23142 #endif // end of workaround
23143 /* end file src/simdutf/haswell/end.h */
23144 /* end file src/haswell/implementation.cpp */
23145 #endif
23146 #if SIMDUTF_IMPLEMENTATION_PPC64
23147 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp
23148 /* begin file src/ppc64/implementation.cpp */
23149 
23150 
23151 
23152 
23153 
23154 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
23155 /* begin file src/simdutf/ppc64/begin.h */
23156 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
23157 // #define SIMDUTF_IMPLEMENTATION ppc64
23158 /* end file src/simdutf/ppc64/begin.h */
23159 namespace simdutf {
23160 namespace ppc64 {
23161 namespace {
23162 #ifndef SIMDUTF_PPC64_H
23163 #error "ppc64.h must be included"
23164 #endif
23165 using namespace simd;
23166 
23167 
is_ascii(const simd8x64<uint8_t> & input)23168 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
23169   // careful: 0x80 is not ascii.
23170   return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
23171 }
23172 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)23173 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
23174   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
23175   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
23176   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
23177   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
23178   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
23179 }
23180 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)23181 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
23182   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
23183   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
23184   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
23185   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
23186 }
23187 
23188 } // unnamed namespace
23189 } // namespace ppc64
23190 } // namespace simdutf
23191 
23192 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
23193 /* begin file src/generic/buf_block_reader.h */
23194 namespace simdutf {
23195 namespace ppc64 {
23196 namespace {
23197 
23198 // Walks through a buffer in block-sized increments, loading the last part with spaces
23199 template<size_t STEP_SIZE>
23200 struct buf_block_reader {
23201 public:
23202   simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
23203   simdutf_really_inline size_t block_index();
23204   simdutf_really_inline bool has_full_block() const;
23205   simdutf_really_inline const uint8_t *full_block() const;
23206   /**
23207    * Get the last block, padded with spaces.
23208    *
23209    * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
23210    * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
23211    * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
23212    *
23213    * @return the number of effective characters in the last block.
23214    */
23215   simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
23216   simdutf_really_inline void advance();
23217 private:
23218   const uint8_t *buf;
23219   const size_t len;
23220   const size_t lenminusstep;
23221   size_t idx;
23222 };
23223 
23224 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)23225 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
23226   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
23227   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
23228     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
23229   }
23230   buf[sizeof(simd8x64<uint8_t>)] = '\0';
23231   return buf;
23232 }
23233 
23234 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)23235 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
23236   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
23237   in.store(reinterpret_cast<uint8_t*>(buf));
23238   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
23239     if (buf[i] < ' ') { buf[i] = '_'; }
23240   }
23241   buf[sizeof(simd8x64<uint8_t>)] = '\0';
23242   return buf;
23243 }
23244 
format_mask(uint64_t mask)23245 simdutf_unused static char * format_mask(uint64_t mask) {
23246   static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
23247   for (size_t i=0; i<64; i++) {
23248     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
23249   }
23250   buf[64] = '\0';
23251   return buf;
23252 }
23253 
23254 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)23255 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
23256 
23257 template<size_t STEP_SIZE>
block_index()23258 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
23259 
23260 template<size_t STEP_SIZE>
has_full_block() const23261 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
23262   return idx < lenminusstep;
23263 }
23264 
23265 template<size_t STEP_SIZE>
full_block() const23266 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
23267   return &buf[idx];
23268 }
23269 
23270 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const23271 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
23272   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
23273   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
23274   std::memcpy(dst, buf + idx, len - idx);
23275   return len - idx;
23276 }
23277 
23278 template<size_t STEP_SIZE>
advance()23279 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
23280   idx += STEP_SIZE;
23281 }
23282 
23283 } // unnamed namespace
23284 } // namespace ppc64
23285 } // namespace simdutf
23286 /* end file src/generic/buf_block_reader.h */
23287 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
23288 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
23289 namespace simdutf {
23290 namespace ppc64 {
23291 namespace {
23292 namespace utf8_validation {
23293 
23294 using namespace simd;
23295 
// Classifies every pair of adjacent bytes (prev1 = the byte before, input = the
// current byte) against a set of UTF-8 error patterns.
//
// Three 16-entry lookups are performed: one on the high nibble of prev1, one on
// the low nibble of prev1 and one on the high nibble of input. Each entry is a
// bit set of the error classes that nibble is compatible with, and the three
// results are AND-ed, so a bit survives only when all three nibbles match the
// same pattern. The returned vector holds these surviving bits per byte.
//
// Do not reorder the table entries: entry k of each lookup corresponds to
// nibble value k.
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)23296   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
23297 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
23298 // Bit 1 = Too Long (ASCII followed by continuation)
23299 // Bit 2 = Overlong 3-byte
23300 // Bit 4 = Surrogate
23301 // Bit 5 = Overlong 2-byte
23302 // Bit 7 = Two Continuations
// Bit 3 = Too Large
// Bit 6 = shared by Too Large (second byte 1000____) and Overlong 4-byte
23303     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
23304                                                 // 11______ 11______
23305     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
23306     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
23307     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
23308     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
23309     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
23310     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
23311                                                 // 11110100 101_____
23312                                                 // 11110101 1001____
23313                                                 // 11110101 101_____
23314                                                 // 1111011_ 1001____
23315                                                 // 1111011_ 101_____
23316                                                 // 11111___ 1001____
23317                                                 // 11111___ 101_____
23318     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
23319                                                 // 11110101 1000____
23320                                                 // 1111011_ 1000____
23321                                                 // 11111___ 1000____
23322     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
23323 
// Lookup 1: error classes compatible with the HIGH nibble of the previous byte.
23324     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
23325       // 0_______ ________ <ASCII in byte 1>
23326       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
23327       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
23328       // 10______ ________ <continuation in byte 1>
23329       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
23330       // 1100____ ________ <two byte lead in byte 1>
23331       TOO_SHORT | OVERLONG_2,
23332       // 1101____ ________ <two byte lead in byte 1>
23333       TOO_SHORT,
23334       // 1110____ ________ <three byte lead in byte 1>
23335       TOO_SHORT | OVERLONG_3 | SURROGATE,
23336       // 1111____ ________ <four+ byte lead in byte 1>
23337       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
23338     );
23339     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
// Lookup 2: error classes compatible with the LOW nibble of the previous byte.
// CARRY is present in every entry because those classes do not depend on this nibble.
23340     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
23341       // ____0000 ________
23342       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
23343       // ____0001 ________
23344       CARRY | OVERLONG_2,
23345       // ____001_ ________
23346       CARRY,
23347       CARRY,
23348 
23349       // ____0100 ________
23350       CARRY | TOO_LARGE,
23351       // ____0101 ________
23352       CARRY | TOO_LARGE | TOO_LARGE_1000,
23353       // ____011_ ________
23354       CARRY | TOO_LARGE | TOO_LARGE_1000,
23355       CARRY | TOO_LARGE | TOO_LARGE_1000,
23356 
23357       // ____1___ ________
23358       CARRY | TOO_LARGE | TOO_LARGE_1000,
23359       CARRY | TOO_LARGE | TOO_LARGE_1000,
23360       CARRY | TOO_LARGE | TOO_LARGE_1000,
23361       CARRY | TOO_LARGE | TOO_LARGE_1000,
23362       CARRY | TOO_LARGE | TOO_LARGE_1000,
23363       // ____1101 ________
23364       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
23365       CARRY | TOO_LARGE | TOO_LARGE_1000,
23366       CARRY | TOO_LARGE | TOO_LARGE_1000
23367     );
// Lookup 3: error classes compatible with the HIGH nibble of the current byte.
23368     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
23369       // ________ 0_______ <ASCII in byte 2>
23370       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
23371       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
23372 
23373       // ________ 1000____
23374       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
23375       // ________ 1001____
23376       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
23377       // ________ 101_____
23378       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
23379       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
23380 
23381       // ________ 11______
23382       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
23383     );
// A class bit survives only if all three nibbles are compatible with it.
23384     return (byte_1_high & byte_1_low & byte_2_high);
23385   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)23386   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
23387       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
23388     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
23389     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
23390     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
23391     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
23392     return must23_80 ^ sc;
23393   }
23394 
23395   //
23396   // Return nonzero if there are incomplete multibyte characters at the end of the block:
23397   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
23398   //
is_incomplete(const simd8<uint8_t> input)23399   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
23400     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
23401     // ... 1111____ 111_____ 11______
23402     static const uint8_t max_array[32] = {
23403       255, 255, 255, 255, 255, 255, 255, 255,
23404       255, 255, 255, 255, 255, 255, 255, 255,
23405       255, 255, 255, 255, 255, 255, 255, 255,
23406       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
23407     };
23408     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
23409     return input.gt_bits(max_value);
23410   }
23411 
23412   struct utf8_checker {
23413     // If this is nonzero, there has been a UTF-8 error.
23414     simd8<uint8_t> error;
23415     // The last input we received
23416     simd8<uint8_t> prev_input_block;
23417     // Whether the last input we received was incomplete (used for ASCII fast path)
23418     simd8<uint8_t> prev_incomplete;
23419 
23420     //
23421     // Check whether the current bytes are valid UTF-8.
23422     //
check_utf8_bytessimdutf::ppc64::__anone55652eb3b11::utf8_validation::utf8_checker23423     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
23424       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
23425       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
23426       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
23427       simd8<uint8_t> sc = check_special_cases(input, prev1);
23428       this->error |= check_multibyte_lengths(input, prev_input, sc);
23429     }
23430 
23431     // The only problem that can happen at EOF is that a multibyte character is too short
23432     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
23433     // too large in the first of two bytes.
check_eofsimdutf::ppc64::__anone55652eb3b11::utf8_validation::utf8_checker23434     simdutf_really_inline void check_eof() {
23435       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
23436       // possibly finish them.
23437       this->error |= this->prev_incomplete;
23438     }
23439 
check_next_inputsimdutf::ppc64::__anone55652eb3b11::utf8_validation::utf8_checker23440     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
23441       if(simdutf_likely(is_ascii(input))) {
23442         this->error |= this->prev_incomplete;
23443       } else {
23444         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
23445         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
23446             "We support either two or four chunks per 64-byte block.");
23447         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
23448           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
23449           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
23450         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
23451           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
23452           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
23453           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
23454           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
23455         }
23456         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
23457         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
23458 
23459       }
23460     }
23461 
23462     // do not forget to call check_eof!
errorssimdutf::ppc64::__anone55652eb3b11::utf8_validation::utf8_checker23463     simdutf_really_inline bool errors() const {
23464       return this->error.any_bits_set_anywhere();
23465     }
23466 
23467   }; // struct utf8_checker
23468 } // namespace utf8_validation
23469 
23470 using utf8_validation::utf8_checker;
23471 
23472 } // unnamed namespace
23473 } // namespace ppc64
23474 } // namespace simdutf
23475 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
23476 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
23477 /* begin file src/generic/utf8_validation/utf8_validator.h */
23478 namespace simdutf {
23479 namespace ppc64 {
23480 namespace {
23481 namespace utf8_validation {
23482 
23483 /**
23484  * Validates that the string is actual UTF-8.
23485  */
23486 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)23487 bool generic_validate_utf8(const uint8_t * input, size_t length) {
23488     checker c{};
23489     buf_block_reader<64> reader(input, length);
23490     while (reader.has_full_block()) {
23491       simd::simd8x64<uint8_t> in(reader.full_block());
23492       c.check_next_input(in);
23493       reader.advance();
23494     }
23495     uint8_t block[64]{};
23496     reader.get_remainder(block);
23497     simd::simd8x64<uint8_t> in(block);
23498     c.check_next_input(in);
23499     reader.advance();
23500     c.check_eof();
23501     return !c.errors();
23502 }
23503 
generic_validate_utf8(const char * input,size_t length)23504 bool generic_validate_utf8(const char * input, size_t length) {
23505   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
23506 }
23507 
23508 /**
23509  * Validates that the string is actual UTF-8 and stops on errors.
23510  */
23511 template<class checker>
generic_validate_utf8_with_errors(const uint8_t * input,size_t length)23512 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
23513     checker c{};
23514     buf_block_reader<64> reader(input, length);
23515     size_t count{0};
23516     while (reader.has_full_block()) {
23517       simd::simd8x64<uint8_t> in(reader.full_block());
23518       c.check_next_input(in);
23519       if(c.errors()) {
23520         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
23521         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
23522         res.count += count;
23523         return res;
23524       }
23525       reader.advance();
23526       count += 64;
23527     }
23528     uint8_t block[64]{};
23529     reader.get_remainder(block);
23530     simd::simd8x64<uint8_t> in(block);
23531     c.check_next_input(in);
23532     reader.advance();
23533     c.check_eof();
23534     if (c.errors()) {
23535       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
23536       res.count += count;
23537       return res;
23538     } else {
23539       return result(error_code::SUCCESS, length);
23540     }
23541 }
23542 
generic_validate_utf8_with_errors(const char * input,size_t length)23543 result generic_validate_utf8_with_errors(const char * input, size_t length) {
23544   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
23545 }
23546 
23547 template<class checker>
generic_validate_ascii(const uint8_t * input,size_t length)23548 bool generic_validate_ascii(const uint8_t * input, size_t length) {
23549     buf_block_reader<64> reader(input, length);
23550     uint8_t blocks[64]{};
23551     simd::simd8x64<uint8_t> running_or(blocks);
23552     while (reader.has_full_block()) {
23553       simd::simd8x64<uint8_t> in(reader.full_block());
23554       running_or |= in;
23555       reader.advance();
23556     }
23557     uint8_t block[64]{};
23558     reader.get_remainder(block);
23559     simd::simd8x64<uint8_t> in(block);
23560     running_or |= in;
23561     return running_or.is_ascii();
23562 }
23563 
generic_validate_ascii(const char * input,size_t length)23564 bool generic_validate_ascii(const char * input, size_t length) {
23565   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
23566 }
23567 
23568 template<class checker>
generic_validate_ascii_with_errors(const uint8_t * input,size_t length)23569 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
23570   buf_block_reader<64> reader(input, length);
23571   size_t count{0};
23572   while (reader.has_full_block()) {
23573     simd::simd8x64<uint8_t> in(reader.full_block());
23574     if (!in.is_ascii()) {
23575       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
23576       return result(res.error, count + res.count);
23577     }
23578     reader.advance();
23579 
23580     count += 64;
23581   }
23582   uint8_t block[64]{};
23583   reader.get_remainder(block);
23584   simd::simd8x64<uint8_t> in(block);
23585   if (!in.is_ascii()) {
23586     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
23587     return result(res.error, count + res.count);
23588   } else {
23589     return result(error_code::SUCCESS, length);
23590   }
23591 }
23592 
generic_validate_ascii_with_errors(const char * input,size_t length)23593 result generic_validate_ascii_with_errors(const char * input, size_t length) {
23594   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
23595 }
23596 
23597 } // namespace utf8_validation
23598 } // unnamed namespace
23599 } // namespace ppc64
23600 } // namespace simdutf
23601 /* end file src/generic/utf8_validation/utf8_validator.h */
23602 // transcoding from UTF-8 to UTF-16
23603 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
23604 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
23605 
23606 
23607 namespace simdutf {
23608 namespace ppc64 {
23609 namespace {
23610 namespace utf8_to_utf16 {
23611 
23612 using namespace simd;
23613 
23614 template <endianness endian>
convert_valid(const char * input,size_t size,char16_t * utf16_output)23615 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
23616     char16_t* utf16_output) noexcept {
23617   // The implementation is not specific to haswell and should be moved to the generic directory.
23618   size_t pos = 0;
23619   char16_t* start{utf16_output};
23620   const size_t safety_margin = 16; // to avoid overruns!
23621   while(pos + 64 + safety_margin <= size) {
23622     // this loop could be unrolled further. For example, we could process the mask
23623     // far more than 64 bytes.
23624     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
23625     if(in.is_ascii()) {
23626       in.store_ascii_as_utf16<endian>(utf16_output);
23627       utf16_output += 64;
23628       pos += 64;
23629     } else {
23630       // Slow path. We hope that the compiler will recognize that this is a slow path.
23631       // Anything that is not a continuation mask is a 'leading byte', that is, the
23632       // start of a new code point.
23633       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
23634       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
23635       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
23636       // The *start* of code points is not so useful, rather, we want the *end* of code points.
23637       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
23638       // We process in blocks of up to 12 bytes except possibly
23639       // for fast paths which may process up to 16 bytes. For the
23640       // slow path to work, we should have at least 12 input bytes left.
23641       size_t max_starting_point = (pos + 64) - 12;
23642       // Next loop is going to run at least five times when using solely
23643       // the slow/regular path, and at least four times if there are fast paths.
23644       while(pos < max_starting_point) {
23645         // Performance note: our ability to compute 'consumed' and
23646         // then shift and recompute is critical. If there is a
23647         // latency of, say, 4 cycles on getting 'consumed', then
23648         // the inner loop might have a total latency of about 6 cycles.
23649         // Yet we process between 6 to 12 inputs bytes, thus we get
23650         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
23651         // for this section of the code. Hence, there is a limit
23652         // to how much we can further increase this latency before
23653         // it seriously harms performance.
23654         //
23655         // Thus we may allow convert_masked_utf8_to_utf16 to process
23656         // more bytes at a time under a fast-path mode where 16 bytes
23657         // are consumed at once (e.g., when encountering ASCII).
23658         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
23659                             utf8_end_of_code_point_mask, utf16_output);
23660         pos += consumed;
23661         utf8_end_of_code_point_mask >>= consumed;
23662       }
23663       // At this point there may remain between 0 and 12 bytes in the
23664       // 64-byte block. These bytes will be processed again. So we have an
23665       // 80% efficiency (in the worst case). In practice we expect an
23666       // 85% to 90% efficiency.
23667     }
23668   }
23669   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
23670   return utf16_output - start;
23671 }
23672 
23673 } // namespace utf8_to_utf16
23674 } // unnamed namespace
23675 } // namespace ppc64
23676 } // namespace simdutf
23677 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
23678 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
23679 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
23680 
23681 
23682 namespace simdutf {
23683 namespace ppc64 {
23684 namespace {
23685 namespace utf8_to_utf16 {
23686 using namespace simd;
23687 
23688 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)23689   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
23690 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
23691 // Bit 1 = Too Long (ASCII followed by continuation)
23692 // Bit 2 = Overlong 3-byte
23693 // Bit 4 = Surrogate
23694 // Bit 5 = Overlong 2-byte
23695 // Bit 7 = Two Continuations
23696     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
23697                                                 // 11______ 11______
23698     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
23699     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
23700     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
23701     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
23702     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
23703     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
23704                                                 // 11110100 101_____
23705                                                 // 11110101 1001____
23706                                                 // 11110101 101_____
23707                                                 // 1111011_ 1001____
23708                                                 // 1111011_ 101_____
23709                                                 // 11111___ 1001____
23710                                                 // 11111___ 101_____
23711     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
23712                                                 // 11110101 1000____
23713                                                 // 1111011_ 1000____
23714                                                 // 11111___ 1000____
23715     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
23716 
23717     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
23718       // 0_______ ________ <ASCII in byte 1>
23719       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
23720       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
23721       // 10______ ________ <continuation in byte 1>
23722       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
23723       // 1100____ ________ <two byte lead in byte 1>
23724       TOO_SHORT | OVERLONG_2,
23725       // 1101____ ________ <two byte lead in byte 1>
23726       TOO_SHORT,
23727       // 1110____ ________ <three byte lead in byte 1>
23728       TOO_SHORT | OVERLONG_3 | SURROGATE,
23729       // 1111____ ________ <four+ byte lead in byte 1>
23730       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
23731     );
23732     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
23733     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
23734       // ____0000 ________
23735       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
23736       // ____0001 ________
23737       CARRY | OVERLONG_2,
23738       // ____001_ ________
23739       CARRY,
23740       CARRY,
23741 
23742       // ____0100 ________
23743       CARRY | TOO_LARGE,
23744       // ____0101 ________
23745       CARRY | TOO_LARGE | TOO_LARGE_1000,
23746       // ____011_ ________
23747       CARRY | TOO_LARGE | TOO_LARGE_1000,
23748       CARRY | TOO_LARGE | TOO_LARGE_1000,
23749 
23750       // ____1___ ________
23751       CARRY | TOO_LARGE | TOO_LARGE_1000,
23752       CARRY | TOO_LARGE | TOO_LARGE_1000,
23753       CARRY | TOO_LARGE | TOO_LARGE_1000,
23754       CARRY | TOO_LARGE | TOO_LARGE_1000,
23755       CARRY | TOO_LARGE | TOO_LARGE_1000,
23756       // ____1101 ________
23757       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
23758       CARRY | TOO_LARGE | TOO_LARGE_1000,
23759       CARRY | TOO_LARGE | TOO_LARGE_1000
23760     );
23761     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
23762       // ________ 0_______ <ASCII in byte 2>
23763       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
23764       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
23765 
23766       // ________ 1000____
23767       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
23768       // ________ 1001____
23769       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
23770       // ________ 101_____
23771       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
23772       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
23773 
23774       // ________ 11______
23775       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
23776     );
23777     return (byte_1_high & byte_1_low & byte_2_high);
23778   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)23779   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
23780       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
23781     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
23782     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
23783     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
23784     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
23785     return must23_80 ^ sc;
23786   }
23787 
23788 
23789   struct validating_transcoder {
23790     // If this is nonzero, there has been a UTF-8 error.
23791     simd8<uint8_t> error;
23792 
validating_transcodersimdutf::ppc64::__anone55652eb3e11::utf8_to_utf16::validating_transcoder23793     validating_transcoder() : error(uint8_t(0)) {}
23794     //
23795     // Check whether the current bytes are valid UTF-8.
23796     //
check_utf8_bytessimdutf::ppc64::__anone55652eb3e11::utf8_to_utf16::validating_transcoder23797     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
23798       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
23799       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
23800       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
23801       simd8<uint8_t> sc = check_special_cases(input, prev1);
23802       this->error |= check_multibyte_lengths(input, prev_input, sc);
23803     }
23804 
23805 
23806     template <endianness endian>
convertsimdutf::ppc64::__anone55652eb3e11::utf8_to_utf16::validating_transcoder23807     simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
23808       size_t pos = 0;
23809       char16_t* start{utf16_output};
23810       // In the worst case, we have the haswell kernel which can cause an overflow of
23811       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
23812       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
23813       // much more than 8 bytes. However, you cannot generally assume that you have valid
23814       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
23815       // to give us a good margin.
23816       size_t leading_byte = 0;
23817       size_t margin = size;
23818       for(; margin > 0 && leading_byte < 8; margin--) {
23819         leading_byte += (int8_t(in[margin-1]) > -65);
23820       }
23821       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
23822       const size_t safety_margin = size - margin + 1; // to avoid overruns!
23823       while(pos + 64 + safety_margin <= size) {
23824         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
23825         if(input.is_ascii()) {
23826           input.store_ascii_as_utf16<endian>(utf16_output);
23827           utf16_output += 64;
23828           pos += 64;
23829         } else {
23830           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
23831           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
23832               "We support either two or four chunks per 64-byte block.");
23833           auto zero = simd8<uint8_t>{uint8_t(0)};
23834           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
23835             this->check_utf8_bytes(input.chunks[0], zero);
23836             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
23837           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
23838             this->check_utf8_bytes(input.chunks[0], zero);
23839             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
23840             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
23841             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
23842           }
23843           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
23844           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
23845           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
23846           // We process in blocks of up to 12 bytes except possibly
23847           // for fast paths which may process up to 16 bytes. For the
23848           // slow path to work, we should have at least 12 input bytes left.
23849           size_t max_starting_point = (pos + 64) - 12;
23850           // Next loop is going to run at least five times.
23851           while(pos < max_starting_point) {
23852             // Performance note: our ability to compute 'consumed' and
23853             // then shift and recompute is critical. If there is a
23854             // latency of, say, 4 cycles on getting 'consumed', then
23855             // the inner loop might have a total latency of about 6 cycles.
23856             // Yet we process between 6 to 12 inputs bytes, thus we get
23857             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
23858             // for this section of the code. Hence, there is a limit
23859             // to how much we can further increase this latency before
23860             // it seriously harms performance.
23861             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
23862                             utf8_end_of_code_point_mask, utf16_output);
23863             pos += consumed;
23864             utf8_end_of_code_point_mask >>= consumed;
23865           }
23866           // At this point there may remain between 0 and 12 bytes in the
23867           // 64-byte block. These bytes will be processed again. So we have an
23868           // 80% efficiency (in the worst case). In practice we expect an
23869           // 85% to 90% efficiency.
23870         }
23871       }
23872       if(errors()) { return 0; }
23873       if(pos < size) {
23874         size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
23875         if(howmany == 0) { return 0; }
23876         utf16_output += howmany;
23877       }
23878       return utf16_output - start;
23879     }
23880 
23881     template <endianness endian>
convert_with_errorssimdutf::ppc64::__anone55652eb3e11::utf8_to_utf16::validating_transcoder23882     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
23883       size_t pos = 0;
23884       char16_t* start{utf16_output};
23885       // In the worst case, we have the haswell kernel which can cause an overflow of
23886       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
23887       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
23888       // much more than 8 bytes. However, you cannot generally assume that you have valid
23889       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
23890       // to give us a good margin.
23891       size_t leading_byte = 0;
23892       size_t margin = size;
23893       for(; margin > 0 && leading_byte < 8; margin--) {
23894         leading_byte += (int8_t(in[margin-1]) > -65);
23895       }
23896       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
23897       const size_t safety_margin = size - margin + 1; // to avoid overruns!
23898       while(pos + 64 + safety_margin <= size) {
23899         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
23900         if(input.is_ascii()) {
23901           input.store_ascii_as_utf16<endian>(utf16_output);
23902           utf16_output += 64;
23903           pos += 64;
23904         } else {
23905           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
23906           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
23907               "We support either two or four chunks per 64-byte block.");
23908           auto zero = simd8<uint8_t>{uint8_t(0)};
23909           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
23910             this->check_utf8_bytes(input.chunks[0], zero);
23911             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
23912           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
23913             this->check_utf8_bytes(input.chunks[0], zero);
23914             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
23915             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
23916             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
23917           }
23918           if (errors()) {
23919             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
23920             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
23921             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
23922             res.count += pos;
23923             return res;
23924           }
23925           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
23926           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
23927           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
23928           // We process in blocks of up to 12 bytes except possibly
23929           // for fast paths which may process up to 16 bytes. For the
23930           // slow path to work, we should have at least 12 input bytes left.
23931           size_t max_starting_point = (pos + 64) - 12;
23932           // Next loop is going to run at least five times.
23933           while(pos < max_starting_point) {
23934             // Performance note: our ability to compute 'consumed' and
23935             // then shift and recompute is critical. If there is a
23936             // latency of, say, 4 cycles on getting 'consumed', then
23937             // the inner loop might have a total latency of about 6 cycles.
23938             // Yet we process between 6 to 12 inputs bytes, thus we get
23939             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
23940             // for this section of the code. Hence, there is a limit
23941             // to how much we can further increase this latency before
23942             // it seriously harms performance.
23943             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
23944                             utf8_end_of_code_point_mask, utf16_output);
23945             pos += consumed;
23946             utf8_end_of_code_point_mask >>= consumed;
23947           }
23948           // At this point there may remain between 0 and 12 bytes in the
23949           // 64-byte block. These bytes will be processed again. So we have an
23950           // 80% efficiency (in the worst case). In practice we expect an
23951           // 85% to 90% efficiency.
23952         }
23953       }
23954       if(errors()) {
23955         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
23956         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
23957         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
23958         res.count += pos;
23959         return res;
23960       }
23961       if(pos < size) {
23962         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
23963         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
23964         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
23965         if (res.error) {    // In case of error, we want the error position
23966           res.count += pos;
23967           return res;
23968         } else {    // In case of success, we want the number of word written
23969           utf16_output += res.count;
23970         }
23971       }
23972       return result(error_code::SUCCESS, utf16_output - start);
23973     }
23974 
errorssimdutf::ppc64::__anone55652eb3e11::utf8_to_utf16::validating_transcoder23975     simdutf_really_inline bool errors() const {
23976       return this->error.any_bits_set_anywhere();
23977     }
23978 
23979   }; // struct utf8_checker
23980 } // utf8_to_utf16 namespace
23981 } // unnamed namespace
23982 } // namespace ppc64
23983 } // namespace simdutf
23984 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
23985 // transcoding from UTF-8 to UTF-32
23986 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
23987 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
23988 
23989 namespace simdutf {
23990 namespace ppc64 {
23991 namespace {
23992 namespace utf8_to_utf32 {
23993 
23994 using namespace simd;
23995 
23996 
convert_valid(const char * input,size_t size,char32_t * utf32_output)23997 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
23998     char32_t* utf32_output) noexcept {
23999   size_t pos = 0;
24000   char32_t* start{utf32_output};
24001   const size_t safety_margin = 16; // to avoid overruns!
24002   while(pos + 64 + safety_margin <= size) {
24003     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
24004     if(in.is_ascii()) {
24005       in.store_ascii_as_utf32(utf32_output);
24006       utf32_output += 64;
24007       pos += 64;
24008     } else {
24009     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
24010     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
24011     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
24012     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
24013     size_t max_starting_point = (pos + 64) - 12;
24014     while(pos < max_starting_point) {
24015       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
24016                           utf8_end_of_code_point_mask, utf32_output);
24017       pos += consumed;
24018       utf8_end_of_code_point_mask >>= consumed;
24019       }
24020     }
24021   }
24022   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
24023   return utf32_output - start;
24024 }
24025 
24026 
24027 } // namespace utf8_to_utf32
24028 } // unnamed namespace
24029 } // namespace ppc64
24030 } // namespace simdutf
24031 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
24032 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
24033 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
24034 
24035 
24036 namespace simdutf {
24037 namespace ppc64 {
24038 namespace {
24039 namespace utf8_to_utf32 {
24040 using namespace simd;
24041 
24042 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)24043   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
24044 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
24045 // Bit 1 = Too Long (ASCII followed by continuation)
24046 // Bit 2 = Overlong 3-byte
24047 // Bit 4 = Surrogate
24048 // Bit 5 = Overlong 2-byte
24049 // Bit 7 = Two Continuations
24050     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
24051                                                 // 11______ 11______
24052     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
24053     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
24054     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
24055     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
24056     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
24057     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
24058                                                 // 11110100 101_____
24059                                                 // 11110101 1001____
24060                                                 // 11110101 101_____
24061                                                 // 1111011_ 1001____
24062                                                 // 1111011_ 101_____
24063                                                 // 11111___ 1001____
24064                                                 // 11111___ 101_____
24065     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
24066                                                 // 11110101 1000____
24067                                                 // 1111011_ 1000____
24068                                                 // 11111___ 1000____
24069     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
24070 
24071     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
24072       // 0_______ ________ <ASCII in byte 1>
24073       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
24074       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
24075       // 10______ ________ <continuation in byte 1>
24076       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
24077       // 1100____ ________ <two byte lead in byte 1>
24078       TOO_SHORT | OVERLONG_2,
24079       // 1101____ ________ <two byte lead in byte 1>
24080       TOO_SHORT,
24081       // 1110____ ________ <three byte lead in byte 1>
24082       TOO_SHORT | OVERLONG_3 | SURROGATE,
24083       // 1111____ ________ <four+ byte lead in byte 1>
24084       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
24085     );
24086     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
24087     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
24088       // ____0000 ________
24089       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
24090       // ____0001 ________
24091       CARRY | OVERLONG_2,
24092       // ____001_ ________
24093       CARRY,
24094       CARRY,
24095 
24096       // ____0100 ________
24097       CARRY | TOO_LARGE,
24098       // ____0101 ________
24099       CARRY | TOO_LARGE | TOO_LARGE_1000,
24100       // ____011_ ________
24101       CARRY | TOO_LARGE | TOO_LARGE_1000,
24102       CARRY | TOO_LARGE | TOO_LARGE_1000,
24103 
24104       // ____1___ ________
24105       CARRY | TOO_LARGE | TOO_LARGE_1000,
24106       CARRY | TOO_LARGE | TOO_LARGE_1000,
24107       CARRY | TOO_LARGE | TOO_LARGE_1000,
24108       CARRY | TOO_LARGE | TOO_LARGE_1000,
24109       CARRY | TOO_LARGE | TOO_LARGE_1000,
24110       // ____1101 ________
24111       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
24112       CARRY | TOO_LARGE | TOO_LARGE_1000,
24113       CARRY | TOO_LARGE | TOO_LARGE_1000
24114     );
24115     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
24116       // ________ 0_______ <ASCII in byte 2>
24117       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
24118       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
24119 
24120       // ________ 1000____
24121       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
24122       // ________ 1001____
24123       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
24124       // ________ 101_____
24125       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
24126       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
24127 
24128       // ________ 11______
24129       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
24130     );
24131     return (byte_1_high & byte_1_low & byte_2_high);
24132   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)24133   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
24134       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
24135     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
24136     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
24137     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
24138     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
24139     return must23_80 ^ sc;
24140   }
24141 
24142 
24143   struct validating_transcoder {
24144     // If this is nonzero, there has been a UTF-8 error.
24145     simd8<uint8_t> error;
24146 
validating_transcodersimdutf::ppc64::__anone55652eb4011::utf8_to_utf32::validating_transcoder24147     validating_transcoder() : error(uint8_t(0)) {}
24148     //
24149     // Check whether the current bytes are valid UTF-8.
24150     //
check_utf8_bytessimdutf::ppc64::__anone55652eb4011::utf8_to_utf32::validating_transcoder24151     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
24152       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
24153       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
24154       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
24155       simd8<uint8_t> sc = check_special_cases(input, prev1);
24156       this->error |= check_multibyte_lengths(input, prev_input, sc);
24157     }
24158 
24159 
24160 
convertsimdutf::ppc64::__anone55652eb4011::utf8_to_utf32::validating_transcoder24161     simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
24162       size_t pos = 0;
24163       char32_t* start{utf32_output};
24164       // In the worst case, we have the haswell kernel which can cause an overflow of
24165       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
24166       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
24167       // much more than 8 bytes. However, you cannot generally assume that you have valid
24168       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
24169       // to give us a good margin.
24170       size_t leading_byte = 0;
24171       size_t margin = size;
24172       for(; margin > 0 && leading_byte < 4; margin--) {
24173         leading_byte += (int8_t(in[margin-1]) > -65);
24174       }
24175       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
24176       const size_t safety_margin = size - margin + 1; // to avoid overruns!
24177       while(pos + 64 + safety_margin <= size) {
24178         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
24179         if(input.is_ascii()) {
24180           input.store_ascii_as_utf32(utf32_output);
24181           utf32_output += 64;
24182           pos += 64;
24183         } else {
24184           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
24185           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
24186               "We support either two or four chunks per 64-byte block.");
24187           auto zero = simd8<uint8_t>{uint8_t(0)};
24188           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
24189             this->check_utf8_bytes(input.chunks[0], zero);
24190             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24191           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
24192             this->check_utf8_bytes(input.chunks[0], zero);
24193             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24194             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
24195             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
24196           }
24197           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
24198           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
24199           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
24200           // We process in blocks of up to 12 bytes except possibly
24201           // for fast paths which may process up to 16 bytes. For the
24202           // slow path to work, we should have at least 12 input bytes left.
24203           size_t max_starting_point = (pos + 64) - 12;
24204           // Next loop is going to run at least five times.
24205           while(pos < max_starting_point) {
24206             // Performance note: our ability to compute 'consumed' and
24207             // then shift and recompute is critical. If there is a
24208             // latency of, say, 4 cycles on getting 'consumed', then
24209             // the inner loop might have a total latency of about 6 cycles.
24210             // Yet we process between 6 to 12 inputs bytes, thus we get
24211             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
24212             // for this section of the code. Hence, there is a limit
24213             // to how much we can further increase this latency before
24214             // it seriously harms performance.
24215             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
24216                             utf8_end_of_code_point_mask, utf32_output);
24217             pos += consumed;
24218             utf8_end_of_code_point_mask >>= consumed;
24219           }
24220           // At this point there may remain between 0 and 12 bytes in the
24221           // 64-byte block. These bytes will be processed again. So we have an
24222           // 80% efficiency (in the worst case). In practice we expect an
24223           // 85% to 90% efficiency.
24224         }
24225       }
24226       if(errors()) { return 0; }
24227       if(pos < size) {
24228         size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
24229         if(howmany == 0) { return 0; }
24230         utf32_output += howmany;
24231       }
24232       return utf32_output - start;
24233     }
24234 
convert_with_errorssimdutf::ppc64::__anone55652eb4011::utf8_to_utf32::validating_transcoder24235     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
24236       size_t pos = 0;
24237       char32_t* start{utf32_output};
24238       // In the worst case, we have the haswell kernel which can cause an overflow of
24239       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
24240       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
24241       // much more than 8 bytes. However, you cannot generally assume that you have valid
24242       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
24243       // to give us a good margin.
24244       size_t leading_byte = 0;
24245       size_t margin = size;
24246       for(; margin > 0 && leading_byte < 4; margin--) {
24247         leading_byte += (int8_t(in[margin-1]) > -65);
24248       }
24249       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
24250       const size_t safety_margin = size - margin + 1; // to avoid overruns!
24251       while(pos + 64 + safety_margin <= size) {
24252         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
24253         if(input.is_ascii()) {
24254           input.store_ascii_as_utf32(utf32_output);
24255           utf32_output += 64;
24256           pos += 64;
24257         } else {
24258           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
24259           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
24260               "We support either two or four chunks per 64-byte block.");
24261           auto zero = simd8<uint8_t>{uint8_t(0)};
24262           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
24263             this->check_utf8_bytes(input.chunks[0], zero);
24264             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24265           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
24266             this->check_utf8_bytes(input.chunks[0], zero);
24267             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24268             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
24269             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
24270           }
24271           if (errors()) {
24272             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
24273             res.count += pos;
24274             return res;
24275           }
24276           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
24277           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
24278           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
24279           // We process in blocks of up to 12 bytes except possibly
24280           // for fast paths which may process up to 16 bytes. For the
24281           // slow path to work, we should have at least 12 input bytes left.
24282           size_t max_starting_point = (pos + 64) - 12;
24283           // Next loop is going to run at least five times.
24284           while(pos < max_starting_point) {
24285             // Performance note: our ability to compute 'consumed' and
24286             // then shift and recompute is critical. If there is a
24287             // latency of, say, 4 cycles on getting 'consumed', then
24288             // the inner loop might have a total latency of about 6 cycles.
24289             // Yet we process between 6 to 12 inputs bytes, thus we get
24290             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
24291             // for this section of the code. Hence, there is a limit
24292             // to how much we can further increase this latency before
24293             // it seriously harms performance.
24294             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
24295                             utf8_end_of_code_point_mask, utf32_output);
24296             pos += consumed;
24297             utf8_end_of_code_point_mask >>= consumed;
24298           }
24299           // At this point there may remain between 0 and 12 bytes in the
24300           // 64-byte block. These bytes will be processed again. So we have an
24301           // 80% efficiency (in the worst case). In practice we expect an
24302           // 85% to 90% efficiency.
24303         }
24304       }
24305       if(errors()) {
24306         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
24307         res.count += pos;
24308         return res;
24309       }
24310       if(pos < size) {
24311         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
24312         if (res.error) {    // In case of error, we want the error position
24313           res.count += pos;
24314           return res;
24315         } else {    // In case of success, we want the number of word written
24316           utf32_output += res.count;
24317         }
24318       }
24319       return result(error_code::SUCCESS, utf32_output - start);
24320     }
24321 
errorssimdutf::ppc64::__anone55652eb4011::utf8_to_utf32::validating_transcoder24322     simdutf_really_inline bool errors() const {
24323       return this->error.any_bits_set_anywhere();
24324     }
24325 
24326   }; // struct utf8_checker
24327 } // utf8_to_utf32 namespace
24328 } // unnamed namespace
24329 } // namespace ppc64
24330 } // namespace simdutf
24331 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
24332 // other functions
24333 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h
24334 /* begin file src/generic/utf8.h */
24335 
24336 namespace simdutf {
24337 namespace ppc64 {
24338 namespace {
24339 namespace utf8 {
24340 
24341 using namespace simd;
24342 
count_code_points(const char * in,size_t size)24343 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
24344     size_t pos = 0;
24345     size_t count = 0;
24346     for(;pos + 64 <= size; pos += 64) {
24347       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
24348       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
24349       count += 64 - count_ones(utf8_continuation_mask);
24350     }
24351     return count + scalar::utf8::count_code_points(in + pos, size - pos);
24352 }
24353 
24354 
utf16_length_from_utf8(const char * in,size_t size)24355 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
24356     size_t pos = 0;
24357     size_t count = 0;
24358     // This algorithm could no doubt be improved!
24359     for(;pos + 64 <= size; pos += 64) {
24360       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
24361       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
24362       // We count one word for anything that is not a continuation (so
24363       // leading bytes).
24364       count += 64 - count_ones(utf8_continuation_mask);
24365       int64_t utf8_4byte = input.gteq_unsigned(240);
24366       count += count_ones(utf8_4byte);
24367     }
24368     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
24369 }
24370 
24371 
utf32_length_from_utf8(const char * in,size_t size)24372 simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
24373     return count_code_points(in, size);
24374 }
24375 } // utf8 namespace
24376 } // unnamed namespace
24377 } // namespace ppc64
24378 } // namespace simdutf
24379 /* end file src/generic/utf8.h */
24380 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h
24381 /* begin file src/generic/utf16.h */
24382 namespace simdutf {
24383 namespace ppc64 {
24384 namespace {
24385 namespace utf16 {
24386 
24387 template <endianness big_endian>
count_code_points(const char16_t * in,size_t size)24388 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
24389     size_t pos = 0;
24390     size_t count = 0;
24391     for(;pos + 32 <= size; pos += 32) {
24392       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
24393       if (!match_system(big_endian)) input.swap_bytes();
24394       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
24395       count += count_ones(not_pair) / 2;
24396     }
24397     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
24398 }
24399 
24400 template <endianness big_endian>
utf8_length_from_utf16(const char16_t * in,size_t size)24401 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
24402     size_t pos = 0;
24403     size_t count = 0;
24404     // This algorithm could no doubt be improved!
24405     for(;pos + 32 <= size; pos += 32) {
24406       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
24407       if (!match_system(big_endian)) input.swap_bytes();
24408       uint64_t ascii_mask = input.lteq(0x7F);
24409       uint64_t twobyte_mask = input.lteq(0x7FF);
24410       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
24411 
24412       size_t ascii_count = count_ones(ascii_mask) / 2;
24413       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
24414       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
24415       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
24416       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
24417     }
24418     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
24419 }
24420 
24421 template <endianness big_endian>
utf32_length_from_utf16(const char16_t * in,size_t size)24422 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
24423     return count_code_points<big_endian>(in, size);
24424 }
24425 
change_endianness_utf16(const char16_t * in,size_t size,char16_t * output)24426 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
24427   size_t pos = 0;
24428 
24429   while (pos + 32 <= size) {
24430     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
24431     input.swap_bytes();
24432     input.store(reinterpret_cast<uint16_t *>(output));
24433     pos += 32;
24434     output += 32;
24435   }
24436 
24437   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
24438 }
24439 
24440 } // utf16
24441 } // unnamed namespace
24442 } // namespace ppc64
24443 } // namespace simdutf
24444 /* end file src/generic/utf16.h */
24445 
24446 //
24447 // Implementation-specific overrides
24448 //
24449 namespace simdutf {
24450 namespace ppc64 {
24451 
detect_encodings(const char * input,size_t length) const24452 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
24453   // If there is a BOM, then we trust it.
24454   auto bom_encoding = simdutf::BOM::check_bom(input, length);
24455   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
24456   int out = 0;
24457   if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
24458   if((length % 2) == 0) {
24459     if(validate_utf16(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
24460   }
24461   if((length % 4) == 0) {
24462     if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
24463   }
24464 
24465   return out;
24466 }
24467 
validate_utf8(const char * buf,size_t len) const24468 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
24469   return ppc64::utf8_validation::generic_validate_utf8(buf,len);
24470 }
24471 
validate_utf8_with_errors(const char * buf,size_t len) const24472 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
24473   return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
24474 }
24475 
validate_ascii(const char * buf,size_t len) const24476 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
24477   return ppc64::utf8_validation::generic_validate_ascii(buf,len);
24478 }
24479 
validate_ascii_with_errors(const char * buf,size_t len) const24480 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
24481   return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
24482 }
24483 
validate_utf16le(const char16_t * buf,size_t len) const24484 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
24485   return scalar::utf16::validate<endianness::LITTLE>(buf, len);
24486 }
24487 
validate_utf16be(const char16_t * buf,size_t len) const24488 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
24489   return scalar::utf16::validate<endianness::BIG>(buf, len);
24490 }
24491 
validate_utf16le_with_errors(const char16_t * buf,size_t len) const24492 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
24493   return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
24494 }
24495 
validate_utf16be_with_errors(const char16_t * buf,size_t len) const24496 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
24497   return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
24498 }
24499 
validate_utf32_with_errors(const char32_t * buf,size_t len) const24500 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
24501   return scalar::utf32::validate_with_errors(buf, len);
24502 }
24503 
validate_utf32(const char16_t * buf,size_t len) const24504 simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept {
24505   return scalar::utf32::validate(buf, len);
24506 }
24507 
convert_utf8_to_utf16le(const char *,size_t,char16_t *) const24508 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
24509   return 0; // stub
24510 }
24511 
convert_utf8_to_utf16be(const char *,size_t,char16_t *) const24512 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
24513   return 0; // stub
24514 }
24515 
convert_utf8_to_utf16le_with_errors(const char *,size_t,char16_t *) const24516 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
24517   return result(error_code::OTHER, 0); // stub
24518 }
24519 
convert_utf8_to_utf16be_with_errors(const char *,size_t,char16_t *) const24520 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
24521   return result(error_code::OTHER, 0); // stub
24522 }
24523 
convert_valid_utf8_to_utf16le(const char *,size_t,char16_t *) const24524 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
24525   return 0; // stub
24526 }
24527 
convert_valid_utf8_to_utf16be(const char *,size_t,char16_t *) const24528 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
24529   return 0; // stub
24530 }
24531 
convert_utf8_to_utf32(const char *,size_t,char32_t *) const24532 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
24533   return 0; // stub
24534 }
24535 
convert_utf8_to_utf32_with_errors(const char *,size_t,char32_t *) const24536 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
24537   return result(error_code::OTHER, 0); // stub
24538 }
24539 
convert_valid_utf8_to_utf32(const char *,size_t,char32_t *) const24540 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
24541   return 0; // stub
24542 }
24543 
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const24544 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
24545   return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
24546 }
24547 
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const24548 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
24549   return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
24550 }
24551 
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const24552 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
24553   return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
24554 }
24555 
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const24556 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
24557   return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
24558 }
24559 
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const24560 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
24561   return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
24562 }
24563 
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const24564 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
24565   return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
24566 }
24567 
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const24568 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
24569   return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
24570 }
24571 
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output) const24572 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
24573   return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
24574 }
24575 
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const24576 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
24577   return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
24578 }
24579 
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const24580 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
24581   return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
24582 }
24583 
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const24584 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
24585   return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
24586 }
24587 
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const24588 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
24589   return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
24590 }
24591 
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const24592 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
24593   return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
24594 }
24595 
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const24596 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
24597   return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
24598 }
24599 
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const24600 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
24601   return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
24602 }
24603 
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const24604 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
24605   return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
24606 }
24607 
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const24608 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
24609   return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
24610 }
24611 
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const24612 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
24613   return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
24614 }
24615 
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const24616 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
24617   return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
24618 }
24619 
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const24620 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
24621   return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
24622 }
24623 
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const24624 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
24625   return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
24626 }
24627 
change_endianness_utf16(const char16_t * input,size_t length,char16_t * output) const24628 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
24629   scalar::utf16::change_endianness_utf16(input, length, output);
24630 }
24631 
count_utf16le(const char16_t * input,size_t length) const24632 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
24633   return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
24634 }
24635 
count_utf16be(const char16_t * input,size_t length) const24636 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
24637   return scalar::utf16::count_code_points<endianness::BIG>(input, length);
24638 }
24639 
count_utf8(const char * input,size_t length) const24640 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
24641   return utf8::count_code_points(input, length);
24642 }
24643 
utf8_length_from_utf16le(const char16_t * input,size_t length) const24644 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
24645   return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
24646 }
24647 
utf8_length_from_utf16be(const char16_t * input,size_t length) const24648 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
24649   return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
24650 }
24651 
utf32_length_from_utf16le(const char16_t * input,size_t length) const24652 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
24653   return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
24654 }
24655 
utf32_length_from_utf16be(const char16_t * input,size_t length) const24656 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
24657   return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
24658 }
24659 
utf16_length_from_utf8(const char * input,size_t length) const24660 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
24661   return scalar::utf8::utf16_length_from_utf8(input, length);
24662 }
24663 
utf8_length_from_utf32(const char32_t * input,size_t length) const24664 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
24665   return scalar::utf32::utf8_length_from_utf32(input, length);
24666 }
24667 
utf16_length_from_utf32(const char32_t * input,size_t length) const24668 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
24669   return scalar::utf32::utf16_length_from_utf32(input, length);
24670 }
24671 
utf32_length_from_utf8(const char * input,size_t length) const24672 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
24673   return scalar::utf8::count_code_points(input, length);
24674 }
24675 
24676 } // namespace ppc64
24677 } // namespace simdutf
24678 
24679 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
24680 /* begin file src/simdutf/ppc64/end.h */
24681 /* end file src/simdutf/ppc64/end.h */
24682 /* end file src/ppc64/implementation.cpp */
24683 #endif
24684 #if SIMDUTF_IMPLEMENTATION_WESTMERE
24685 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp
24686 /* begin file src/westmere/implementation.cpp */
24687 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
24688 /* begin file src/simdutf/westmere/begin.h */
24689 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
24690 // #define SIMDUTF_IMPLEMENTATION westmere
24691 
24692 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
24693 // nothing needed.
24694 #else
24695 SIMDUTF_TARGET_WESTMERE
24696 #endif
24697 /* end file src/simdutf/westmere/begin.h */
24698 namespace simdutf {
24699 namespace westmere {
24700 namespace {
24701 #ifndef SIMDUTF_WESTMERE_H
24702 #error "westmere.h must be included"
24703 #endif
24704 using namespace simd;
24705 
is_ascii(const simd8x64<uint8_t> & input)24706 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
24707   return input.reduce_or().is_ascii();
24708 }
24709 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)24710 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
24711   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
24712   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
24713   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
24714   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
24715   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
24716 }
24717 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)24718 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
24719   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
24720   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
24721   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
24722   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
24723 }
24724 
24725 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_detect_encodings.cpp
24726 /* begin file src/westmere/sse_detect_encodings.cpp */
// Guesses which encodings the byte buffer [buf, buf + len) could be valid in.
//
// Template parameter:
//   checker - UTF-8 validator type; it is default-constructed here, fed 64-byte
//             blocks through check_next_input() and queried with errors().
// Returns: a bitwise OR of simdutf::encoding_type::UTF8, UTF16_LE and UTF32_LE
//          for every encoding that was not ruled out (0 when all were ruled out).
//
// Overview: the main loop consumes 64 bytes per iteration while no 16-bit word
// in the range 0xD800..0xDFFF (a surrogate) is seen. The first chunk that does
// contain such a word decides between the UTF-16 and UTF-32 hypotheses, runs a
// dedicated vector scan, and leaves the main loop. The checks after the loop
// finish the job with scalar validation starting at `buf`.
24727 template<class checker>
24728 // len is known to be a multiple of 2 when this is called
sse_detect_encodings(const char * buf,size_t len)24729 int sse_detect_encodings(const char * buf, size_t len) {
24730     const char* start = buf;
24731     const char* end = buf + len;
24732 
24733     bool is_utf8 = true;
24734     bool is_utf16 = true;
24735     bool is_utf32 = true;
24736 
24737     int out = 0;
24738 
    // (high_byte & 0xf8) == 0xd8 matches high bytes 0xd8..0xdf, i.e. words 0xD800..0xDFFF.
24739     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
24740     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
24741 
    // Running per-lane unsigned maximum of every 32-bit word seen so far (UTF-32 range check).
24742     __m128i currentmax = _mm_setzero_si128();
24743 
24744     checker check{};
24745 
24746     while(buf + 64 <= end) {
        // The cast binds tighter than '+', so these load bytes 0..15, 16..31, 32..47 and 48..63.
24747         __m128i in = _mm_loadu_si128((__m128i*)buf);
24748         __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
24749         __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
24750         __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
24751 
24752         const auto u0 = simd16<uint16_t>(in);
24753         const auto u1 = simd16<uint16_t>(secondin);
24754         const auto u2 = simd16<uint16_t>(thirdin);
24755         const auto u3 = simd16<uint16_t>(fourthin);
24756 
        // Keep only the upper byte of every 16-bit word ...
24757         const auto v0 = u0.shr<8>();
24758         const auto v1 = u1.shr<8>();
24759         const auto v2 = u2.shr<8>();
24760         const auto v3 = u3.shr<8>();
24761 
        // ... and compress two registers into one: in16 covers the first 16 words
        // of the chunk (bytes 0..31), nextin16 the last 16 words (bytes 32..63).
24762         const auto in16 = simd16<uint16_t>::pack(v0, v1);
24763         const auto nextin16 = simd16<uint16_t>::pack(v2, v3);
24764 
        // One bit per 16-bit word: set when the word lies in 0xD800..0xDFFF.
24765         const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
24766         const auto surrogates_wordmask1 = (nextin16 & v_f8) == v_d8;
24767         uint16_t surrogates_bitmask0 = static_cast<uint16_t>(surrogates_wordmask0.to_bitmask());
24768         uint16_t surrogates_bitmask1 = static_cast<uint16_t>(surrogates_wordmask1.to_bitmask());
24769 
24770         // Check for surrogates
24771         if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) {
24772             // Cannot be UTF8
24773             is_utf8 = false;
24774             // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
24775             // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
24776             // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
24777             // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
24778             // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words.
24779 
            // 0xaaaa selects the odd-indexed 16-bit words, i.e. the upper half of each 32-bit word.
24780             if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) {
24781                 is_utf32 = false;
24782                 // Code from sse_validate_utf16le.cpp
24783                 // Not efficient, we do not process surrogates_bitmask1
                // `input` is a local cursor; `buf` itself is not advanced in this branch,
                // so the scalar UTF-16 validation after the main loop starts again at `buf`.
24784                 const char16_t * input = reinterpret_cast<const char16_t*>(buf);
24785                 const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
24786 
24787                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
24788                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
24789 
                // Mask names follow the convention used in this file:
                //   V = non-surrogate words, H = words 0xDC00..0xDFFF (second of a pair),
                //   L = words 0xD800..0xDBFF (first of a pair).
24790                 const uint16_t V0 = static_cast<uint16_t>(~surrogates_bitmask0);
24791 
24792                 const auto    vH0 = (in16 & v_fc) == v_dc;
24793                 const uint16_t H0 = static_cast<uint16_t>(vH0.to_bitmask());
24794 
24795                 const uint16_t L0 = static_cast<uint16_t>(~H0 & surrogates_bitmask0);
24796 
                // a0: L words immediately followed by an H word; b0: the H words of those pairs.
24797                 const uint16_t a0 = static_cast<uint16_t>(L0 & (H0 >> 1));
24798 
24799                 const uint16_t b0 = static_cast<uint16_t>(a0 << 1);
24800 
                // c0 has a 1 for every word that is either a non-surrogate or part of a proper pair.
24801                 const uint16_t c0 = static_cast<uint16_t>(V0 | a0 | b0);
24802 
                // 0xffff: all 16 words fine. 0x7fff: only the last word is unresolved, so
                // step 15 words and let the next round re-examine it. Anything else: invalid.
24803                 if (c0 == 0xffff) {
24804                     input += 16;
24805                 } else if (c0 == 0x7fff) {
24806                     input += 15;
24807                 } else {
24808                     is_utf16 = false;
                    // This break leaves the main 64-byte loop.
24809                     break;
24810                 }
24811 
                // Same V/L/H pairing check, 16 words per round, over the rest of the buffer.
24812                 while (input + simd16<uint16_t>::SIZE * 2 < end16) {
24813                     const auto in0 = simd16<uint16_t>(input);
24814                     const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
24815 
24816                     const auto t0 = in0.shr<8>();
24817                     const auto t1 = in1.shr<8>();
24818 
24819                     const auto in_16 = simd16<uint16_t>::pack(t0, t1);
24820 
24821                     const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
24822                     const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
24823                     if (surrogates_bitmask == 0x0) {
24824                         input += 16;
24825                     } else {
24826                         const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
24827 
24828                         const auto    vH = (in_16 & v_fc) == v_dc;
24829                         const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
24830 
24831                         const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
24832 
24833                         const uint16_t a = static_cast<uint16_t>(L & (H >> 1));
24834 
24835                         const uint16_t b = static_cast<uint16_t>(a << 1);
24836 
24837                         const uint16_t c = static_cast<uint16_t>(V | a | b);
24838 
24839                         if (c == 0xffff) {
24840                             input += 16;
24841                         } else if (c == 0x7fff) {
24842                             input += 15;
24843                         } else {
24844                             is_utf16 = false;
                            // Leaves only this inner loop; the unconditional break further
                            // down then leaves the main loop.
24845                             break;
24846                         }
24847                     }
24848                 }
24849             } else {
24850                 is_utf16 = false;
24851                 // Check for UTF-32
24852                 if (len % 4 == 0) {
                    // `input` starts at `buf`, so the inner loop below folds the current
                    // 64-byte chunk a second time; taking a maximum twice is harmless.
24853                     const char32_t * input = reinterpret_cast<const char32_t*>(buf);
24854                     const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
24855 
24856                     // Must start checking for surrogates
                    // Adding 0xffff2000 (32-bit wrap-around) maps 0xD800..0xDFFF onto
                    // 0xfffff800..0xffffffff, i.e. strictly above standardoffsetmax, while
                    // every other value up to 0x10ffff stays at or below it.
24857                     __m128i currentoffsetmax = _mm_setzero_si128();
24858                     const __m128i offset = _mm_set1_epi32(0xffff2000);
24859                     const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
24860 
24861                     currentmax = _mm_max_epu32(in, currentmax);
24862                     currentmax = _mm_max_epu32(secondin, currentmax);
24863                     currentmax = _mm_max_epu32(thirdin, currentmax);
24864                     currentmax = _mm_max_epu32(fourthin, currentmax);
24865 
24866                     currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
24867                     currentoffsetmax = _mm_max_epu32(_mm_add_epi32(secondin, offset), currentoffsetmax);
24868                     currentoffsetmax = _mm_max_epu32(_mm_add_epi32(thirdin, offset), currentoffsetmax);
24869                     currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax);
24870 
24871                     while (input + 4 < end32) {
24872                         const __m128i in32 = _mm_loadu_si128((__m128i *)input);
24873                         currentmax = _mm_max_epu32(in32,currentmax);
24874                         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax);
24875                         input += 4;
24876                     }
24877 
                    // max(x, limit) ^ limit is zero in a lane exactly when x <= limit there;
                    // _mm_testz_si128(v, v) returns 1 only when v is all zeros.
24878                     __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
24879                     if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
24880                         is_utf32 = false;
24881                     }
24882                 } else {
24883                     is_utf32 = false;
24884                 }
24885             }
            // A chunk with surrogates always ends the main loop; `buf` still points at that chunk.
24886             break;
24887         }
24888         // If no surrogate, validate under other encodings as well
24889 
24890         // UTF-32 validation
24891         currentmax = _mm_max_epu32(in, currentmax);
24892         currentmax = _mm_max_epu32(secondin, currentmax);
24893         currentmax = _mm_max_epu32(thirdin, currentmax);
24894         currentmax = _mm_max_epu32(fourthin, currentmax);
24895 
24896         // UTF-8 validation
24897         // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
24898         simd::simd8x64<uint8_t> in8(in, secondin, thirdin, fourthin);
24899         check.check_next_input(in8);
24900 
24901         buf += 64;
24902     }
24903 
24904     // Check which encodings are possible
24905 
24906     if (is_utf8) {
        // is_utf8 is still true only if the main loop ended without the surrogate
        // break, so fewer than 64 bytes remain. They are copied into a block that is
        // pre-filled with 0x20 (ASCII space) and given to the checker as a last block.
24907         if (static_cast<size_t>(buf - start) != len) {
24908             uint8_t block[64]{};
24909             std::memset(block, 0x20, 64);
24910             std::memcpy(block, buf, len - (buf - start));
24911             simd::simd8x64<uint8_t> in(block);
24912             check.check_next_input(in);
24913         }
24914         if (!check.errors()) {
24915             out |= simdutf::encoding_type::UTF8;
24916         }
24917     }
24918 
    // Scalar UTF-16LE validation of everything from `buf` to the end of the input.
24919     if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
24920         out |= simdutf::encoding_type::UTF16_LE;
24921     }
24922 
    // UTF-32 needs a length that is a multiple of 4, no 32-bit word above 0x10ffff in
    // anything folded into currentmax, and a passing scalar validation from `buf` onward.
24923     if (is_utf32 && (len % 4 == 0)) {
24924         const __m128i standardmax = _mm_set1_epi32(0x10ffff);
24925         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
24926         if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
24927             out |= simdutf::encoding_type::UTF32_LE;
24928         }
24929     }
24930 
24931     return out;
24932 }
24933 /* end file src/westmere/sse_detect_encodings.cpp */
24934 
24935 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16.cpp
24936 /* begin file src/westmere/sse_validate_utf16.cpp */
24937 /*
24938     In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
24939 
24940     In a vectorized algorithm we want to examine the most significant
24941     nibble in order to select a fast path. If none of the highest nibbles
24942     is 0xD (13), then we are sure that the UTF-16 chunk in a vector
24943     register is valid.
24944 
24945     Let us analyze what we need to check if the nibble is 0xD. The
24946     value of the preceding nibble determines what we have:
24947 
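24948     0xd000 .. 0xd7ff - a valid word
24949     0xd800 .. 0xdbff - low surrogate (first word of a pair; the Unicode standard calls this range the high, or leading, surrogates)
24950     0xdc00 .. 0xdfff - high surrogate (second word of a pair; the Unicode standard calls this range the low, or trailing, surrogates)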
24951 
24952     Other constraints we have to consider:
24953     - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
24954     - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
24955     - there must not be sole low surrogate nor high surrogate
24956 
24957     We're going to build three bitmasks based on the 3rd nibble:
24958     - V = valid word,
24959     - L = low surrogate (0xd800 .. 0xdbff)
24960     - H = high surrogate (0xdc00 .. 0xdfff)
24961 
24962       0   1   2   3   4   5   6   7    <--- word index
24963     [ V | L | H | L | H | V | V | L ]
24964       1   0   0   0   0   1   1   0     - V = valid masks
24965       0   1   0   1   0   0   0   1     - L = low surrogate
24966       0   0   1   0   1   0   0   0     - H = high surrogate
24967 
24968 
24969       1   0   0   0   0   1   1   0   V = valid masks
24970       0   1   0   1   0   0   0   0   a = L & (H >> 1)
24971       0   0   1   0   1   0   0   0   b = a << 1
24972       1   1   1   1   1   1   1   0   c = V | a | b
24973                                   ^
24974                                   the last bit can be zero, we just consume 7 words
24975                                   and recheck this word in the next iteration
24976 */
24977 
24978 /* Returns:
24979    - pointer to the last unprocessed character (a scalar fallback should check the rest);
24980    - nullptr if an error was detected.
24981 */
24982 template <endianness big_endian>
sse_validate_utf16(const char16_t * input,size_t size)24983 const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
24984     const char16_t* end = input + size;
24985 
24986     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
24987     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
24988     const auto v_fc = simd8<uint8_t>::splat(0xfc);
24989     const auto v_dc = simd8<uint8_t>::splat(0xdc);
24990 
24991     while (input + simd16<uint16_t>::SIZE * 2 < end) {
24992         // 0. Load data: since the validation takes into account only higher
24993         //    byte of each word, we compress the two vectors into one which
24994         //    consists only the higher bytes.
24995         auto in0 = simd16<uint16_t>(input);
24996         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
24997         if (big_endian) {
24998             in0 = in0.swap_bytes();
24999             in1 = in1.swap_bytes();
25000         }
25001 
25002         const auto t0 = in0.shr<8>();
25003         const auto t1 = in1.shr<8>();
25004 
25005         const auto in = simd16<uint16_t>::pack(t0, t1);
25006 
25007         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
25008         const auto surrogates_wordmask = (in & v_f8) == v_d8;
25009         const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
25010         if (surrogates_bitmask == 0x0000) {
25011             input += 16;
25012         } else {
25013             // 2. We have some surrogates that have to be distinguished:
25014             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
25015             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
25016             //
25017             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
25018 
25019             // V - non-surrogate words
25020             //     V = not surrogates_wordmask
25021             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
25022 
25023             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
25024             const auto    vH = (in & v_fc) == v_dc;
25025             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
25026 
25027             // L - word mask for low surrogates
25028             //     L = not H and surrogates_wordmask
25029             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
25030 
25031             const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
25032                                               // (A low surrogate placed in the 7th register's word
25033                                               // is an exception we handle.)
25034             const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
25035                                               // thanks to that we have only two masks for valid case.
25036             const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
25037 
25038             if (c == 0xffff) {
25039                 // The whole input register contains valid UTF-16, i.e.,
25040                 // either single words or proper surrogate pairs.
25041                 input += 16;
25042             } else if (c == 0x7fff) {
25043                 // The 15 lower words of the input register contains valid UTF-16.
25044                 // The 15th word may be either a low or high surrogate. It the next
25045                 // iteration we 1) check if the low surrogate is followed by a high
25046                 // one, 2) reject sole high surrogate.
25047                 input += 15;
25048             } else {
25049                 return nullptr;
25050             }
25051         }
25052     }
25053 
25054     return input;
25055 }
25056 
25057 
25058 template <endianness big_endian>
sse_validate_utf16_with_errors(const char16_t * input,size_t size)25059 const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
25060     const char16_t* start = input;
25061     const char16_t* end = input + size;
25062 
25063     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
25064     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
25065     const auto v_fc = simd8<uint8_t>::splat(0xfc);
25066     const auto v_dc = simd8<uint8_t>::splat(0xdc);
25067 
25068     while (input + simd16<uint16_t>::SIZE * 2 < end) {
25069         // 0. Load data: since the validation takes into account only higher
25070         //    byte of each word, we compress the two vectors into one which
25071         //    consists only the higher bytes.
25072         auto in0 = simd16<uint16_t>(input);
25073         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
25074 
25075         if (big_endian) {
25076             in0 = in0.swap_bytes();
25077             in1 = in1.swap_bytes();
25078         }
25079 
25080         const auto t0 = in0.shr<8>();
25081         const auto t1 = in1.shr<8>();
25082 
25083         const auto in = simd16<uint16_t>::pack(t0, t1);
25084 
25085         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
25086         const auto surrogates_wordmask = (in & v_f8) == v_d8;
25087         const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
25088         if (surrogates_bitmask == 0x0000) {
25089             input += 16;
25090         } else {
25091             // 2. We have some surrogates that have to be distinguished:
25092             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
25093             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
25094             //
25095             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
25096 
25097             // V - non-surrogate words
25098             //     V = not surrogates_wordmask
25099             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
25100 
25101             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
25102             const auto    vH = (in & v_fc) == v_dc;
25103             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
25104 
25105             // L - word mask for low surrogates
25106             //     L = not H and surrogates_wordmask
25107             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
25108 
25109             const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
25110                                               // (A low surrogate placed in the 7th register's word
25111                                               // is an exception we handle.)
25112             const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
25113                                               // thanks to that we have only two masks for valid case.
25114             const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
25115 
25116             if (c == 0xffff) {
25117                 // The whole input register contains valid UTF-16, i.e.,
25118                 // either single words or proper surrogate pairs.
25119                 input += 16;
25120             } else if (c == 0x7fff) {
25121                 // The 15 lower words of the input register contains valid UTF-16.
25122                 // The 15th word may be either a low or high surrogate. It the next
25123                 // iteration we 1) check if the low surrogate is followed by a high
25124                 // one, 2) reject sole high surrogate.
25125                 input += 15;
25126             } else {
25127                 return result(error_code::SURROGATE, input - start);
25128             }
25129         }
25130     }
25131 
25132     return result(error_code::SUCCESS, input - start);
25133 }
25134 /* end file src/westmere/sse_validate_utf16.cpp */
25135 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf32le.cpp
25136 /* begin file src/westmere/sse_validate_utf32le.cpp */
25137 /* Returns:
25138    - pointer to the last unprocessed character (a scalar fallback should check the rest);
25139    - nullptr if an error was detected.
25140 */
sse_validate_utf32le(const char32_t * input,size_t size)25141 const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
25142     const char32_t* end = input + size;
25143 
25144     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
25145     const __m128i offset = _mm_set1_epi32(0xffff2000);
25146     const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
25147     __m128i currentmax = _mm_setzero_si128();
25148     __m128i currentoffsetmax = _mm_setzero_si128();
25149 
25150     while (input + 4 < end) {
25151         const __m128i in = _mm_loadu_si128((__m128i *)input);
25152         currentmax = _mm_max_epu32(in,currentmax);
25153         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
25154         input += 4;
25155     }
25156     __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
25157     if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
25158         return nullptr;
25159     }
25160 
25161     is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
25162     if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
25163         return nullptr;
25164     }
25165 
25166     return input;
25167 }
25168 
25169 
sse_validate_utf32le_with_errors(const char32_t * input,size_t size)25170 const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
25171     const char32_t* start = input;
25172     const char32_t* end = input + size;
25173 
25174     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
25175     const __m128i offset = _mm_set1_epi32(0xffff2000);
25176     const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
25177     __m128i currentmax = _mm_setzero_si128();
25178     __m128i currentoffsetmax = _mm_setzero_si128();
25179 
25180     while (input + 4 < end) {
25181         const __m128i in = _mm_loadu_si128((__m128i *)input);
25182         currentmax = _mm_max_epu32(in,currentmax);
25183         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
25184 
25185         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
25186         if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
25187             return result(error_code::TOO_LARGE, input - start);
25188         }
25189 
25190         is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
25191         if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
25192             return result(error_code::SURROGATE, input - start);
25193         }
25194         input += 4;
25195     }
25196 
25197     return result(error_code::SUCCESS, input - start);
25198 }
25199 /* end file src/westmere/sse_validate_utf32le.cpp */
25200 
25201 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp
25202 /* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
25203 // depends on "tables/utf8_to_utf16_tables.h"
25204 
25205 
25206 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
25207 // end of the code points. Only the least significant 12 bits of the mask
25208 // are accessed.
25209 // It returns how many bytes were consumed (up to 12).
25210 template <endianness big_endian>
convert_masked_utf8_to_utf16(const char * input,uint64_t utf8_end_of_code_point_mask,char16_t * & utf16_output)25211 size_t convert_masked_utf8_to_utf16(const char *input,
25212                            uint64_t utf8_end_of_code_point_mask,
25213                            char16_t *&utf16_output) {
25214   // we use an approach where we try to process up to 12 input bytes.
25215   // Why 12 input bytes and not 16? Because we are concerned with the size of
25216   // the lookup tables. Also 12 is nicely divisible by two and three.
25217   //
25218   //
25219   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
25220   // beneficial to have fast paths that depend on branch prediction but have less latency.
25221   // This results in more instructions but, potentially, also higher speeds.
25222   //
25223   // We first try a few fast paths.
25224   const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
25225   const __m128i in = _mm_loadu_si128((__m128i *)input);
25226   const uint16_t input_utf8_end_of_code_point_mask =
25227       utf8_end_of_code_point_mask & 0xfff;
25228   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
25229     // We process the data in chunks of 16 bytes.
25230     __m128i ascii_first = _mm_cvtepu8_epi16(in);
25231     __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
25232     if (big_endian) {
25233       ascii_first = _mm_shuffle_epi8(ascii_first, swap);
25234       ascii_second = _mm_shuffle_epi8(ascii_second, swap);
25235     }
25236     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
25237     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
25238     utf16_output += 16; // We wrote 16 16-bit characters.
25239     return 16; // We consumed 16 bytes.
25240   }
25241   if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
25242     // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
25243     // There is probably a more efficient sequence, but the following might do.
25244     const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
25245     const __m128i perm = _mm_shuffle_epi8(in, sh);
25246     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
25247     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
25248     __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
25249     if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
25250     _mm_storeu_si128((__m128i *)utf16_output, composed);
25251     utf16_output += 8; // We wrote 16 bytes, 8 code points.
25252     return 16;
25253   }
25254   if(input_utf8_end_of_code_point_mask == 0x924) {
25255     // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
25256     // There is probably a more efficient sequence, but the following might do.
25257     const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
25258     const __m128i perm = _mm_shuffle_epi8(in, sh);
25259     const __m128i ascii =
25260         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
25261     const __m128i middlebyte =
25262         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
25263     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
25264     const __m128i highbyte =
25265         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
25266     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
25267     const __m128i composed =
25268         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
25269     __m128i composed_repacked = _mm_packus_epi32(composed, composed);
25270     if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
25271     _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
25272     utf16_output += 4;
25273     return 12;
25274   }
25275   /// We do not have a fast path available, so we fallback.
25276 
25277   const uint8_t idx =
25278       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
25279   const uint8_t consumed =
25280       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
25281   if (idx < 64) {
25282     // SIX (6) input code-words
25283     // this is a relatively easy scenario
25284     // we process SIX (6) input code-words. The max length in bytes of six code
25285     // words spanning between 1 and 2 bytes each is 12 bytes. On processors
25286     // where pdep/pext is fast, we might be able to use a small lookup table.
25287     const __m128i sh =
25288         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
25289     const __m128i perm = _mm_shuffle_epi8(in, sh);
25290     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
25291     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
25292     __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
25293     if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
25294     _mm_storeu_si128((__m128i *)utf16_output, composed);
25295     utf16_output += 6; // We wrote 12 bytes, 6 code points.
25296   } else if (idx < 145) {
25297     // FOUR (4) input code-words
25298     const __m128i sh =
25299         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
25300     const __m128i perm = _mm_shuffle_epi8(in, sh);
25301     const __m128i ascii =
25302         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
25303     const __m128i middlebyte =
25304         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
25305     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
25306     const __m128i highbyte =
25307         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
25308     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
25309     const __m128i composed =
25310         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
25311      __m128i composed_repacked = _mm_packus_epi32(composed, composed);
25312     if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
25313     _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
25314     utf16_output += 4;
25315   } else if (idx < 209) {
25316     // TWO (2) input code-words
25317     //////////////
25318     // There might be garbage inputs where a leading byte mascarades as a four-byte
25319     // leading byte (by being followed by 3 continuation byte), but is not greater than
25320     // 0xf0. This could trigger a buffer overflow if we only counted leading
25321     // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
25322     // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
25323     // We do as at the cost of an extra mask.
25324     /////////////
25325     const __m128i sh =
25326         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
25327     const __m128i perm = _mm_shuffle_epi8(in, sh);
25328     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
25329     const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
25330     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
25331     __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
25332     // correct for spurious high bit
25333     const __m128i correct =
25334         _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
25335     middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
25336     const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
25337     // We deliberately carry the leading four bits in highbyte if they are present,
25338     // we remove them later when computing hightenbits.
25339     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
25340     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
25341     // When we need to generate a surrogate pair (leading byte > 0xF0), then
25342     // the corresponding 32-bit value in 'composed'  will be greater than
25343     // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
25344     // location of the surrogate pairs.
25345     const __m128i composed =
25346         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
25347                      _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
25348     const __m128i composedminus =
25349         _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
25350     const __m128i lowtenbits =
25351         _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
25352     // Notice the 0x3ff mask:
25353     const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
25354     const __m128i lowtenbitsadd =
25355         _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
25356     const __m128i hightenbitsadd =
25357         _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
25358     const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
25359     __m128i surrogates =
25360         _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
25361     uint32_t basic_buffer[4];
25362     uint32_t basic_buffer_swap[4];
25363     if (big_endian) {
25364       _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
25365       surrogates = _mm_shuffle_epi8(surrogates, swap);
25366     }
25367     _mm_storeu_si128((__m128i *)basic_buffer, composed);
25368     uint32_t surrogate_buffer[4];
25369     _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
25370     for (size_t i = 0; i < 3; i++) {
25371       if(basic_buffer[i] > 0x3c00000) {
25372         utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
25373         utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
25374         utf16_output += 2;
25375       } else {
25376         utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
25377         utf16_output++;
25378       }
25379     }
25380   } else {
25381     // here we know that there is an error but we do not handle errors
25382   }
25383   return consumed;
25384 }
25385 /* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
25386 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp
25387 /* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
25388 // depends on "tables/utf8_to_utf16_tables.h"
25389 
25390 
25391 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
25392 // end of the code points. Only the least significant 12 bits of the mask
25393 // are accessed.
25394 // It returns how many bytes were consumed (up to 12).
convert_masked_utf8_to_utf32(const char * input,uint64_t utf8_end_of_code_point_mask,char32_t * & utf32_output)25395 size_t convert_masked_utf8_to_utf32(const char *input,
25396                            uint64_t utf8_end_of_code_point_mask,
25397                            char32_t *&utf32_output) {
25398   // we use an approach where we try to process up to 12 input bytes.
25399   // Why 12 input bytes and not 16? Because we are concerned with the size of
25400   // the lookup tables. Also 12 is nicely divisible by two and three.
25401   //
25402   //
25403   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
25404   // beneficial to have fast paths that depend on branch prediction but have less latency.
25405   // This results in more instructions but, potentially, also higher speeds.
25406   //
25407   // We first try a few fast paths.
25408   const __m128i in = _mm_loadu_si128((__m128i *)input);
25409   const uint16_t input_utf8_end_of_code_point_mask =
25410       utf8_end_of_code_point_mask & 0xfff;
25411   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
25412     // We process the data in chunks of 16 bytes.
25413     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
25414     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
25415     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
25416     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
25417     utf32_output += 16; // We wrote 16 32-bit characters.
25418     return 16; // We consumed 16 bytes.
25419   }
25420   if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
25421     // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
25422     // There is probably a more efficient sequence, but the following might do.
25423     const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
25424     const __m128i perm = _mm_shuffle_epi8(in, sh);
25425     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
25426     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
25427     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
25428     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
25429     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
25430     utf32_output += 8; // We wrote 32 bytes, 8 code points.
25431     return 16;
25432   }
25433   if(input_utf8_end_of_code_point_mask == 0x924) {
25434     // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
25435     // There is probably a more efficient sequence, but the following might do.
25436     const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
25437     const __m128i perm = _mm_shuffle_epi8(in, sh);
25438     const __m128i ascii =
25439         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
25440     const __m128i middlebyte =
25441         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
25442     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
25443     const __m128i highbyte =
25444         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
25445     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
25446     const __m128i composed =
25447         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
25448     _mm_storeu_si128((__m128i *)utf32_output, composed);
25449     utf32_output += 4;
25450     return 12;
25451   }
25452   /// We do not have a fast path available, so we fallback.
25453 
25454   const uint8_t idx =
25455       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
25456   const uint8_t consumed =
25457       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
25458   if (idx < 64) {
25459     // SIX (6) input code-words
25460     // this is a relatively easy scenario
25461     // we process SIX (6) input code-words. The max length in bytes of six code
25462     // words spanning between 1 and 2 bytes each is 12 bytes. On processors
25463     // where pdep/pext is fast, we might be able to use a small lookup table.
25464     const __m128i sh =
25465         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
25466     const __m128i perm = _mm_shuffle_epi8(in, sh);
25467     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
25468     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
25469     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
25470     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
25471     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
25472     utf32_output += 6; // We wrote 12 bytes, 6 code points.
25473   } else if (idx < 145) {
25474     // FOUR (4) input code-words
25475     const __m128i sh =
25476         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
25477     const __m128i perm = _mm_shuffle_epi8(in, sh);
25478     const __m128i ascii =
25479         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
25480     const __m128i middlebyte =
25481         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
25482     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
25483     const __m128i highbyte =
25484         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
25485     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
25486     const __m128i composed =
25487         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
25488     _mm_storeu_si128((__m128i *)utf32_output, composed);
25489     utf32_output += 4;
25490   } else if (idx < 209) {
25491     // TWO (2) input code-words
25492     const __m128i sh =
25493         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
25494     const __m128i perm = _mm_shuffle_epi8(in, sh);
25495     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
25496     const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
25497     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
25498     __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
25499     // correct for spurious high bit
25500     const __m128i correct =
25501         _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
25502     middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
25503     const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
25504     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
25505     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
25506     const __m128i composed =
25507         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
25508                      _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
25509     _mm_storeu_si128((__m128i *)utf32_output, composed);
25510     utf32_output += 3;
25511   } else {
25512     // here we know that there is an error but we do not handle errors
25513   }
25514   return consumed;
25515 }
25516 /* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
25517 
25518 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp
25519 /* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
/*
    The vectorized algorithm works on a single SSE register, i.e., it
    loads eight 16-bit words.

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. codepoints
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit word
    can be converted into: 1) a single UTF8 byte (when it's an ASCII
    char) or 2) two UTF8 bytes.

    For this case we do only some shuffle to obtain these 2-byte
    codes and finally compress the whole SSE register with a single
    shuffle.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit words, but are above 0x07ff, then
    a single word may produce one, two or three UTF8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains the lower two UTF8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF8-bytes case.

    Finally these two registers are interleaved, forming an eight-element
    array of 32-bit values. The array spans two SSE registers.
    The bytes from the registers are compressed using two shuffles.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 3.

    Registers containing surrogates are not vectorized:
    sse_convert_utf16_to_utf8 falls back to a scalar loop for them.


    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.
*/
25568 
25569 /*
25570   Returns a pair: the first unprocessed byte from buf and utf8_output
25571   A scalar routing should carry on the conversion of the tail.
25572 */
25573 template <endianness big_endian>
sse_convert_utf16_to_utf8(const char16_t * buf,size_t len,char * utf8_output)25574 std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
25575 
25576   const char16_t* end = buf + len;
25577 
25578   const __m128i v_0000 = _mm_setzero_si128();
25579   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
25580   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
25581   const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
25582   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
25583 
25584   while (buf + 16 + safety_margin <= end) {
25585     __m128i in = _mm_loadu_si128((__m128i*)buf);
25586     if (big_endian) {
25587       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
25588       in = _mm_shuffle_epi8(in, swap);
25589     }
25590     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
25591     const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
25592     if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
25593         __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
25594         if (big_endian) {
25595           const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
25596           nextin = _mm_shuffle_epi8(nextin, swap);
25597         }
25598         if(!_mm_testz_si128(nextin, v_ff80)) {
25599           // 1. pack the bytes
25600           // obviously suboptimal.
25601           const __m128i utf8_packed = _mm_packus_epi16(in,in);
25602           // 2. store (16 bytes)
25603           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
25604           // 3. adjust pointers
25605           buf += 8;
25606           utf8_output += 8;
25607           in = nextin;
25608         } else {
25609           // 1. pack the bytes
25610           // obviously suboptimal.
25611           const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
25612           // 2. store (16 bytes)
25613           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
25614           // 3. adjust pointers
25615           buf += 16;
25616           utf8_output += 16;
25617           continue; // we are done for this round!
25618         }
25619     }
25620 
25621     // no bits set above 7th bit
25622     const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
25623     const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
25624 
25625     // no bits set above 11th bit
25626     const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
25627     const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
25628 
25629     if (one_or_two_bytes_bitmask == 0xffff) {
25630           // 1. prepare 2-byte values
25631           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
25632           // expected output   : [110a|aaaa|10bb|bbbb] x 8
25633           const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
25634           const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
25635 
25636           // t0 = [000a|aaaa|bbbb|bb00]
25637           const __m128i t0 = _mm_slli_epi16(in, 2);
25638           // t1 = [000a|aaaa|0000|0000]
25639           const __m128i t1 = _mm_and_si128(t0, v_1f00);
25640           // t2 = [0000|0000|00bb|bbbb]
25641           const __m128i t2 = _mm_and_si128(in, v_003f);
25642           // t3 = [000a|aaaa|00bb|bbbb]
25643           const __m128i t3 = _mm_or_si128(t1, t2);
25644           // t4 = [110a|aaaa|10bb|bbbb]
25645           const __m128i t4 = _mm_or_si128(t3, v_c080);
25646 
25647           // 2. merge ASCII and 2-byte codewords
25648           const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
25649 
25650           // 3. prepare bitmask for 8-bit lookup
25651           //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
25652           const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
25653           const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
25654           const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
25655           // 4. pack the bytes
25656           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
25657           const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
25658           const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
25659 
25660           // 5. store bytes
25661           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
25662 
25663           // 6. adjust pointers
25664           buf += 8;
25665           utf8_output += row[0];
25666           continue;
25667 
25668     }
25669 
25670     // 1. Check if there are any surrogate word in the input chunk.
25671     //    We have also deal with situation when there is a surrogate word
25672     //    at the end of a chunk.
25673     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
25674 
25675     // bitmask = 0x0000 if there are no surrogates
25676     //         = 0xc000 if the last word is a surrogate
25677     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
25678     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
25679     // it is likely an uncommon occurrence.
25680     if (surrogates_bitmask == 0x0000) {
25681       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
25682         const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
25683                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
25684 
25685         /* In this branch we handle three cases:
25686            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
25687            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
25688            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
25689 
25690           We expand the input word (16-bit) into two words (32-bit), thus
25691           we have room for four bytes. However, we need five distinct bit
25692           layouts. Note that the last byte in cases #2 and #3 is the same.
25693 
25694           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
25695           in register t2.
25696 
25697           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
25698           either byte 1 for case #2 or byte 2 for case #3. Note that they
25699           differ by exactly one bit.
25700 
25701           Finally from these two words we build proper UTF-8 sequence, taking
25702           into account the case (i.e, the number of bytes to write).
25703         */
25704         /**
25705          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
25706          * t2 => [0ccc|cccc] [10cc|cccc]
25707          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
25708          */
25709 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
25710         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
25711         const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
25712         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
25713         const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
25714         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
25715         const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
25716 
25717         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
25718         const __m128i s0 = _mm_srli_epi16(in, 4);
25719         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
25720         const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
25721         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
25722         const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
25723         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
25724         const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
25725         const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
25726         const __m128i s4 = _mm_xor_si128(s3, m0);
25727 #undef simdutf_vec
25728 
25729         // 4. expand words 16-bit => 32-bit
25730         const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
25731         const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
25732 
25733         // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
25734         const uint16_t mask = (one_byte_bitmask & 0x5555) |
25735                               (one_or_two_bytes_bitmask & 0xaaaa);
25736         if(mask == 0) {
25737           // We only have three-byte words. Use fast path.
25738           const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
25739           const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
25740           const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
25741           _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
25742           utf8_output += 12;
25743           _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
25744           utf8_output += 12;
25745           buf += 8;
25746           continue;
25747         }
25748         const uint8_t mask0 = uint8_t(mask);
25749 
25750         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
25751         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
25752         const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
25753 
25754         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
25755 
25756         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
25757         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
25758         const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
25759 
25760         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
25761         utf8_output += row0[0];
25762         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
25763         utf8_output += row1[0];
25764 
25765         buf += 8;
25766     // surrogate pair(s) in a register
25767     } else {
25768       // Let us do a scalar fallback.
25769       // It may seem wasteful to use scalar code, but being efficient with SIMD
25770       // in the presence of surrogate pairs may require non-trivial tables.
25771       size_t forward = 15;
25772       size_t k = 0;
25773       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
25774       for(; k < forward; k++) {
25775         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
25776         if((word & 0xFF80)==0) {
25777           *utf8_output++ = char(word);
25778         } else if((word & 0xF800)==0) {
25779           *utf8_output++ = char((word>>6) | 0b11000000);
25780           *utf8_output++ = char((word & 0b111111) | 0b10000000);
25781         } else if((word &0xF800 ) != 0xD800) {
25782           *utf8_output++ = char((word>>12) | 0b11100000);
25783           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
25784           *utf8_output++ = char((word & 0b111111) | 0b10000000);
25785         } else {
25786           // must be a surrogate pair
25787           uint16_t diff = uint16_t(word - 0xD800);
25788           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
25789           k++;
25790           uint16_t diff2 = uint16_t(next_word - 0xDC00);
25791           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
25792           uint32_t value = (diff << 10) + diff2 + 0x10000;
25793           *utf8_output++ = char((value>>18) | 0b11110000);
25794           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
25795           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
25796           *utf8_output++ = char((value & 0b111111) | 0b10000000);
25797         }
25798       }
25799       buf += k;
25800     }
25801   } // while
25802 
25803   return std::make_pair(buf, utf8_output);
25804 }
25805 
25806 
25807 /*
25808   Returns a pair: a result struct and utf8_output.
25809   If there is an error, the count field of the result is the position of the error.
25810   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
25811   A scalar routing should carry on the conversion of the tail if needed.
25812 */
25813 template <endianness big_endian>
sse_convert_utf16_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output)25814 std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
25815   const char16_t* start = buf;
25816   const char16_t* end = buf + len;
25817 
25818   const __m128i v_0000 = _mm_setzero_si128();
25819   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
25820   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
25821   const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
25822   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
25823 
25824   while (buf + 16 + safety_margin <= end) {
25825     __m128i in = _mm_loadu_si128((__m128i*)buf);
25826     if (big_endian) {
25827       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
25828       in = _mm_shuffle_epi8(in, swap);
25829     }
25830     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
25831     const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
25832     if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
25833         __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
25834         if (big_endian) {
25835           const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
25836           nextin = _mm_shuffle_epi8(nextin, swap);
25837         }
25838         if(!_mm_testz_si128(nextin, v_ff80)) {
25839           // 1. pack the bytes
25840           // obviously suboptimal.
25841           const __m128i utf8_packed = _mm_packus_epi16(in,in);
25842           // 2. store (16 bytes)
25843           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
25844           // 3. adjust pointers
25845           buf += 8;
25846           utf8_output += 8;
25847           in = nextin;
25848         } else {
25849           // 1. pack the bytes
25850           // obviously suboptimal.
25851           const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
25852           // 2. store (16 bytes)
25853           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
25854           // 3. adjust pointers
25855           buf += 16;
25856           utf8_output += 16;
25857           continue; // we are done for this round!
25858         }
25859     }
25860 
25861     // no bits set above 7th bit
25862     const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
25863     const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
25864 
25865     // no bits set above 11th bit
25866     const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
25867     const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
25868 
25869     if (one_or_two_bytes_bitmask == 0xffff) {
25870           // 1. prepare 2-byte values
25871           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
25872           // expected output   : [110a|aaaa|10bb|bbbb] x 8
25873           const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
25874           const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
25875 
25876           // t0 = [000a|aaaa|bbbb|bb00]
25877           const __m128i t0 = _mm_slli_epi16(in, 2);
25878           // t1 = [000a|aaaa|0000|0000]
25879           const __m128i t1 = _mm_and_si128(t0, v_1f00);
25880           // t2 = [0000|0000|00bb|bbbb]
25881           const __m128i t2 = _mm_and_si128(in, v_003f);
25882           // t3 = [000a|aaaa|00bb|bbbb]
25883           const __m128i t3 = _mm_or_si128(t1, t2);
25884           // t4 = [110a|aaaa|10bb|bbbb]
25885           const __m128i t4 = _mm_or_si128(t3, v_c080);
25886 
25887           // 2. merge ASCII and 2-byte codewords
25888           const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
25889 
25890           // 3. prepare bitmask for 8-bit lookup
25891           //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
25892           const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
25893           const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
25894           const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
25895           // 4. pack the bytes
25896           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
25897           const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
25898           const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
25899 
25900           // 5. store bytes
25901           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
25902 
25903           // 6. adjust pointers
25904           buf += 8;
25905           utf8_output += row[0];
25906           continue;
25907 
25908     }
25909 
25910     // 1. Check if there are any surrogate word in the input chunk.
25911     //    We have also deal with situation when there is a surrogate word
25912     //    at the end of a chunk.
25913     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
25914 
25915     // bitmask = 0x0000 if there are no surrogates
25916     //         = 0xc000 if the last word is a surrogate
25917     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
25918     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
25919     // it is likely an uncommon occurrence.
25920     if (surrogates_bitmask == 0x0000) {
25921       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
25922         const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
25923                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
25924 
25925         /* In this branch we handle three cases:
25926            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
25927            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
25928            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
25929 
25930           We expand the input word (16-bit) into two words (32-bit), thus
25931           we have room for four bytes. However, we need five distinct bit
25932           layouts. Note that the last byte in cases #2 and #3 is the same.
25933 
25934           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
25935           in register t2.
25936 
25937           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
25938           either byte 1 for case #2 or byte 2 for case #3. Note that they
25939           differ by exactly one bit.
25940 
25941           Finally from these two words we build proper UTF-8 sequence, taking
25942           into account the case (i.e, the number of bytes to write).
25943         */
25944         /**
25945          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
25946          * t2 => [0ccc|cccc] [10cc|cccc]
25947          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
25948          */
25949 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
25950         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
25951         const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
25952         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
25953         const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
25954         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
25955         const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
25956 
25957         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
25958         const __m128i s0 = _mm_srli_epi16(in, 4);
25959         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
25960         const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
25961         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
25962         const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
25963         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
25964         const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
25965         const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
25966         const __m128i s4 = _mm_xor_si128(s3, m0);
25967 #undef simdutf_vec
25968 
25969         // 4. expand words 16-bit => 32-bit
25970         const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
25971         const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
25972 
25973         // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
25974         const uint16_t mask = (one_byte_bitmask & 0x5555) |
25975                               (one_or_two_bytes_bitmask & 0xaaaa);
25976         if(mask == 0) {
25977           // We only have three-byte words. Use fast path.
25978           const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
25979           const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
25980           const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
25981           _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
25982           utf8_output += 12;
25983           _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
25984           utf8_output += 12;
25985           buf += 8;
25986           continue;
25987         }
25988         const uint8_t mask0 = uint8_t(mask);
25989 
25990         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
25991         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
25992         const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
25993 
25994         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
25995 
25996         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
25997         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
25998         const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
25999 
26000         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
26001         utf8_output += row0[0];
26002         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
26003         utf8_output += row1[0];
26004 
26005         buf += 8;
26006     // surrogate pair(s) in a register
26007     } else {
26008       // Let us do a scalar fallback.
26009       // It may seem wasteful to use scalar code, but being efficient with SIMD
26010       // in the presence of surrogate pairs may require non-trivial tables.
26011       size_t forward = 15;
26012       size_t k = 0;
26013       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
26014       for(; k < forward; k++) {
26015         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
26016         if((word & 0xFF80)==0) {
26017           *utf8_output++ = char(word);
26018         } else if((word & 0xF800)==0) {
26019           *utf8_output++ = char((word>>6) | 0b11000000);
26020           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26021         } else if((word &0xF800 ) != 0xD800) {
26022           *utf8_output++ = char((word>>12) | 0b11100000);
26023           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
26024           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26025         } else {
26026           // must be a surrogate pair
26027           uint16_t diff = uint16_t(word - 0xD800);
26028           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
26029           k++;
26030           uint16_t diff2 = uint16_t(next_word - 0xDC00);
26031           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
26032           uint32_t value = (diff << 10) + diff2 + 0x10000;
26033           *utf8_output++ = char((value>>18) | 0b11110000);
26034           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
26035           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
26036           *utf8_output++ = char((value & 0b111111) | 0b10000000);
26037         }
26038       }
26039       buf += k;
26040     }
26041   } // while
26042 
26043   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
26044 }
26045 /* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
26046 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp
26047 /* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
/*
    The vectorized algorithm works on a single SSE register, i.e., it
    loads eight 16-bit words.

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. codepoints
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit word
    can be converted into: 1) a single UTF8 byte (when it's an ASCII
    char) or 2) two UTF8 bytes.

    For this case we do only some shuffle to obtain these 2-byte
    codes and finally compress the whole SSE register with a single
    shuffle.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit words, but are above 0x07ff, then
    a single word may produce one, two or three UTF8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains the lower two UTF8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF8-bytes case.

    Finally these two registers are interleaved, forming an eight-element
    array of 32-bit values. The array spans two SSE registers.
    The bytes from the registers are compressed using two shuffles.

    We need a 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.


    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.

    NOTE: the discussion of cases 1 and 2 above (UTF8 bytes, compression
    shuffles and lookup tables) concerns UTF-8 output. The UTF-16 => UTF-32
    routines that follow use no lookup tables: a register without
    surrogates is zero-extended to eight 32-bit values, and a register
    containing surrogates is handled by a scalar fallback.
*/
26096 
26097 /*
26098   Returns a pair: the first unprocessed byte from buf and utf8_output
26099   A scalar routing should carry on the conversion of the tail.
26100 */
26101 template <endianness big_endian>
sse_convert_utf16_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output)26102 std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
26103   const char16_t* end = buf + len;
26104 
26105   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
26106   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
26107 
26108   while (buf + 16 <= end) {
26109     __m128i in = _mm_loadu_si128((__m128i*)buf);
26110 
26111     if (big_endian) {
26112       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
26113       in = _mm_shuffle_epi8(in, swap);
26114     }
26115 
26116     // 1. Check if there are any surrogate word in the input chunk.
26117     //    We have also deal with situation when there is a surrogate word
26118     //    at the end of a chunk.
26119     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
26120 
26121     // bitmask = 0x0000 if there are no surrogates
26122     //         = 0xc000 if the last word is a surrogate
26123     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
26124     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
26125     // it is likely an uncommon occurrence.
26126     if (surrogates_bitmask == 0x0000) {
26127       // case: no surrogate pair, extend 16-bit words to 32-bit words
26128         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
26129         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
26130         utf32_output += 8;
26131         buf += 8;
26132     // surrogate pair(s) in a register
26133     } else {
26134       // Let us do a scalar fallback.
26135       // It may seem wasteful to use scalar code, but being efficient with SIMD
26136       // in the presence of surrogate pairs may require non-trivial tables.
26137       size_t forward = 15;
26138       size_t k = 0;
26139       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
26140       for(; k < forward; k++) {
26141         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
26142         if((word &0xF800 ) != 0xD800) {
26143           *utf32_output++ = char32_t(word);
26144         } else {
26145           // must be a surrogate pair
26146           uint16_t diff = uint16_t(word - 0xD800);
26147           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
26148           k++;
26149           uint16_t diff2 = uint16_t(next_word - 0xDC00);
26150           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
26151           uint32_t value = (diff << 10) + diff2 + 0x10000;
26152           *utf32_output++ = char32_t(value);
26153         }
26154       }
26155       buf += k;
26156     }
26157   } // while
26158   return std::make_pair(buf, utf32_output);
26159 }
26160 
26161 
26162 /*
26163   Returns a pair: a result struct and utf8_output.
26164   If there is an error, the count field of the result is the position of the error.
26165   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
26166   A scalar routing should carry on the conversion of the tail if needed.
26167 */
26168 template <endianness big_endian>
sse_convert_utf16_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output)26169 std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
26170   const char16_t* start = buf;
26171   const char16_t* end = buf + len;
26172 
26173   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
26174   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
26175 
26176   while (buf + 16 <= end) {
26177     __m128i in = _mm_loadu_si128((__m128i*)buf);
26178 
26179     if (big_endian) {
26180       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
26181       in = _mm_shuffle_epi8(in, swap);
26182     }
26183 
26184     // 1. Check if there are any surrogate word in the input chunk.
26185     //    We have also deal with situation when there is a surrogate word
26186     //    at the end of a chunk.
26187     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
26188 
26189     // bitmask = 0x0000 if there are no surrogates
26190     //         = 0xc000 if the last word is a surrogate
26191     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
26192     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
26193     // it is likely an uncommon occurrence.
26194     if (surrogates_bitmask == 0x0000) {
26195       // case: no surrogate pair, extend 16-bit words to 32-bit words
26196         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
26197         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
26198         utf32_output += 8;
26199         buf += 8;
26200     // surrogate pair(s) in a register
26201     } else {
26202       // Let us do a scalar fallback.
26203       // It may seem wasteful to use scalar code, but being efficient with SIMD
26204       // in the presence of surrogate pairs may require non-trivial tables.
26205       size_t forward = 15;
26206       size_t k = 0;
26207       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
26208       for(; k < forward; k++) {
26209         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
26210         if((word &0xF800 ) != 0xD800) {
26211           *utf32_output++ = char32_t(word);
26212         } else {
26213           // must be a surrogate pair
26214           uint16_t diff = uint16_t(word - 0xD800);
26215           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
26216           k++;
26217           uint16_t diff2 = uint16_t(next_word - 0xDC00);
26218           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
26219           uint32_t value = (diff << 10) + diff2 + 0x10000;
26220           *utf32_output++ = char32_t(value);
26221         }
26222       }
26223       buf += k;
26224     }
26225   } // while
26226   return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
26227 }
26228 /* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
26229 
26230 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp
26231 /* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
sse_convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output)26232 std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
26233   const char32_t* end = buf + len;
26234 
26235   const __m128i v_0000 = _mm_setzero_si128();
26236   const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
26237   const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
26238   const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
26239   const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
26240   const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
26241   __m128i running_max = _mm_setzero_si128();
26242   __m128i forbidden_bytemask = _mm_setzero_si128();
26243   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
26244 
26245   while (buf + 16 + safety_margin <= end) {
26246     // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
26247     __m128i in = _mm_loadu_si128((__m128i*)buf);
26248     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
26249     running_max = _mm_max_epu32(_mm_max_epu32(in, running_max), nextin);
26250 
26251     // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
26252     __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
26253 
26254     // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
26255 
26256     // Check for ASCII fast path
26257     if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
26258       // We eagerly load another 32 bytes, hoping that they will be ASCII too.
26259       // The intuition is that we try to collect 16 ASCII characters which requires
26260       // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
26261       // as our new inputs.
26262       __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
26263       __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
26264       running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);
26265       __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
26266       if(!_mm_testz_si128(nextin_16, v_ff80)) {
26267         // 1. pack the bytes
26268         // obviously suboptimal.
26269         const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
26270         // 2. store (16 bytes)
26271         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
26272         // 3. adjust pointers
26273         buf += 8;
26274         utf8_output += 8;
26275         // Proceed with next input
26276         in_16 = nextin_16;
26277         // We need to update in and nextin because they are used later.
26278         in = thirdin;
26279         nextin = fourthin;
26280       } else {
26281         // 1. pack the bytes
26282         const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
26283         // 2. store (16 bytes)
26284         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
26285         // 3. adjust pointers
26286         buf += 16;
26287         utf8_output += 16;
26288         continue; // we are done for this round!
26289       }
26290     }
26291 
26292     // no bits set above 7th bit
26293     const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
26294     const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
26295 
26296     // no bits set above 11th bit
26297     const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
26298     const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
26299 
26300     if (one_or_two_bytes_bitmask == 0xffff) {
26301       // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
26302       // 1. prepare 2-byte values
26303       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
26304       // expected output   : [110a|aaaa|10bb|bbbb] x 8
26305       const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
26306       const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
26307 
26308       // t0 = [000a|aaaa|bbbb|bb00]
26309       const __m128i t0 = _mm_slli_epi16(in_16, 2);
26310       // t1 = [000a|aaaa|0000|0000]
26311       const __m128i t1 = _mm_and_si128(t0, v_1f00);
26312       // t2 = [0000|0000|00bb|bbbb]
26313       const __m128i t2 = _mm_and_si128(in_16, v_003f);
26314       // t3 = [000a|aaaa|00bb|bbbb]
26315       const __m128i t3 = _mm_or_si128(t1, t2);
26316       // t4 = [110a|aaaa|10bb|bbbb]
26317       const __m128i t4 = _mm_or_si128(t3, v_c080);
26318 
26319       // 2. merge ASCII and 2-byte codewords
26320       const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
26321 
26322       // 3. prepare bitmask for 8-bit lookup
26323       //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
26324       const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
26325       const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
26326       const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
26327       // 4. pack the bytes
26328       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
26329       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
26330       const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
26331 
26332       // 5. store bytes
26333       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
26334 
26335       // 6. adjust pointers
26336       buf += 8;
26337       utf8_output += row[0];
26338       continue;
26339     }
26340 
26341     // Check for overflow in packing
26342 
26343     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
26344     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
26345     if (saturation_bitmask == 0xffff) {
26346       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
26347       const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
26348       forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
26349 
26350       const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
26351                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
26352 
26353       /* In this branch we handle three cases:
26354           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
26355           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
26356           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
26357 
26358         We expand the input word (16-bit) into two words (32-bit), thus
26359         we have room for four bytes. However, we need five distinct bit
26360         layouts. Note that the last byte in cases #2 and #3 is the same.
26361 
26362         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
26363         in register t2.
26364 
26365         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
26366         either byte 1 for case #2 or byte 2 for case #3. Note that they
26367         differ by exactly one bit.
26368 
26369         Finally from these two words we build proper UTF-8 sequence, taking
26370         into account the case (i.e, the number of bytes to write).
26371       */
26372       /**
26373        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
26374        * t2 => [0ccc|cccc] [10cc|cccc]
26375        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
26376        */
26377 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
26378       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
26379       const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
26380       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
26381       const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
26382       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
26383       const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
26384 
26385       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
26386       const __m128i s0 = _mm_srli_epi16(in_16, 4);
26387       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
26388       const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
26389       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
26390       const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
26391       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
26392       const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
26393       const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
26394       const __m128i s4 = _mm_xor_si128(s3, m0);
26395 #undef simdutf_vec
26396 
26397       // 4. expand words 16-bit => 32-bit
26398       const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
26399       const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
26400 
26401       // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
26402       const uint16_t mask = (one_byte_bitmask & 0x5555) |
26403                             (one_or_two_bytes_bitmask & 0xaaaa);
26404       if(mask == 0) {
26405         // We only have three-byte words. Use fast path.
26406         const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
26407         const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
26408         const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
26409         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
26410         utf8_output += 12;
26411         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
26412         utf8_output += 12;
26413         buf += 8;
26414         continue;
26415       }
26416       const uint8_t mask0 = uint8_t(mask);
26417 
26418       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
26419       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
26420       const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
26421 
26422       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
26423 
26424       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
26425       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
26426       const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
26427 
26428       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
26429       utf8_output += row0[0];
26430       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
26431       utf8_output += row1[0];
26432 
26433       buf += 8;
26434     } else {
26435       // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
26436       // Let us do a scalar fallback.
26437       // It may seem wasteful to use scalar code, but being efficient with SIMD
26438       // in the presence of surrogate pairs may require non-trivial tables.
26439       size_t forward = 15;
26440       size_t k = 0;
26441       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
26442       for(; k < forward; k++) {
26443         uint32_t word = buf[k];
26444         if((word & 0xFFFFFF80)==0) {
26445           *utf8_output++ = char(word);
26446         } else if((word & 0xFFFFF800)==0) {
26447           *utf8_output++ = char((word>>6) | 0b11000000);
26448           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26449         } else if((word &0xFFFF0000 )==0) {
26450           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
26451           *utf8_output++ = char((word>>12) | 0b11100000);
26452           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
26453           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26454         } else {
26455           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
26456           *utf8_output++ = char((word>>18) | 0b11110000);
26457           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
26458           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
26459           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26460         }
26461       }
26462       buf += k;
26463     }
26464   } // while
26465 
26466   // check for invalid input
26467   const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
26468   if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
26469     return std::make_pair(nullptr, utf8_output);
26470   }
26471 
26472   if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
26473 
26474   return std::make_pair(buf, utf8_output);
26475 }
26476 
26477 
sse_convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output)26478 std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
26479 
26480   const char32_t* end = buf + len;
26481   const char32_t* start = buf;
26482 
26483   const __m128i v_0000 = _mm_setzero_si128();
26484   const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
26485   const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
26486   const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
26487   const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
26488   const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
26489   const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
26490 
26491   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
26492 
26493   while (buf + 16 + safety_margin <= end) {
26494     // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
26495     __m128i in = _mm_loadu_si128((__m128i*)buf);
26496     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
26497 
26498     // Check for too large input
26499     __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
26500     if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
26501       return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
26502     }
26503 
26504     // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
26505     __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
26506 
26507     // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
26508 
26509     // Check for ASCII fast path
26510     if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
26511       // We eagerly load another 32 bytes, hoping that they will be ASCII too.
26512       // The intuition is that we try to collect 16 ASCII characters which requires
26513       // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
26514       // as our new inputs.
26515       __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
26516       __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
26517       __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
26518       if(!_mm_testz_si128(nextin_16, v_ff80)) {
26519         // 1. pack the bytes
26520         // obviously suboptimal.
26521         const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
26522         // 2. store (16 bytes)
26523         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
26524         // 3. adjust pointers
26525         buf += 8;
26526         utf8_output += 8;
26527         // Proceed with next input
26528         in_16 = nextin_16;
26529         __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
26530         if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
26531           return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
26532         }
26533         // We need to update in and nextin because they are used later.
26534         in = thirdin;
26535         nextin = fourthin;
26536       } else {
26537         // 1. pack the bytes
26538         const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
26539         // 2. store (16 bytes)
26540         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
26541         // 3. adjust pointers
26542         buf += 16;
26543         utf8_output += 16;
26544         continue; // we are done for this round!
26545       }
26546     }
26547 
26548     // no bits set above 7th bit
26549     const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
26550     const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
26551 
26552     // no bits set above 11th bit
26553     const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
26554     const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
26555 
26556     if (one_or_two_bytes_bitmask == 0xffff) {
26557       // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
26558       // 1. prepare 2-byte values
26559       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
26560       // expected output   : [110a|aaaa|10bb|bbbb] x 8
26561       const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
26562       const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
26563 
26564       // t0 = [000a|aaaa|bbbb|bb00]
26565       const __m128i t0 = _mm_slli_epi16(in_16, 2);
26566       // t1 = [000a|aaaa|0000|0000]
26567       const __m128i t1 = _mm_and_si128(t0, v_1f00);
26568       // t2 = [0000|0000|00bb|bbbb]
26569       const __m128i t2 = _mm_and_si128(in_16, v_003f);
26570       // t3 = [000a|aaaa|00bb|bbbb]
26571       const __m128i t3 = _mm_or_si128(t1, t2);
26572       // t4 = [110a|aaaa|10bb|bbbb]
26573       const __m128i t4 = _mm_or_si128(t3, v_c080);
26574 
26575       // 2. merge ASCII and 2-byte codewords
26576       const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
26577 
26578       // 3. prepare bitmask for 8-bit lookup
26579       //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
26580       const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
26581       const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
26582       const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
26583       // 4. pack the bytes
26584       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
26585       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
26586       const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
26587 
26588       // 5. store bytes
26589       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
26590 
26591       // 6. adjust pointers
26592       buf += 8;
26593       utf8_output += row[0];
26594       continue;
26595     }
26596 
26597 
26598     // Check for overflow in packing
26599     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
26600     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
26601 
26602     if (saturation_bitmask == 0xffff) {
26603       // case: words from register produce either 1, 2 or 3 UTF-8 bytes
26604 
26605       // Check for illegal surrogate words
26606       const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
26607       const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
26608       if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
26609         return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
26610       }
26611 
26612       const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
26613                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
26614 
26615       /* In this branch we handle three cases:
26616           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
26617           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
26618           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
26619 
26620         We expand the input word (16-bit) into two words (32-bit), thus
26621         we have room for four bytes. However, we need five distinct bit
26622         layouts. Note that the last byte in cases #2 and #3 is the same.
26623 
26624         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
26625         in register t2.
26626 
26627         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
26628         either byte 1 for case #2 or byte 2 for case #3. Note that they
26629         differ by exactly one bit.
26630 
26631         Finally from these two words we build proper UTF-8 sequence, taking
26632         into account the case (i.e, the number of bytes to write).
26633       */
26634       /**
26635        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
26636        * t2 => [0ccc|cccc] [10cc|cccc]
26637        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
26638        */
26639 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
26640       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
26641       const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
26642       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
26643       const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
26644       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
26645       const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
26646 
26647       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
26648       const __m128i s0 = _mm_srli_epi16(in_16, 4);
26649       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
26650       const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
26651       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
26652       const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
26653       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
26654       const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
26655       const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
26656       const __m128i s4 = _mm_xor_si128(s3, m0);
26657 #undef simdutf_vec
26658 
26659       // 4. expand words 16-bit => 32-bit
26660       const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
26661       const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
26662 
26663       // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
26664       const uint16_t mask = (one_byte_bitmask & 0x5555) |
26665                             (one_or_two_bytes_bitmask & 0xaaaa);
26666       if(mask == 0) {
26667         // We only have three-byte words. Use fast path.
26668         const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
26669         const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
26670         const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
26671         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
26672         utf8_output += 12;
26673         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
26674         utf8_output += 12;
26675         buf += 8;
26676         continue;
26677       }
26678       const uint8_t mask0 = uint8_t(mask);
26679 
26680       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
26681       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
26682       const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
26683 
26684       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
26685 
26686       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
26687       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
26688       const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
26689 
26690       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
26691       utf8_output += row0[0];
26692       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
26693       utf8_output += row1[0];
26694 
26695       buf += 8;
26696     } else {
26697       // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
26698       // Let us do a scalar fallback.
26699       // It may seem wasteful to use scalar code, but being efficient with SIMD
26700       // in the presence of surrogate pairs may require non-trivial tables.
26701       size_t forward = 15;
26702       size_t k = 0;
26703       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
26704       for(; k < forward; k++) {
26705         uint32_t word = buf[k];
26706         if((word & 0xFFFFFF80)==0) {
26707           *utf8_output++ = char(word);
26708         } else if((word & 0xFFFFF800)==0) {
26709           *utf8_output++ = char((word>>6) | 0b11000000);
26710           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26711         } else if((word &0xFFFF0000 )==0) {
26712           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
26713           *utf8_output++ = char((word>>12) | 0b11100000);
26714           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
26715           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26716         } else {
26717           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
26718           *utf8_output++ = char((word>>18) | 0b11110000);
26719           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
26720           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
26721           *utf8_output++ = char((word & 0b111111) | 0b10000000);
26722         }
26723       }
26724       buf += k;
26725     }
26726   } // while
26727 
26728   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
26729 }
26730 /* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
26731 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp
26732 /* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
26733 template <endianness big_endian>
sse_convert_utf32_to_utf16(const char32_t * buf,size_t len,char16_t * utf16_output)26734 std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
26735 
26736   const char32_t* end = buf + len;
26737 
26738   const __m128i v_0000 = _mm_setzero_si128();
26739   const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
26740   __m128i forbidden_bytemask = _mm_setzero_si128();
26741 
26742   while (buf + 8 <= end) {
26743     __m128i in = _mm_loadu_si128((__m128i*)buf);
26744     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
26745     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
26746     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
26747 
26748     // Check if no bits set above 16th
26749     if (saturation_bitmask == 0xffff) {
26750       // Pack UTF-32 to UTF-16
26751       __m128i utf16_packed = _mm_packus_epi32(in, nextin);
26752 
26753       const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
26754       const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
26755       forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
26756 
26757       if (big_endian) {
26758         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
26759         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
26760       }
26761 
26762       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
26763       utf16_output += 8;
26764       buf += 8;
26765     } else {
26766       size_t forward = 7;
26767       size_t k = 0;
26768       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
26769       for(; k < forward; k++) {
26770         uint32_t word = buf[k];
26771         if((word & 0xFFFF0000)==0) {
26772           // will not generate a surrogate pair
26773           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
26774           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
26775         } else {
26776           // will generate a surrogate pair
26777           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
26778           word -= 0x10000;
26779           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
26780           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
26781           if (big_endian) {
26782             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
26783             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
26784           }
26785           *utf16_output++ = char16_t(high_surrogate);
26786           *utf16_output++ = char16_t(low_surrogate);
26787         }
26788       }
26789       buf += k;
26790     }
26791   }
26792 
26793   // check for invalid input
26794   if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
26795 
26796   return std::make_pair(buf, utf16_output);
26797 }
26798 
26799 
26800 template <endianness big_endian>
sse_convert_utf32_to_utf16_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output)26801 std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
26802   const char32_t* start = buf;
26803   const char32_t* end = buf + len;
26804 
26805   const __m128i v_0000 = _mm_setzero_si128();
26806   const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
26807 
26808   while (buf + 8 <= end) {
26809     __m128i in = _mm_loadu_si128((__m128i*)buf);
26810     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
26811     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
26812     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
26813 
26814     // Check if no bits set above 16th
26815     if (saturation_bitmask == 0xffff) {
26816       // Pack UTF-32 to UTF-16
26817       __m128i utf16_packed = _mm_packus_epi32(in, nextin);
26818 
26819       const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
26820       const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
26821       const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
26822       if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
26823         return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
26824       }
26825 
26826       if (big_endian) {
26827         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
26828         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
26829       }
26830 
26831       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
26832       utf16_output += 8;
26833       buf += 8;
26834     } else {
26835       size_t forward = 7;
26836       size_t k = 0;
26837       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
26838       for(; k < forward; k++) {
26839         uint32_t word = buf[k];
26840         if((word & 0xFFFF0000)==0) {
26841           // will not generate a surrogate pair
26842           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
26843           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
26844         } else {
26845           // will generate a surrogate pair
26846           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
26847           word -= 0x10000;
26848           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
26849           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
26850           if (big_endian) {
26851             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
26852             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
26853           }
26854           *utf16_output++ = char16_t(high_surrogate);
26855           *utf16_output++ = char16_t(low_surrogate);
26856         }
26857       }
26858       buf += k;
26859     }
26860   }
26861 
26862   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
26863 }
26864 /* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
26865 
26866 } // unnamed namespace
26867 } // namespace westmere
26868 } // namespace simdutf
26869 
26870 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
26871 /* begin file src/generic/buf_block_reader.h */
26872 namespace simdutf {
26873 namespace westmere {
26874 namespace {
26875 
26876 // Walks through a buffer in block-sized increments, loading the last part with spaces
26877 template<size_t STEP_SIZE>
26878 struct buf_block_reader {
26879 public:
26880   simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
26881   simdutf_really_inline size_t block_index();
26882   simdutf_really_inline bool has_full_block() const;
26883   simdutf_really_inline const uint8_t *full_block() const;
26884   /**
26885    * Get the last block, padded with spaces.
26886    *
26887    * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
26888    * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
26889    * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
26890    *
26891    * @return the number of effective characters in the last block.
26892    */
26893   simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
26894   simdutf_really_inline void advance();
26895 private:
26896   const uint8_t *buf;
26897   const size_t len;
26898   const size_t lenminusstep;
26899   size_t idx;
26900 };
26901 
26902 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)26903 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
26904   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
26905   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
26906     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
26907   }
26908   buf[sizeof(simd8x64<uint8_t>)] = '\0';
26909   return buf;
26910 }
26911 
26912 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)26913 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
26914   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
26915   in.store(reinterpret_cast<uint8_t*>(buf));
26916   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
26917     if (buf[i] < ' ') { buf[i] = '_'; }
26918   }
26919   buf[sizeof(simd8x64<uint8_t>)] = '\0';
26920   return buf;
26921 }
26922 
format_mask(uint64_t mask)26923 simdutf_unused static char * format_mask(uint64_t mask) {
26924   static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
26925   for (size_t i=0; i<64; i++) {
26926     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
26927   }
26928   buf[64] = '\0';
26929   return buf;
26930 }
26931 
26932 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)26933 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
26934 
26935 template<size_t STEP_SIZE>
block_index()26936 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
26937 
26938 template<size_t STEP_SIZE>
has_full_block() const26939 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
26940   return idx < lenminusstep;
26941 }
26942 
26943 template<size_t STEP_SIZE>
full_block() const26944 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
26945   return &buf[idx];
26946 }
26947 
26948 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const26949 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
26950   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
26951   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
26952   std::memcpy(dst, buf + idx, len - idx);
26953   return len - idx;
26954 }
26955 
26956 template<size_t STEP_SIZE>
advance()26957 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
26958   idx += STEP_SIZE;
26959 }
26960 
26961 } // unnamed namespace
26962 } // namespace westmere
26963 } // namespace simdutf
26964 /* end file src/generic/buf_block_reader.h */
26965 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
26966 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
26967 namespace simdutf {
26968 namespace westmere {
26969 namespace {
26970 namespace utf8_validation {
26971 
26972 using namespace simd;
26973 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)26974   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
26975 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
26976 // Bit 1 = Too Long (ASCII followed by continuation)
26977 // Bit 2 = Overlong 3-byte
26978 // Bit 4 = Surrogate
26979 // Bit 5 = Overlong 2-byte
26980 // Bit 7 = Two Continuations
26981     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
26982                                                 // 11______ 11______
26983     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
26984     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
26985     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
26986     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
26987     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
26988     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
26989                                                 // 11110100 101_____
26990                                                 // 11110101 1001____
26991                                                 // 11110101 101_____
26992                                                 // 1111011_ 1001____
26993                                                 // 1111011_ 101_____
26994                                                 // 11111___ 1001____
26995                                                 // 11111___ 101_____
26996     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
26997                                                 // 11110101 1000____
26998                                                 // 1111011_ 1000____
26999                                                 // 11111___ 1000____
27000     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
27001 
27002     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
27003       // 0_______ ________ <ASCII in byte 1>
27004       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
27005       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
27006       // 10______ ________ <continuation in byte 1>
27007       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
27008       // 1100____ ________ <two byte lead in byte 1>
27009       TOO_SHORT | OVERLONG_2,
27010       // 1101____ ________ <two byte lead in byte 1>
27011       TOO_SHORT,
27012       // 1110____ ________ <three byte lead in byte 1>
27013       TOO_SHORT | OVERLONG_3 | SURROGATE,
27014       // 1111____ ________ <four+ byte lead in byte 1>
27015       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
27016     );
27017     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
27018     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
27019       // ____0000 ________
27020       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
27021       // ____0001 ________
27022       CARRY | OVERLONG_2,
27023       // ____001_ ________
27024       CARRY,
27025       CARRY,
27026 
27027       // ____0100 ________
27028       CARRY | TOO_LARGE,
27029       // ____0101 ________
27030       CARRY | TOO_LARGE | TOO_LARGE_1000,
27031       // ____011_ ________
27032       CARRY | TOO_LARGE | TOO_LARGE_1000,
27033       CARRY | TOO_LARGE | TOO_LARGE_1000,
27034 
27035       // ____1___ ________
27036       CARRY | TOO_LARGE | TOO_LARGE_1000,
27037       CARRY | TOO_LARGE | TOO_LARGE_1000,
27038       CARRY | TOO_LARGE | TOO_LARGE_1000,
27039       CARRY | TOO_LARGE | TOO_LARGE_1000,
27040       CARRY | TOO_LARGE | TOO_LARGE_1000,
27041       // ____1101 ________
27042       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
27043       CARRY | TOO_LARGE | TOO_LARGE_1000,
27044       CARRY | TOO_LARGE | TOO_LARGE_1000
27045     );
27046     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
27047       // ________ 0_______ <ASCII in byte 2>
27048       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
27049       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
27050 
27051       // ________ 1000____
27052       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
27053       // ________ 1001____
27054       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
27055       // ________ 101_____
27056       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
27057       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
27058 
27059       // ________ 11______
27060       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
27061     );
27062     return (byte_1_high & byte_1_low & byte_2_high);
27063   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)27064   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27065       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
27066     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27067     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27068     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
27069     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
27070     return must23_80 ^ sc;
27071   }
27072 
27073   //
27074   // Return nonzero if there are incomplete multibyte characters at the end of the block:
27075   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
27076   //
is_incomplete(const simd8<uint8_t> input)27077   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
27078     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
27079     // ... 1111____ 111_____ 11______
27080     static const uint8_t max_array[32] = {
27081       255, 255, 255, 255, 255, 255, 255, 255,
27082       255, 255, 255, 255, 255, 255, 255, 255,
27083       255, 255, 255, 255, 255, 255, 255, 255,
27084       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
27085     };
27086     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
27087     return input.gt_bits(max_value);
27088   }
27089 
27090   struct utf8_checker {
27091     // If this is nonzero, there has been a UTF-8 error.
27092     simd8<uint8_t> error;
27093     // The last input we received
27094     simd8<uint8_t> prev_input_block;
27095     // Whether the last input we received was incomplete (used for ASCII fast path)
27096     simd8<uint8_t> prev_incomplete;
27097 
27098     //
27099     // Check whether the current bytes are valid UTF-8.
27100     //
check_utf8_bytessimdutf::westmere::__anone55652eb4511::utf8_validation::utf8_checker27101     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27102       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
27103       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
27104       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27105       simd8<uint8_t> sc = check_special_cases(input, prev1);
27106       this->error |= check_multibyte_lengths(input, prev_input, sc);
27107     }
27108 
27109     // The only problem that can happen at EOF is that a multibyte character is too short
27110     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
27111     // too large in the first of two bytes.
check_eofsimdutf::westmere::__anone55652eb4511::utf8_validation::utf8_checker27112     simdutf_really_inline void check_eof() {
27113       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
27114       // possibly finish them.
27115       this->error |= this->prev_incomplete;
27116     }
27117 
check_next_inputsimdutf::westmere::__anone55652eb4511::utf8_validation::utf8_checker27118     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
27119       if(simdutf_likely(is_ascii(input))) {
27120         this->error |= this->prev_incomplete;
27121       } else {
27122         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27123         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27124             "We support either two or four chunks per 64-byte block.");
27125         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27126           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
27127           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27128         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27129           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
27130           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27131           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27132           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27133         }
27134         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
27135         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
27136 
27137       }
27138     }
27139 
27140     // do not forget to call check_eof!
errorssimdutf::westmere::__anone55652eb4511::utf8_validation::utf8_checker27141     simdutf_really_inline bool errors() const {
27142       return this->error.any_bits_set_anywhere();
27143     }
27144 
27145   }; // struct utf8_checker
27146 } // namespace utf8_validation
27147 
27148 using utf8_validation::utf8_checker;
27149 
27150 } // unnamed namespace
27151 } // namespace westmere
27152 } // namespace simdutf
27153 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
27154 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
27155 /* begin file src/generic/utf8_validation/utf8_validator.h */
27156 namespace simdutf {
27157 namespace westmere {
27158 namespace {
27159 namespace utf8_validation {
27160 
27161 /**
27162  * Validates that the string is actual UTF-8.
27163  */
27164 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)27165 bool generic_validate_utf8(const uint8_t * input, size_t length) {
27166     checker c{};
27167     buf_block_reader<64> reader(input, length);
27168     while (reader.has_full_block()) {
27169       simd::simd8x64<uint8_t> in(reader.full_block());
27170       c.check_next_input(in);
27171       reader.advance();
27172     }
27173     uint8_t block[64]{};
27174     reader.get_remainder(block);
27175     simd::simd8x64<uint8_t> in(block);
27176     c.check_next_input(in);
27177     reader.advance();
27178     c.check_eof();
27179     return !c.errors();
27180 }
27181 
generic_validate_utf8(const char * input,size_t length)27182 bool generic_validate_utf8(const char * input, size_t length) {
27183   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27184 }
27185 
27186 /**
27187  * Validates that the string is actual UTF-8 and stops on errors.
27188  */
27189 template<class checker>
generic_validate_utf8_with_errors(const uint8_t * input,size_t length)27190 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
27191     checker c{};
27192     buf_block_reader<64> reader(input, length);
27193     size_t count{0};
27194     while (reader.has_full_block()) {
27195       simd::simd8x64<uint8_t> in(reader.full_block());
27196       c.check_next_input(in);
27197       if(c.errors()) {
27198         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
27199         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
27200         res.count += count;
27201         return res;
27202       }
27203       reader.advance();
27204       count += 64;
27205     }
27206     uint8_t block[64]{};
27207     reader.get_remainder(block);
27208     simd::simd8x64<uint8_t> in(block);
27209     c.check_next_input(in);
27210     reader.advance();
27211     c.check_eof();
27212     if (c.errors()) {
27213       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
27214       res.count += count;
27215       return res;
27216     } else {
27217       return result(error_code::SUCCESS, length);
27218     }
27219 }
27220 
generic_validate_utf8_with_errors(const char * input,size_t length)27221 result generic_validate_utf8_with_errors(const char * input, size_t length) {
27222   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27223 }
27224 
27225 template<class checker>
generic_validate_ascii(const uint8_t * input,size_t length)27226 bool generic_validate_ascii(const uint8_t * input, size_t length) {
27227     buf_block_reader<64> reader(input, length);
27228     uint8_t blocks[64]{};
27229     simd::simd8x64<uint8_t> running_or(blocks);
27230     while (reader.has_full_block()) {
27231       simd::simd8x64<uint8_t> in(reader.full_block());
27232       running_or |= in;
27233       reader.advance();
27234     }
27235     uint8_t block[64]{};
27236     reader.get_remainder(block);
27237     simd::simd8x64<uint8_t> in(block);
27238     running_or |= in;
27239     return running_or.is_ascii();
27240 }
27241 
generic_validate_ascii(const char * input,size_t length)27242 bool generic_validate_ascii(const char * input, size_t length) {
27243   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27244 }
27245 
27246 template<class checker>
generic_validate_ascii_with_errors(const uint8_t * input,size_t length)27247 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
27248   buf_block_reader<64> reader(input, length);
27249   size_t count{0};
27250   while (reader.has_full_block()) {
27251     simd::simd8x64<uint8_t> in(reader.full_block());
27252     if (!in.is_ascii()) {
27253       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
27254       return result(res.error, count + res.count);
27255     }
27256     reader.advance();
27257 
27258     count += 64;
27259   }
27260   uint8_t block[64]{};
27261   reader.get_remainder(block);
27262   simd::simd8x64<uint8_t> in(block);
27263   if (!in.is_ascii()) {
27264     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
27265     return result(res.error, count + res.count);
27266   } else {
27267     return result(error_code::SUCCESS, length);
27268   }
27269 }
27270 
generic_validate_ascii_with_errors(const char * input,size_t length)27271 result generic_validate_ascii_with_errors(const char * input, size_t length) {
27272   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27273 }
27274 
27275 } // namespace utf8_validation
27276 } // unnamed namespace
27277 } // namespace westmere
27278 } // namespace simdutf
27279 /* end file src/generic/utf8_validation/utf8_validator.h */
27280 // transcoding from UTF-8 to UTF-16
27281 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
27282 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
27283 
27284 
27285 namespace simdutf {
27286 namespace westmere {
27287 namespace {
27288 namespace utf8_to_utf16 {
27289 
27290 using namespace simd;
27291 
27292 template <endianness endian>
convert_valid(const char * input,size_t size,char16_t * utf16_output)27293 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
27294     char16_t* utf16_output) noexcept {
27295   // The implementation is not specific to haswell and should be moved to the generic directory.
27296   size_t pos = 0;
27297   char16_t* start{utf16_output};
27298   const size_t safety_margin = 16; // to avoid overruns!
27299   while(pos + 64 + safety_margin <= size) {
27300     // this loop could be unrolled further. For example, we could process the mask
27301     // far more than 64 bytes.
27302     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
27303     if(in.is_ascii()) {
27304       in.store_ascii_as_utf16<endian>(utf16_output);
27305       utf16_output += 64;
27306       pos += 64;
27307     } else {
27308       // Slow path. We hope that the compiler will recognize that this is a slow path.
27309       // Anything that is not a continuation mask is a 'leading byte', that is, the
27310       // start of a new code point.
27311       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
27312       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
27313       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27314       // The *start* of code points is not so useful, rather, we want the *end* of code points.
27315       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27316       // We process in blocks of up to 12 bytes except possibly
27317       // for fast paths which may process up to 16 bytes. For the
27318       // slow path to work, we should have at least 12 input bytes left.
27319       size_t max_starting_point = (pos + 64) - 12;
27320       // Next loop is going to run at least five times when using solely
27321       // the slow/regular path, and at least four times if there are fast paths.
27322       while(pos < max_starting_point) {
27323         // Performance note: our ability to compute 'consumed' and
27324         // then shift and recompute is critical. If there is a
27325         // latency of, say, 4 cycles on getting 'consumed', then
27326         // the inner loop might have a total latency of about 6 cycles.
27327         // Yet we process between 6 to 12 inputs bytes, thus we get
27328         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27329         // for this section of the code. Hence, there is a limit
27330         // to how much we can further increase this latency before
27331         // it seriously harms performance.
27332         //
27333         // Thus we may allow convert_masked_utf8_to_utf16 to process
27334         // more bytes at a time under a fast-path mode where 16 bytes
27335         // are consumed at once (e.g., when encountering ASCII).
27336         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
27337                             utf8_end_of_code_point_mask, utf16_output);
27338         pos += consumed;
27339         utf8_end_of_code_point_mask >>= consumed;
27340       }
27341       // At this point there may remain between 0 and 12 bytes in the
27342       // 64-byte block. These bytes will be processed again. So we have an
27343       // 80% efficiency (in the worst case). In practice we expect an
27344       // 85% to 90% efficiency.
27345     }
27346   }
27347   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
27348   return utf16_output - start;
27349 }
27350 
27351 } // namespace utf8_to_utf16
27352 } // unnamed namespace
27353 } // namespace westmere
27354 } // namespace simdutf
27355 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
27356 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
27357 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
27358 
27359 
27360 namespace simdutf {
27361 namespace westmere {
27362 namespace {
27363 namespace utf8_to_utf16 {
27364 using namespace simd;
27365 
27366 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)27367   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
27368 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
27369 // Bit 1 = Too Long (ASCII followed by continuation)
27370 // Bit 2 = Overlong 3-byte
27371 // Bit 4 = Surrogate
27372 // Bit 5 = Overlong 2-byte
27373 // Bit 7 = Two Continuations
27374     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
27375                                                 // 11______ 11______
27376     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
27377     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
27378     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
27379     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
27380     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
27381     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
27382                                                 // 11110100 101_____
27383                                                 // 11110101 1001____
27384                                                 // 11110101 101_____
27385                                                 // 1111011_ 1001____
27386                                                 // 1111011_ 101_____
27387                                                 // 11111___ 1001____
27388                                                 // 11111___ 101_____
27389     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
27390                                                 // 11110101 1000____
27391                                                 // 1111011_ 1000____
27392                                                 // 11111___ 1000____
27393     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
27394 
27395     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
27396       // 0_______ ________ <ASCII in byte 1>
27397       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
27398       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
27399       // 10______ ________ <continuation in byte 1>
27400       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
27401       // 1100____ ________ <two byte lead in byte 1>
27402       TOO_SHORT | OVERLONG_2,
27403       // 1101____ ________ <two byte lead in byte 1>
27404       TOO_SHORT,
27405       // 1110____ ________ <three byte lead in byte 1>
27406       TOO_SHORT | OVERLONG_3 | SURROGATE,
27407       // 1111____ ________ <four+ byte lead in byte 1>
27408       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
27409     );
27410     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
27411     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
27412       // ____0000 ________
27413       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
27414       // ____0001 ________
27415       CARRY | OVERLONG_2,
27416       // ____001_ ________
27417       CARRY,
27418       CARRY,
27419 
27420       // ____0100 ________
27421       CARRY | TOO_LARGE,
27422       // ____0101 ________
27423       CARRY | TOO_LARGE | TOO_LARGE_1000,
27424       // ____011_ ________
27425       CARRY | TOO_LARGE | TOO_LARGE_1000,
27426       CARRY | TOO_LARGE | TOO_LARGE_1000,
27427 
27428       // ____1___ ________
27429       CARRY | TOO_LARGE | TOO_LARGE_1000,
27430       CARRY | TOO_LARGE | TOO_LARGE_1000,
27431       CARRY | TOO_LARGE | TOO_LARGE_1000,
27432       CARRY | TOO_LARGE | TOO_LARGE_1000,
27433       CARRY | TOO_LARGE | TOO_LARGE_1000,
27434       // ____1101 ________
27435       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
27436       CARRY | TOO_LARGE | TOO_LARGE_1000,
27437       CARRY | TOO_LARGE | TOO_LARGE_1000
27438     );
27439     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
27440       // ________ 0_______ <ASCII in byte 2>
27441       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
27442       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
27443 
27444       // ________ 1000____
27445       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
27446       // ________ 1001____
27447       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
27448       // ________ 101_____
27449       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
27450       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
27451 
27452       // ________ 11______
27453       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
27454     );
27455     return (byte_1_high & byte_1_low & byte_2_high);
27456   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)27457   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27458       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
27459     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27460     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27461     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
27462     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
27463     return must23_80 ^ sc;
27464   }
27465 
27466 
27467   struct validating_transcoder {
27468     // If this is nonzero, there has been a UTF-8 error.
27469     simd8<uint8_t> error;
27470 
// Initializes the error accumulator from a zero byte, i.e. no error has been
// recorded yet.
validating_transcoder() : error(uint8_t(0)) {}
27472     //
27473     // Check whether the current bytes are valid UTF-8.
27474     //
check_utf8_bytessimdutf::westmere::__anone55652eb4811::utf8_to_utf16::validating_transcoder27475     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27476       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
27477       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
27478       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27479       simd8<uint8_t> sc = check_special_cases(input, prev1);
27480       this->error |= check_multibyte_lengths(input, prev_input, sc);
27481     }
27482 
27483 
27484     template <endianness endian>
convertsimdutf::westmere::__anone55652eb4811::utf8_to_utf16::validating_transcoder27485     simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
27486       size_t pos = 0;
27487       char16_t* start{utf16_output};
27488       // In the worst case, we have the haswell kernel which can cause an overflow of
27489       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
27490       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27491       // much more than 8 bytes. However, you cannot generally assume that you have valid
27492       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
27493       // to give us a good margin.
27494       size_t leading_byte = 0;
27495       size_t margin = size;
27496       for(; margin > 0 && leading_byte < 8; margin--) {
27497         leading_byte += (int8_t(in[margin-1]) > -65);
27498       }
27499       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
27500       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27501       while(pos + 64 + safety_margin <= size) {
27502         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27503         if(input.is_ascii()) {
27504           input.store_ascii_as_utf16<endian>(utf16_output);
27505           utf16_output += 64;
27506           pos += 64;
27507         } else {
27508           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27509           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27510               "We support either two or four chunks per 64-byte block.");
27511           auto zero = simd8<uint8_t>{uint8_t(0)};
27512           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27513             this->check_utf8_bytes(input.chunks[0], zero);
27514             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27515           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27516             this->check_utf8_bytes(input.chunks[0], zero);
27517             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27518             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27519             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27520           }
27521           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27522           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27523           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27524           // We process in blocks of up to 12 bytes except possibly
27525           // for fast paths which may process up to 16 bytes. For the
27526           // slow path to work, we should have at least 12 input bytes left.
27527           size_t max_starting_point = (pos + 64) - 12;
27528           // Next loop is going to run at least five times.
27529           while(pos < max_starting_point) {
27530             // Performance note: our ability to compute 'consumed' and
27531             // then shift and recompute is critical. If there is a
27532             // latency of, say, 4 cycles on getting 'consumed', then
27533             // the inner loop might have a total latency of about 6 cycles.
27534             // Yet we process between 6 to 12 inputs bytes, thus we get
27535             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27536             // for this section of the code. Hence, there is a limit
27537             // to how much we can further increase this latency before
27538             // it seriously harms performance.
27539             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
27540                             utf8_end_of_code_point_mask, utf16_output);
27541             pos += consumed;
27542             utf8_end_of_code_point_mask >>= consumed;
27543           }
27544           // At this point there may remain between 0 and 12 bytes in the
27545           // 64-byte block. These bytes will be processed again. So we have an
27546           // 80% efficiency (in the worst case). In practice we expect an
27547           // 85% to 90% efficiency.
27548         }
27549       }
27550       if(errors()) { return 0; }
27551       if(pos < size) {
27552         size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
27553         if(howmany == 0) { return 0; }
27554         utf16_output += howmany;
27555       }
27556       return utf16_output - start;
27557     }
27558 
27559     template <endianness endian>
convert_with_errorssimdutf::westmere::__anone55652eb4811::utf8_to_utf16::validating_transcoder27560     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
27561       size_t pos = 0;
27562       char16_t* start{utf16_output};
27563       // In the worst case, we have the haswell kernel which can cause an overflow of
27564       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
27565       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27566       // much more than 8 bytes. However, you cannot generally assume that you have valid
27567       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
27568       // to give us a good margin.
27569       size_t leading_byte = 0;
27570       size_t margin = size;
27571       for(; margin > 0 && leading_byte < 8; margin--) {
27572         leading_byte += (int8_t(in[margin-1]) > -65);
27573       }
27574       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
27575       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27576       while(pos + 64 + safety_margin <= size) {
27577         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27578         if(input.is_ascii()) {
27579           input.store_ascii_as_utf16<endian>(utf16_output);
27580           utf16_output += 64;
27581           pos += 64;
27582         } else {
27583           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27584           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27585               "We support either two or four chunks per 64-byte block.");
27586           auto zero = simd8<uint8_t>{uint8_t(0)};
27587           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27588             this->check_utf8_bytes(input.chunks[0], zero);
27589             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27590           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27591             this->check_utf8_bytes(input.chunks[0], zero);
27592             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27593             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27594             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27595           }
27596           if (errors()) {
27597             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
27598             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
27599             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
27600             res.count += pos;
27601             return res;
27602           }
27603           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27604           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27605           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27606           // We process in blocks of up to 12 bytes except possibly
27607           // for fast paths which may process up to 16 bytes. For the
27608           // slow path to work, we should have at least 12 input bytes left.
27609           size_t max_starting_point = (pos + 64) - 12;
27610           // Next loop is going to run at least five times.
27611           while(pos < max_starting_point) {
27612             // Performance note: our ability to compute 'consumed' and
27613             // then shift and recompute is critical. If there is a
27614             // latency of, say, 4 cycles on getting 'consumed', then
27615             // the inner loop might have a total latency of about 6 cycles.
27616             // Yet we process between 6 to 12 inputs bytes, thus we get
27617             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27618             // for this section of the code. Hence, there is a limit
27619             // to how much we can further increase this latency before
27620             // it seriously harms performance.
27621             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
27622                             utf8_end_of_code_point_mask, utf16_output);
27623             pos += consumed;
27624             utf8_end_of_code_point_mask >>= consumed;
27625           }
27626           // At this point there may remain between 0 and 12 bytes in the
27627           // 64-byte block. These bytes will be processed again. So we have an
27628           // 80% efficiency (in the worst case). In practice we expect an
27629           // 85% to 90% efficiency.
27630         }
27631       }
27632       if(errors()) {
27633         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
27634         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
27635         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
27636         res.count += pos;
27637         return res;
27638       }
27639       if(pos < size) {
27640         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
27641         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
27642         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
27643         if (res.error) {    // In case of error, we want the error position
27644           res.count += pos;
27645           return res;
27646         } else {    // In case of success, we want the number of word written
27647           utf16_output += res.count;
27648         }
27649       }
27650       return result(error_code::SUCCESS, utf16_output - start);
27651     }
27652 
errorssimdutf::westmere::__anone55652eb4811::utf8_to_utf16::validating_transcoder27653     simdutf_really_inline bool errors() const {
27654       return this->error.any_bits_set_anywhere();
27655     }
27656 
27657   }; // struct utf8_checker
27658 } // utf8_to_utf16 namespace
27659 } // unnamed namespace
27660 } // namespace westmere
27661 } // namespace simdutf
27662 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
27663 // transcoding from UTF-8 to UTF-32
27664 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
27665 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
27666 
27667 namespace simdutf {
27668 namespace westmere {
27669 namespace {
27670 namespace utf8_to_utf32 {
27671 
27672 using namespace simd;
27673 
27674 
convert_valid(const char * input,size_t size,char32_t * utf32_output)27675 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
27676     char32_t* utf32_output) noexcept {
27677   size_t pos = 0;
27678   char32_t* start{utf32_output};
27679   const size_t safety_margin = 16; // to avoid overruns!
27680   while(pos + 64 + safety_margin <= size) {
27681     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
27682     if(in.is_ascii()) {
27683       in.store_ascii_as_utf32(utf32_output);
27684       utf32_output += 64;
27685       pos += 64;
27686     } else {
27687     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
27688     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
27689     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27690     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27691     size_t max_starting_point = (pos + 64) - 12;
27692     while(pos < max_starting_point) {
27693       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
27694                           utf8_end_of_code_point_mask, utf32_output);
27695       pos += consumed;
27696       utf8_end_of_code_point_mask >>= consumed;
27697       }
27698     }
27699   }
27700   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
27701   return utf32_output - start;
27702 }
27703 
27704 
27705 } // namespace utf8_to_utf32
27706 } // unnamed namespace
27707 } // namespace westmere
27708 } // namespace simdutf
27709 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
27710 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
27711 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
27712 
27713 
27714 namespace simdutf {
27715 namespace westmere {
27716 namespace {
27717 namespace utf8_to_utf32 {
27718 using namespace simd;
27719 
27720 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)27721   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
27722 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
27723 // Bit 1 = Too Long (ASCII followed by continuation)
27724 // Bit 2 = Overlong 3-byte
27725 // Bit 4 = Surrogate
27726 // Bit 5 = Overlong 2-byte
27727 // Bit 7 = Two Continuations
27728     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
27729                                                 // 11______ 11______
27730     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
27731     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
27732     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
27733     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
27734     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
27735     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
27736                                                 // 11110100 101_____
27737                                                 // 11110101 1001____
27738                                                 // 11110101 101_____
27739                                                 // 1111011_ 1001____
27740                                                 // 1111011_ 101_____
27741                                                 // 11111___ 1001____
27742                                                 // 11111___ 101_____
27743     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
27744                                                 // 11110101 1000____
27745                                                 // 1111011_ 1000____
27746                                                 // 11111___ 1000____
27747     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
27748 
27749     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
27750       // 0_______ ________ <ASCII in byte 1>
27751       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
27752       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
27753       // 10______ ________ <continuation in byte 1>
27754       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
27755       // 1100____ ________ <two byte lead in byte 1>
27756       TOO_SHORT | OVERLONG_2,
27757       // 1101____ ________ <two byte lead in byte 1>
27758       TOO_SHORT,
27759       // 1110____ ________ <three byte lead in byte 1>
27760       TOO_SHORT | OVERLONG_3 | SURROGATE,
27761       // 1111____ ________ <four+ byte lead in byte 1>
27762       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
27763     );
27764     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
27765     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
27766       // ____0000 ________
27767       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
27768       // ____0001 ________
27769       CARRY | OVERLONG_2,
27770       // ____001_ ________
27771       CARRY,
27772       CARRY,
27773 
27774       // ____0100 ________
27775       CARRY | TOO_LARGE,
27776       // ____0101 ________
27777       CARRY | TOO_LARGE | TOO_LARGE_1000,
27778       // ____011_ ________
27779       CARRY | TOO_LARGE | TOO_LARGE_1000,
27780       CARRY | TOO_LARGE | TOO_LARGE_1000,
27781 
27782       // ____1___ ________
27783       CARRY | TOO_LARGE | TOO_LARGE_1000,
27784       CARRY | TOO_LARGE | TOO_LARGE_1000,
27785       CARRY | TOO_LARGE | TOO_LARGE_1000,
27786       CARRY | TOO_LARGE | TOO_LARGE_1000,
27787       CARRY | TOO_LARGE | TOO_LARGE_1000,
27788       // ____1101 ________
27789       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
27790       CARRY | TOO_LARGE | TOO_LARGE_1000,
27791       CARRY | TOO_LARGE | TOO_LARGE_1000
27792     );
27793     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
27794       // ________ 0_______ <ASCII in byte 2>
27795       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
27796       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
27797 
27798       // ________ 1000____
27799       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
27800       // ________ 1001____
27801       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
27802       // ________ 101_____
27803       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
27804       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
27805 
27806       // ________ 11______
27807       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
27808     );
27809     return (byte_1_high & byte_1_low & byte_2_high);
27810   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)27811   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27812       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
27813     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27814     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27815     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
27816     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
27817     return must23_80 ^ sc;
27818   }
27819 
27820 
27821   struct validating_transcoder {
27822     // If this is nonzero, there has been a UTF-8 error.
27823     simd8<uint8_t> error;
27824 
validating_transcodersimdutf::westmere::__anone55652eb4a11::utf8_to_utf32::validating_transcoder27825     validating_transcoder() : error(uint8_t(0)) {}
27826     //
27827     // Check whether the current bytes are valid UTF-8.
27828     //
check_utf8_bytessimdutf::westmere::__anone55652eb4a11::utf8_to_utf32::validating_transcoder27829     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27830       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
27831       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
27832       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27833       simd8<uint8_t> sc = check_special_cases(input, prev1);
27834       this->error |= check_multibyte_lengths(input, prev_input, sc);
27835     }
27836 
27837 
27838 
convertsimdutf::westmere::__anone55652eb4a11::utf8_to_utf32::validating_transcoder27839     simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
27840       size_t pos = 0;
27841       char32_t* start{utf32_output};
27842       // In the worst case, we have the haswell kernel which can cause an overflow of
27843       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
27844       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27845       // much more than 8 bytes. However, you cannot generally assume that you have valid
27846       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27847       // to give us a good margin.
27848       size_t leading_byte = 0;
27849       size_t margin = size;
27850       for(; margin > 0 && leading_byte < 4; margin--) {
27851         leading_byte += (int8_t(in[margin-1]) > -65);
27852       }
27853       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
27854       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27855       while(pos + 64 + safety_margin <= size) {
27856         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27857         if(input.is_ascii()) {
27858           input.store_ascii_as_utf32(utf32_output);
27859           utf32_output += 64;
27860           pos += 64;
27861         } else {
27862           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27863           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27864               "We support either two or four chunks per 64-byte block.");
27865           auto zero = simd8<uint8_t>{uint8_t(0)};
27866           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27867             this->check_utf8_bytes(input.chunks[0], zero);
27868             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27869           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27870             this->check_utf8_bytes(input.chunks[0], zero);
27871             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27872             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27873             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27874           }
27875           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27876           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27877           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27878           // We process in blocks of up to 12 bytes except possibly
27879           // for fast paths which may process up to 16 bytes. For the
27880           // slow path to work, we should have at least 12 input bytes left.
27881           size_t max_starting_point = (pos + 64) - 12;
27882           // Next loop is going to run at least five times.
27883           while(pos < max_starting_point) {
27884             // Performance note: our ability to compute 'consumed' and
27885             // then shift and recompute is critical. If there is a
27886             // latency of, say, 4 cycles on getting 'consumed', then
27887             // the inner loop might have a total latency of about 6 cycles.
27888             // Yet we process between 6 to 12 inputs bytes, thus we get
27889             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27890             // for this section of the code. Hence, there is a limit
27891             // to how much we can further increase this latency before
27892             // it seriously harms performance.
27893             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
27894                             utf8_end_of_code_point_mask, utf32_output);
27895             pos += consumed;
27896             utf8_end_of_code_point_mask >>= consumed;
27897           }
27898           // At this point there may remain between 0 and 12 bytes in the
27899           // 64-byte block. These bytes will be processed again. So we have an
27900           // 80% efficiency (in the worst case). In practice we expect an
27901           // 85% to 90% efficiency.
27902         }
27903       }
27904       if(errors()) { return 0; }
27905       if(pos < size) {
27906         size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
27907         if(howmany == 0) { return 0; }
27908         utf32_output += howmany;
27909       }
27910       return utf32_output - start;
27911     }
27912 
convert_with_errorssimdutf::westmere::__anone55652eb4a11::utf8_to_utf32::validating_transcoder27913     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
27914       size_t pos = 0;
27915       char32_t* start{utf32_output};
27916       // In the worst case, we have the haswell kernel which can cause an overflow of
27917       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
27918       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27919       // much more than 8 bytes. However, you cannot generally assume that you have valid
27920       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27921       // to give us a good margin.
27922       size_t leading_byte = 0;
27923       size_t margin = size;
27924       for(; margin > 0 && leading_byte < 4; margin--) {
27925         leading_byte += (int8_t(in[margin-1]) > -65);
27926       }
27927       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
27928       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27929       while(pos + 64 + safety_margin <= size) {
27930         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27931         if(input.is_ascii()) {
27932           input.store_ascii_as_utf32(utf32_output);
27933           utf32_output += 64;
27934           pos += 64;
27935         } else {
27936           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27937           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27938               "We support either two or four chunks per 64-byte block.");
27939           auto zero = simd8<uint8_t>{uint8_t(0)};
27940           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27941             this->check_utf8_bytes(input.chunks[0], zero);
27942             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27943           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27944             this->check_utf8_bytes(input.chunks[0], zero);
27945             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27946             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27947             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27948           }
27949           if (errors()) {
27950             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
27951             res.count += pos;
27952             return res;
27953           }
27954           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27955           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27956           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27957           // We process in blocks of up to 12 bytes except possibly
27958           // for fast paths which may process up to 16 bytes. For the
27959           // slow path to work, we should have at least 12 input bytes left.
27960           size_t max_starting_point = (pos + 64) - 12;
27961           // Next loop is going to run at least five times.
27962           while(pos < max_starting_point) {
27963             // Performance note: our ability to compute 'consumed' and
27964             // then shift and recompute is critical. If there is a
27965             // latency of, say, 4 cycles on getting 'consumed', then
27966             // the inner loop might have a total latency of about 6 cycles.
27967             // Yet we process between 6 to 12 inputs bytes, thus we get
27968             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27969             // for this section of the code. Hence, there is a limit
27970             // to how much we can further increase this latency before
27971             // it seriously harms performance.
27972             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
27973                             utf8_end_of_code_point_mask, utf32_output);
27974             pos += consumed;
27975             utf8_end_of_code_point_mask >>= consumed;
27976           }
27977           // At this point there may remain between 0 and 12 bytes in the
27978           // 64-byte block. These bytes will be processed again. So we have an
27979           // 80% efficiency (in the worst case). In practice we expect an
27980           // 85% to 90% efficiency.
27981         }
27982       }
27983       if(errors()) {
27984         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
27985         res.count += pos;
27986         return res;
27987       }
27988       if(pos < size) {
27989         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
27990         if (res.error) {    // In case of error, we want the error position
27991           res.count += pos;
27992           return res;
27993         } else {    // In case of success, we want the number of word written
27994           utf32_output += res.count;
27995         }
27996       }
27997       return result(error_code::SUCCESS, utf32_output - start);
27998     }
27999 
errorssimdutf::westmere::__anone55652eb4a11::utf8_to_utf32::validating_transcoder28000     simdutf_really_inline bool errors() const {
28001       return this->error.any_bits_set_anywhere();
28002     }
28003 
28004   }; // struct utf8_checker
28005 } // utf8_to_utf32 namespace
28006 } // unnamed namespace
28007 } // namespace westmere
28008 } // namespace simdutf
28009 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
28010 // other functions
28011 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h
28012 /* begin file src/generic/utf8.h */
28013 
28014 namespace simdutf {
28015 namespace westmere {
28016 namespace {
28017 namespace utf8 {
28018 
28019 using namespace simd;
28020 
count_code_points(const char * in,size_t size)28021 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
28022     size_t pos = 0;
28023     size_t count = 0;
28024     for(;pos + 64 <= size; pos += 64) {
28025       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
28026       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
28027       count += 64 - count_ones(utf8_continuation_mask);
28028     }
28029     return count + scalar::utf8::count_code_points(in + pos, size - pos);
28030 }
28031 
28032 
utf16_length_from_utf8(const char * in,size_t size)28033 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
28034     size_t pos = 0;
28035     size_t count = 0;
28036     // This algorithm could no doubt be improved!
28037     for(;pos + 64 <= size; pos += 64) {
28038       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
28039       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
28040       // We count one word for anything that is not a continuation (so
28041       // leading bytes).
28042       count += 64 - count_ones(utf8_continuation_mask);
28043       int64_t utf8_4byte = input.gteq_unsigned(240);
28044       count += count_ones(utf8_4byte);
28045     }
28046     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
28047 }
28048 
28049 
utf32_length_from_utf8(const char * in,size_t size)28050 simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
28051     return count_code_points(in, size);
28052 }
28053 } // utf8 namespace
28054 } // unnamed namespace
28055 } // namespace westmere
28056 } // namespace simdutf
28057 /* end file src/generic/utf8.h */
28058 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h
28059 /* begin file src/generic/utf16.h */
28060 namespace simdutf {
28061 namespace westmere {
28062 namespace {
28063 namespace utf16 {
28064 
28065 template <endianness big_endian>
count_code_points(const char16_t * in,size_t size)28066 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
28067     size_t pos = 0;
28068     size_t count = 0;
28069     for(;pos + 32 <= size; pos += 32) {
28070       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28071       if (!match_system(big_endian)) input.swap_bytes();
28072       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
28073       count += count_ones(not_pair) / 2;
28074     }
28075     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
28076 }
28077 
28078 template <endianness big_endian>
utf8_length_from_utf16(const char16_t * in,size_t size)28079 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
28080     size_t pos = 0;
28081     size_t count = 0;
28082     // This algorithm could no doubt be improved!
28083     for(;pos + 32 <= size; pos += 32) {
28084       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28085       if (!match_system(big_endian)) input.swap_bytes();
28086       uint64_t ascii_mask = input.lteq(0x7F);
28087       uint64_t twobyte_mask = input.lteq(0x7FF);
28088       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
28089 
28090       size_t ascii_count = count_ones(ascii_mask) / 2;
28091       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
28092       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
28093       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
28094       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
28095     }
28096     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
28097 }
28098 
28099 template <endianness big_endian>
utf32_length_from_utf16(const char16_t * in,size_t size)28100 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
28101     return count_code_points<big_endian>(in, size);
28102 }
28103 
change_endianness_utf16(const char16_t * in,size_t size,char16_t * output)28104 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
28105   size_t pos = 0;
28106 
28107   while (pos + 32 <= size) {
28108     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28109     input.swap_bytes();
28110     input.store(reinterpret_cast<uint16_t *>(output));
28111     pos += 32;
28112     output += 32;
28113   }
28114 
28115   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
28116 }
28117 
28118 } // utf16
28119 } // unnamed namespace
28120 } // namespace westmere
28121 } // namespace simdutf
28122 /* end file src/generic/utf16.h */
28123 //
28124 // Implementation-specific overrides
28125 //
28126 
28127 namespace simdutf {
28128 namespace westmere {
28129 
detect_encodings(const char * input,size_t length) const28130 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
28131   // If there is a BOM, then we trust it.
28132   auto bom_encoding = simdutf::BOM::check_bom(input, length);
28133   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
28134   if (length % 2 == 0) {
28135     return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
28136   } else {
28137     if (implementation::validate_utf8(input, length)) {
28138       return simdutf::encoding_type::UTF8;
28139     } else {
28140       return simdutf::encoding_type::unspecified;
28141     }
28142   }
28143 }
28144 
validate_utf8(const char * buf,size_t len) const28145 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
28146   return westmere::utf8_validation::generic_validate_utf8(buf, len);
28147 }
28148 
validate_utf8_with_errors(const char * buf,size_t len) const28149 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
28150   return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
28151 }
28152 
validate_ascii(const char * buf,size_t len) const28153 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
28154   return westmere::utf8_validation::generic_validate_ascii(buf, len);
28155 }
28156 
validate_ascii_with_errors(const char * buf,size_t len) const28157 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
28158   return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,len);
28159 }
28160 
validate_utf16le(const char16_t * buf,size_t len) const28161 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
28162   const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
28163   if (tail) {
28164     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
28165   } else {
28166     return false;
28167   }
28168 }
28169 
validate_utf16be(const char16_t * buf,size_t len) const28170 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
28171   const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
28172   if (tail) {
28173     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
28174   } else {
28175     return false;
28176   }
28177 }
28178 
validate_utf16le_with_errors(const char16_t * buf,size_t len) const28179 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
28180   result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
28181   if (res.count != len) {
28182     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
28183     return result(scalar_res.error, res.count + scalar_res.count);
28184   } else {
28185     return res;
28186   }
28187 }
28188 
validate_utf16be_with_errors(const char16_t * buf,size_t len) const28189 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
28190   result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
28191   if (res.count != len) {
28192     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
28193     return result(scalar_res.error, res.count + scalar_res.count);
28194   } else {
28195     return res;
28196   }
28197 }
28198 
validate_utf32(const char32_t * buf,size_t len) const28199 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
28200   const char32_t* tail = sse_validate_utf32le(buf, len);
28201   if (tail) {
28202     return scalar::utf32::validate(tail, len - (tail - buf));
28203   } else {
28204     return false;
28205   }
28206 }
28207 
validate_utf32_with_errors(const char32_t * buf,size_t len) const28208 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
28209   result res = sse_validate_utf32le_with_errors(buf, len);
28210   if (res.count != len) {
28211     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
28212     return result(scalar_res.error, res.count + scalar_res.count);
28213   } else {
28214     return res;
28215   }
28216 }
28217 
convert_utf8_to_utf16le(const char * buf,size_t len,char16_t * utf16_output) const28218 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
28219   utf8_to_utf16::validating_transcoder converter;
28220   return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
28221 }
28222 
convert_utf8_to_utf16be(const char * buf,size_t len,char16_t * utf16_output) const28223 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
28224   utf8_to_utf16::validating_transcoder converter;
28225   return converter.convert<endianness::BIG>(buf, len, utf16_output);
28226 }
28227 
convert_utf8_to_utf16le_with_errors(const char * buf,size_t len,char16_t * utf16_output) const28228 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
28229   utf8_to_utf16::validating_transcoder converter;
28230   return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
28231 }
28232 
convert_utf8_to_utf16be_with_errors(const char * buf,size_t len,char16_t * utf16_output) const28233 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
28234   utf8_to_utf16::validating_transcoder converter;
28235   return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
28236 }
28237 
28238 
convert_valid_utf8_to_utf16le(const char * input,size_t size,char16_t * utf16_output) const28239 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
28240     char16_t* utf16_output) const noexcept {
28241   return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
28242 }
28243 
convert_valid_utf8_to_utf16be(const char * input,size_t size,char16_t * utf16_output) const28244 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
28245     char16_t* utf16_output) const noexcept {
28246   return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
28247 }
28248 
convert_utf8_to_utf32(const char * buf,size_t len,char32_t * utf32_output) const28249 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
28250   utf8_to_utf32::validating_transcoder converter;
28251   return converter.convert(buf, len, utf32_output);
28252 }
28253 
convert_utf8_to_utf32_with_errors(const char * buf,size_t len,char32_t * utf32_output) const28254 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
28255   utf8_to_utf32::validating_transcoder converter;
28256   return converter.convert_with_errors(buf, len, utf32_output);
28257 }
28258 
convert_valid_utf8_to_utf32(const char * input,size_t size,char32_t * utf32_output) const28259 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
28260     char32_t* utf32_output) const noexcept {
28261   return utf8_to_utf32::convert_valid(input, size,  utf32_output);
28262 }
28263 
convert_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const28264 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28265   std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
28266   if (ret.first == nullptr) { return 0; }
28267   size_t saved_bytes = ret.second - utf8_output;
28268   if (ret.first != buf + len) {
28269     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
28270                                         ret.first, len - (ret.first - buf), ret.second);
28271     if (scalar_saved_bytes == 0) { return 0; }
28272     saved_bytes += scalar_saved_bytes;
28273   }
28274   return saved_bytes;
28275 }
28276 
convert_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const28277 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28278   std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
28279   if (ret.first == nullptr) { return 0; }
28280   size_t saved_bytes = ret.second - utf8_output;
28281   if (ret.first != buf + len) {
28282     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
28283                                         ret.first, len - (ret.first - buf), ret.second);
28284     if (scalar_saved_bytes == 0) { return 0; }
28285     saved_bytes += scalar_saved_bytes;
28286   }
28287   return saved_bytes;
28288 }
28289 
convert_utf16le_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const28290 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28291   // ret.first.count is always the position in the buffer, not the number of words written even if finished
28292   std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
28293   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
28294   if (ret.first.count != len) { // All good so far, but not finished
28295     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
28296                                         buf + ret.first.count, len - ret.first.count, ret.second);
28297     if (scalar_res.error) {
28298       scalar_res.count += ret.first.count;
28299       return scalar_res;
28300     } else {
28301       ret.second += scalar_res.count;
28302     }
28303   }
28304   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
28305   return ret.first;
28306 }
28307 
convert_utf16be_to_utf8_with_errors(const char16_t * buf,size_t len,char * utf8_output) const28308 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28309   // ret.first.count is always the position in the buffer, not the number of words written even if finished
28310   std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
28311   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
28312   if (ret.first.count != len) { // All good so far, but not finished
28313     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
28314                                         buf + ret.first.count, len - ret.first.count, ret.second);
28315     if (scalar_res.error) {
28316       scalar_res.count += ret.first.count;
28317       return scalar_res;
28318     } else {
28319       ret.second += scalar_res.count;
28320     }
28321   }
28322   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
28323   return ret.first;
28324 }
28325 
convert_valid_utf16le_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const28326 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28327   return convert_utf16le_to_utf8(buf, len, utf8_output);
28328 }
28329 
convert_valid_utf16be_to_utf8(const char16_t * buf,size_t len,char * utf8_output) const28330 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28331   return convert_utf16be_to_utf8(buf, len, utf8_output);
28332 }
28333 
convert_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const28334 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
28335   std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output);
28336   if (ret.first == nullptr) { return 0; }
28337   size_t saved_bytes = ret.second - utf8_output;
28338   if (ret.first != buf + len) {
28339     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
28340                                         ret.first, len - (ret.first - buf), ret.second);
28341     if (scalar_saved_bytes == 0) { return 0; }
28342     saved_bytes += scalar_saved_bytes;
28343   }
28344   return saved_bytes;
28345 }
28346 
convert_utf32_to_utf8_with_errors(const char32_t * buf,size_t len,char * utf8_output) const28347 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
28348   // ret.first.count is always the position in the buffer, not the number of words written even if finished
28349   std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
28350   if (ret.first.count != len) {
28351     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
28352                                         buf + ret.first.count, len - ret.first.count, ret.second);
28353     if (scalar_res.error) {
28354       scalar_res.count += ret.first.count;
28355       return scalar_res;
28356     } else {
28357       ret.second += scalar_res.count;
28358     }
28359   }
28360   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
28361   return ret.first;
28362 }
28363 
convert_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const28364 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28365   std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
28366   if (ret.first == nullptr) { return 0; }
28367   size_t saved_bytes = ret.second - utf32_output;
28368   if (ret.first != buf + len) {
28369     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
28370                                         ret.first, len - (ret.first - buf), ret.second);
28371     if (scalar_saved_bytes == 0) { return 0; }
28372     saved_bytes += scalar_saved_bytes;
28373   }
28374   return saved_bytes;
28375 }
28376 
convert_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const28377 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28378   std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
28379   if (ret.first == nullptr) { return 0; }
28380   size_t saved_bytes = ret.second - utf32_output;
28381   if (ret.first != buf + len) {
28382     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
28383                                         ret.first, len - (ret.first - buf), ret.second);
28384     if (scalar_saved_bytes == 0) { return 0; }
28385     saved_bytes += scalar_saved_bytes;
28386   }
28387   return saved_bytes;
28388 }
28389 
convert_utf16le_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const28390 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28391   // ret.first.count is always the position in the buffer, not the number of words written even if finished
28392   std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
28393   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
28394   if (ret.first.count != len) { // All good so far, but not finished
28395     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
28396                                         buf + ret.first.count, len - ret.first.count, ret.second);
28397     if (scalar_res.error) {
28398       scalar_res.count += ret.first.count;
28399       return scalar_res;
28400     } else {
28401       ret.second += scalar_res.count;
28402     }
28403   }
28404   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
28405   return ret.first;
28406 }
28407 
convert_utf16be_to_utf32_with_errors(const char16_t * buf,size_t len,char32_t * utf32_output) const28408 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28409   // ret.first.count is always the position in the buffer, not the number of words written even if finished
28410   std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
28411   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
28412   if (ret.first.count != len) { // All good so far, but not finished
28413     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
28414                                         buf + ret.first.count, len - ret.first.count, ret.second);
28415     if (scalar_res.error) {
28416       scalar_res.count += ret.first.count;
28417       return scalar_res;
28418     } else {
28419       ret.second += scalar_res.count;
28420     }
28421   }
28422   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
28423   return ret.first;
28424 }
28425 
convert_valid_utf32_to_utf8(const char32_t * buf,size_t len,char * utf8_output) const28426 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
28427   return convert_utf32_to_utf8(buf, len, utf8_output);
28428 }
28429 
convert_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const28430 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28431   std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
28432   if (ret.first == nullptr) { return 0; }
28433   size_t saved_bytes = ret.second - utf16_output;
28434   if (ret.first != buf + len) {
28435     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
28436                                         ret.first, len - (ret.first - buf), ret.second);
28437     if (scalar_saved_bytes == 0) { return 0; }
28438     saved_bytes += scalar_saved_bytes;
28439   }
28440   return saved_bytes;
28441 }
28442 
convert_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const28443 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28444   std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
28445   if (ret.first == nullptr) { return 0; }
28446   size_t saved_bytes = ret.second - utf16_output;
28447   if (ret.first != buf + len) {
28448     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
28449                                         ret.first, len - (ret.first - buf), ret.second);
28450     if (scalar_saved_bytes == 0) { return 0; }
28451     saved_bytes += scalar_saved_bytes;
28452   }
28453   return saved_bytes;
28454 }
28455 
convert_utf32_to_utf16le_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const28456 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28457   // ret.first.count is always the position in the buffer, not the number of words written even if finished
28458   std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
28459   if (ret.first.count != len) {
28460     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
28461                                         buf + ret.first.count, len - ret.first.count, ret.second);
28462     if (scalar_res.error) {
28463       scalar_res.count += ret.first.count;
28464       return scalar_res;
28465     } else {
28466       ret.second += scalar_res.count;
28467     }
28468   }
28469   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
28470   return ret.first;
28471 }
28472 
convert_utf32_to_utf16be_with_errors(const char32_t * buf,size_t len,char16_t * utf16_output) const28473 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28474   // ret.first.count is always the position in the buffer, not the number of words written even if finished
28475   std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
28476   if (ret.first.count != len) {
28477     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
28478                                         buf + ret.first.count, len - ret.first.count, ret.second);
28479     if (scalar_res.error) {
28480       scalar_res.count += ret.first.count;
28481       return scalar_res;
28482     } else {
28483       ret.second += scalar_res.count;
28484     }
28485   }
28486   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
28487   return ret.first;
28488 }
28489 
convert_valid_utf32_to_utf16le(const char32_t * buf,size_t len,char16_t * utf16_output) const28490 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28491   return convert_utf32_to_utf16le(buf, len, utf16_output);
28492 }
28493 
convert_valid_utf32_to_utf16be(const char32_t * buf,size_t len,char16_t * utf16_output) const28494 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28495   return convert_utf32_to_utf16be(buf, len, utf16_output);
28496 }
28497 
convert_valid_utf16le_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const28498 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28499   return convert_utf16le_to_utf32(buf, len, utf32_output);
28500 }
28501 
convert_valid_utf16be_to_utf32(const char16_t * buf,size_t len,char32_t * utf32_output) const28502 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28503   return convert_utf16be_to_utf32(buf, len, utf32_output);
28504 }
28505 
change_endianness_utf16(const char16_t * input,size_t length,char16_t * output) const28506 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
28507   utf16::change_endianness_utf16(input, length, output);
28508 }
28509 
count_utf16le(const char16_t * input,size_t length) const28510 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
28511   return utf16::count_code_points<endianness::LITTLE>(input, length);
28512 }
28513 
count_utf16be(const char16_t * input,size_t length) const28514 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
28515   return utf16::count_code_points<endianness::BIG>(input, length);
28516 }
28517 
count_utf8(const char * input,size_t length) const28518 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
28519   return utf8::count_code_points(input, length);
28520 }
28521 
utf8_length_from_utf16le(const char16_t * input,size_t length) const28522 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
28523   return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
28524 }
28525 
utf8_length_from_utf16be(const char16_t * input,size_t length) const28526 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
28527   return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
28528 }
28529 
utf32_length_from_utf16le(const char16_t * input,size_t length) const28530 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
28531   return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
28532 }
28533 
utf32_length_from_utf16be(const char16_t * input,size_t length) const28534 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
28535   return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
28536 }
28537 
utf16_length_from_utf8(const char * input,size_t length) const28538 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
28539   return utf8::utf16_length_from_utf8(input, length);
28540 }
28541 
utf8_length_from_utf32(const char32_t * input,size_t length) const28542 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
28543   const __m128i v_00000000 = _mm_setzero_si128();
28544   const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
28545   const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
28546   const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
28547   size_t pos = 0;
28548   size_t count = 0;
28549   for(;pos + 4 <= length; pos += 4) {
28550     __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
28551     const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
28552     const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
28553     const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
28554     const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
28555     const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
28556     const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
28557     const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
28558     const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
28559 
28560     size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
28561     size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
28562     size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
28563     count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
28564   }
28565   return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
28566 }
28567 
utf16_length_from_utf32(const char32_t * input,size_t length) const28568 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
28569   const __m128i v_00000000 = _mm_setzero_si128();
28570   const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
28571   size_t pos = 0;
28572   size_t count = 0;
28573   for(;pos + 4 <= length; pos += 4) {
28574     __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
28575     const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
28576     const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
28577     size_t surrogate_count = (16-count_ones(surrogate_bitmask))/4;
28578     count += 4 + surrogate_count;
28579   }
28580   return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
28581 }
28582 
utf32_length_from_utf8(const char * input,size_t length) const28583 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
28584   return scalar::utf8::count_code_points(input, length);
28585 }
28586 
28587 } // namespace westmere
28588 } // namespace simdutf
28589 
28590 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
28591 /* begin file src/simdutf/westmere/end.h */
28592 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
28593 // nothing needed.
28594 #else
28595 SIMDUTF_UNTARGET_REGION
28596 #endif
28597 
28598 /* end file src/simdutf/westmere/end.h */
28599 /* end file src/westmere/implementation.cpp */
28600 #endif
28601 
28602 SIMDUTF_POP_DISABLE_WARNINGS
28603 /* end file src/simdutf.cpp */
28604