• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* auto-generated on 2023-06-05 08:58:28 -0400. Do not edit! */
2 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf.h
3 /* begin file include/simdutf.h */
4 #ifndef SIMDUTF_H
5 #define SIMDUTF_H
6 #include <cstring>
7 
8 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h
9 /* begin file include/simdutf/compiler_check.h */
10 #ifndef SIMDUTF_COMPILER_CHECK_H
11 #define SIMDUTF_COMPILER_CHECK_H
12 
// This header cannot be consumed by a C compiler.
#ifndef __cplusplus
#error simdutf requires a C++ compiler
#endif

// SIMDUTF_CPLUSPLUS is the language-standard value used by the checks below.
// It is only computed when the user has not predefined it.
#ifndef SIMDUTF_CPLUSPLUS
#if defined(_MSVC_LANG) && !defined(__clang__)
// Under MSVC the value comes from _MSVC_LANG, except that _MSC_VER == 1900
// is reported as C++11.
#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
#else
#define SIMDUTF_CPLUSPLUS __cplusplus
#endif
#endif

// C++ 17
#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
#define SIMDUTF_CPLUSPLUS17 1
#endif

// C++ 14
#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
#define SIMDUTF_CPLUSPLUS14 1
#endif

// C++ 11
#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
#define SIMDUTF_CPLUSPLUS11 1
#endif

// C++11 is the minimum: without SIMDUTF_CPLUSPLUS11 the build is rejected.
#ifndef SIMDUTF_CPLUSPLUS11
#error simdutf requires a compiler compliant with the C++11 standard
#endif
43 
44 #endif // SIMDUTF_COMPILER_CHECK_H
45 /* end file include/simdutf/compiler_check.h */
46 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h
47 /* begin file include/simdutf/common_defs.h */
48 #ifndef SIMDUTF_COMMON_DEFS_H
49 #define SIMDUTF_COMMON_DEFS_H
50 
51 #include <cassert>
52 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/portability.h
53 /* begin file include/simdutf/portability.h */
54 #ifndef SIMDUTF_PORTABILITY_H
55 #define SIMDUTF_PORTABILITY_H
56 
57 #include <cstddef>
58 #include <cstdint>
59 #include <cstdlib>
60 #include <cfloat>
61 #include <cassert>
62 #ifndef _WIN32
63 // strcasecmp, strncasecmp
64 #include <strings.h>
65 #endif
66 
67 /**
68  * We want to check that it is actually a little endian system at
69  * compile-time.
70  */
71 
// Defines SIMDUTF_IS_BIG_ENDIAN to a value usable in #if and in expressions:
// non-zero on a big-endian target, zero otherwise.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
// The compiler predefines the byte order: use it directly.
#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#elif defined(_WIN32)
// Windows targets without the predefined macros are treated as little endian.
#define SIMDUTF_IS_BIG_ENDIAN 0
#else
// No predefined byte-order macros: pull in the platform's endian header.
#if defined(__APPLE__) || defined(__FreeBSD__)
#include <machine/endian.h>
#elif defined(sun) || defined(__sun)
#include <sys/byteorder.h>
#else  // neither Apple/FreeBSD nor Solaris

#ifdef __has_include
#if __has_include(<endian.h>)
#include <endian.h>
#endif //__has_include(<endian.h>)
#endif //__has_include

#endif // platform endian header

// Exactly one branch defines the macro. (The original used
// `#ifndef !defined(...) || ...`, which is not a valid directive, followed by
// a second unconditional definition.)
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
// Byte order still unknown: default to little endian.
#define SIMDUTF_IS_BIG_ENDIAN 0
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define SIMDUTF_IS_BIG_ENDIAN 0
#else // __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
#define SIMDUTF_IS_BIG_ENDIAN 1
#endif

#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
103 
104 
105 /**
106  * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined.
107  */
108 
109 #ifdef _MSC_VER
110 #define SIMDUTF_VISUAL_STUDIO 1
111 /**
112  * We want to differentiate carefully between
113  * clang under visual studio and regular visual
114  * studio.
115  *
116  * Under clang for Windows, we enable:
117  *  * target pragmas so that part and only part of the
118  *     code gets compiled for advanced instructions.
119  *
120  */
121 #ifdef __clang__
122 // clang under visual studio
123 #define SIMDUTF_CLANG_VISUAL_STUDIO 1
124 #else
125 // just regular visual studio (best guess)
126 #define SIMDUTF_REGULAR_VISUAL_STUDIO 1
127 #endif // __clang__
128 #endif // _MSC_VER
129 
130 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
131 // https://en.wikipedia.org/wiki/C_alternative_tokens
132 // This header should have no effect, except maybe
133 // under Visual Studio.
134 #include <iso646.h>
135 #endif
136 
137 #if defined(__x86_64__) || defined(_M_AMD64)
138 #define SIMDUTF_IS_X86_64 1
139 #elif defined(__aarch64__) || defined(_M_ARM64)
140 #define SIMDUTF_IS_ARM64 1
141 #elif defined(__PPC64__) || defined(_M_PPC64)
142 //#define SIMDUTF_IS_PPC64 1
143 // The simdutf library does not yet support SIMD acceleration under
144 // POWER processors. Please see https://github.com/lemire/simdutf/issues/51
145 #elif defined(__s390__)
146 // s390 IBM system. Big endian.
147 #elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
148 // RISC-V 64-bit
149 #else
150 // The simdutf library is designed
151 // for 64-bit processors and it seems that you are not
152 // compiling for a known 64-bit platform. Please
153 // use a 64-bit target such as x64 or 64-bit ARM for best performance.
154 #define SIMDUTF_IS_32BITS 1
155 
156 // We do not support 32-bit platforms, but it can be
157 // handy to identify them.
158 #if defined(_M_IX86) || defined(__i386__)
159 #define SIMDUTF_IS_X86_32BITS 1
160 #elif defined(__arm__) || defined(_M_ARM)
161 #define SIMDUTF_IS_ARM_32BITS 1
162 #elif defined(__PPC__) || defined(_M_PPC)
163 #define SIMDUTF_IS_PPC_32BITS 1
164 #endif
165 
166 #endif // defined(__x86_64__) || defined(_M_AMD64)
167 
168 #ifdef SIMDUTF_IS_32BITS
169 #ifndef SIMDUTF_NO_PORTABILITY_WARNING
170 #pragma message("The simdutf library is designed \
171 for 64-bit processors and it seems that you are not \
172 compiling for a known 64-bit platform. All fast kernels \
173 will be disabled and performance may be poor. Please \
174 use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
175 #endif // SIMDUTF_NO_PORTABILITY_WARNING
176 #endif // SIMDUTF_IS_32BITS
177 
178 // this is almost standard?
179 #define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a
180 #define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a)
181 
182 // Our fast kernels require 64-bit systems.
183 //
184 // On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
185 // Furthermore, the number of SIMD registers is reduced.
186 //
187 // On 32-bit ARM, we would have smaller registers.
188 //
189 // The simdutf users should still have the fallback kernel. It is
190 // slower, but it should run everywhere.
191 
192 //
193 // Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION
194 //
195 
196 // We are going to use runtime dispatch.
197 #ifdef SIMDUTF_IS_X86_64
198 #ifdef __clang__
199 // clang does not have GCC push pop
200 // warning: clang attribute push can't be used within a namespace in clang up
201 // til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a
202 // namespace.
203 #define SIMDUTF_TARGET_REGION(T)                                                       \
204   _Pragma(SIMDUTF_STRINGIFY(                                                           \
205       clang attribute push(__attribute__((target(T))), apply_to = function)))
206 #define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
207 #elif defined(__GNUC__)
208 // GCC is easier
209 #define SIMDUTF_TARGET_REGION(T)                                                       \
210   _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
211 #define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
212 #endif // clang then gcc
213 
214 #endif // x86
215 
216 // Default target region macros don't do anything.
217 #ifndef SIMDUTF_TARGET_REGION
218 #define SIMDUTF_TARGET_REGION(T)
219 #define SIMDUTF_UNTARGET_REGION
220 #endif
221 
222 // Is threading enabled?
223 #if defined(_REENTRANT) || defined(_MT)
224 #ifndef SIMDUTF_THREADS_ENABLED
225 #define SIMDUTF_THREADS_ENABLED
226 #endif
227 #endif
228 
229 // workaround for large stack sizes under -O0.
230 // https://github.com/simdutf/simdutf/issues/691
231 #ifdef __APPLE__
232 #ifndef __OPTIMIZE__
233 // Apple systems have small stack sizes in secondary threads.
234 // Lack of compiler optimization may generate high stack usage.
235 // Users may want to disable threads for safety, but only when
236 // in debug mode which we detect by the fact that the __OPTIMIZE__
237 // macro is not defined.
238 #undef SIMDUTF_THREADS_ENABLED
239 #endif
240 #endif
241 
242 #ifdef SIMDUTF_VISUAL_STUDIO
243 // This is one case where we do not distinguish between
244 // regular visual studio and clang under visual studio.
245 // clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has)
246 #define simdutf_strcasecmp _stricmp
247 #define simdutf_strncasecmp _strnicmp
248 #else
249 // The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8).
250 // So they are only useful for ASCII in our context.
251 // https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
252 #define simdutf_strcasecmp strcasecmp
253 #define simdutf_strncasecmp strncasecmp
254 #endif
255 
// SIMDUTF_UNREACHABLE() marks a point that must never be reached and
// SIMDUTF_ASSUME(COND) states a condition that must hold.
//  - With NDEBUG they expand to compiler hints (__assume under Visual Studio,
//    __builtin_unreachable otherwise).
//  - Without NDEBUG they expand to assert().
// Neither macro carries a trailing semicolon; write `SIMDUTF_UNREACHABLE();`
// at the call site. Every variant is then a single statement, so the macros
// can be used in an unbraced if/else on all compilers.
#ifdef NDEBUG

#ifdef SIMDUTF_VISUAL_STUDIO
#define SIMDUTF_UNREACHABLE() __assume(0)
#define SIMDUTF_ASSUME(COND) __assume(COND)
#else
#define SIMDUTF_UNREACHABLE() __builtin_unreachable()
#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0)
#endif

#else // NDEBUG

#define SIMDUTF_UNREACHABLE() assert(0)
#define SIMDUTF_ASSUME(COND) assert(COND)

#endif
272 
273 
274 #if defined(__GNUC__) && !defined(__clang__)
275 #if __GNUC__ >= 11
276 #define SIMDUTF_GCC11ORMORE 1
277 #endif //  __GNUC__ >= 11
278 #endif // defined(__GNUC__) && !defined(__clang__)
279 
280 
281 #endif // SIMDUTF_PORTABILITY_H
282 /* end file include/simdutf/portability.h */
283 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/avx512.h
284 /* begin file include/simdutf/avx512.h */
285 #ifndef SIMDUTF_AVX512_H_
286 #define SIMDUTF_AVX512_H_
287 
288 /*
289     It's possible to override AVX512 settings with cmake DCMAKE_CXX_FLAGS.
290 
291     All preprocessor macros have the form `SIMDUTF_HAS_AVX512{feature}`,
292     where {feature} is the code name of an extension.
293 
294     Please see the listing below to find which are supported.
295 */
296 
297 #ifndef SIMDUTF_HAS_AVX512F
298 # if defined(__AVX512F__) && __AVX512F__ == 1
299 #   define SIMDUTF_HAS_AVX512F 1
300 # endif
301 #endif
302 
303 #ifndef SIMDUTF_HAS_AVX512DQ
304 # if defined(__AVX512DQ__) && __AVX512DQ__ == 1
305 #   define SIMDUTF_HAS_AVX512DQ 1
306 # endif
307 #endif
308 
309 #ifndef SIMDUTF_HAS_AVX512IFMA
310 # if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
311 #   define SIMDUTF_HAS_AVX512IFMA 1
312 # endif
313 #endif
314 
315 #ifndef SIMDUTF_HAS_AVX512CD
316 # if defined(__AVX512CD__) && __AVX512CD__ == 1
317 #   define SIMDUTF_HAS_AVX512CD 1
318 # endif
319 #endif
320 
321 #ifndef SIMDUTF_HAS_AVX512BW
322 # if defined(__AVX512BW__) && __AVX512BW__ == 1
323 #   define SIMDUTF_HAS_AVX512BW 1
324 # endif
325 #endif
326 
327 #ifndef SIMDUTF_HAS_AVX512VL
328 # if defined(__AVX512VL__) && __AVX512VL__ == 1
329 #   define SIMDUTF_HAS_AVX512VL 1
330 # endif
331 #endif
332 
333 #ifndef SIMDUTF_HAS_AVX512VBMI
334 # if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
335 #   define SIMDUTF_HAS_AVX512VBMI 1
336 # endif
337 #endif
338 
339 #ifndef SIMDUTF_HAS_AVX512VBMI2
340 # if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
341 #   define SIMDUTF_HAS_AVX512VBMI2 1
342 # endif
343 #endif
344 
345 #ifndef SIMDUTF_HAS_AVX512VNNI
346 # if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
347 #   define SIMDUTF_HAS_AVX512VNNI 1
348 # endif
349 #endif
350 
351 #ifndef SIMDUTF_HAS_AVX512BITALG
352 # if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
353 #   define SIMDUTF_HAS_AVX512BITALG 1
354 # endif
355 #endif
356 
357 #ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
358 # if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
359 #   define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
360 # endif
361 #endif
362 
363 #endif // SIMDUTF_AVX512_H_
364 /* end file include/simdutf/avx512.h */
365 
366 
#if defined(__GNUC__)
  // Marks a block with a name so that MCA analysis can see it: the markers are
  // emitted as assembly comments ("# LLVM-MCA-BEGIN <name>" / "# LLVM-MCA-END <name>").
  #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
  #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
  // Runs `block` between a BEGIN and an END marker carrying the same name.
  #define SIMDUTF_DEBUG_BLOCK(name, block) SIMDUTF_BEGIN_DEBUG_BLOCK(name); block; SIMDUTF_END_DEBUG_BLOCK(name);
#else
  // No markers on other compilers.
  #define SIMDUTF_BEGIN_DEBUG_BLOCK(name)
  #define SIMDUTF_END_DEBUG_BLOCK(name)
  // `block` still runs, so the wrapped code behaves the same on every compiler.
  #define SIMDUTF_DEBUG_BLOCK(name, block) block;
#endif
377 
// Align to N-byte boundary.
// SIMDUTF_ROUNDUP_N rounds `a` up, SIMDUTF_ROUNDDOWN_N rounds `a` down, to a
// multiple of `n`. The bit masks are only correct when `n` is a power of two.
// NOTE(review): the mask ~((n)-1) is computed in the type of `n`; if `n` is an
// unsigned type narrower than `a`, the high bits of `a` would be cleared.
// Confirm that callers pass operands of matching width.
#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1))
#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1))

// True when the address `ptr` is a multiple of `n` (`n` a power of two).
#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0)
383 
384 #if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
385 
386   #define simdutf_really_inline __forceinline
387   #define simdutf_never_inline __declspec(noinline)
388 
389   #define simdutf_unused
390   #define simdutf_warn_unused
391 
392   #ifndef simdutf_likely
393   #define simdutf_likely(x) x
394   #endif
395   #ifndef simdutf_unlikely
396   #define simdutf_unlikely(x) x
397   #endif
398 
399   #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push ))
400   #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 ))
401   #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER ))
402   // Get rid of Intellisense-only warnings (Code Analysis)
403   // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
404   #ifdef __has_include
405   #if __has_include(<CppCoreCheck\Warnings.h>)
406   #include <CppCoreCheck\Warnings.h>
407   #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
408   #endif
409   #endif
410 
411   #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
412   #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
413   #endif
414 
415   #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
416   #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
417   #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))
418 
419 #else // SIMDUTF_REGULAR_VISUAL_STUDIO
420 
421   #define simdutf_really_inline inline __attribute__((always_inline))
422   #define simdutf_never_inline inline __attribute__((noinline))
423 
424   #define simdutf_unused __attribute__((unused))
425   #define simdutf_warn_unused __attribute__((warn_unused_result))
426 
427   #ifndef simdutf_likely
428   #define simdutf_likely(x) __builtin_expect(!!(x), 1)
429   #endif
430   #ifndef simdutf_unlikely
431   #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
432   #endif
433 
434   #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
435   // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
436   #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \
437     SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
438     SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
439     SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
440     SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
441     SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
442     SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
443     SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
444     SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
445     SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
446     SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
447     SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
448   #define SIMDUTF_PRAGMA(P) _Pragma(#P)
449   #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
450   #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
451   #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
452   #else
453   #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
454   #endif
455   #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
456   #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
457   #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
458 
459 
460 
461 #endif // MSC_VER
462 
463 #ifndef SIMDUTF_DLLIMPORTEXPORT
464     #if defined(SIMDUTF_VISUAL_STUDIO)
465       /**
466        * It does not matter here whether you are using
467        * the regular visual studio or clang under visual
468        * studio.
469        */
470       #if SIMDUTF_USING_LIBRARY
471       #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
472       #else
473       #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
474       #endif
475     #else
476       #define SIMDUTF_DLLIMPORTEXPORT
477     #endif
478 #endif
479 
480 /// If EXPR is an error, returns it.
481 #define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
482 
483 
484 #endif // SIMDUTF_COMMON_DEFS_H
485 /* end file include/simdutf/common_defs.h */
486 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h
487 /* begin file include/simdutf/encoding_types.h */
488 #include <string>
489 
namespace simdutf {

/**
 * Text encodings known to the library. Each named encoding has its own bit,
 * so several candidates can be OR-ed together into one integer;
 * `unspecified` is the zero value.
 */
enum encoding_type {
        UTF8 = 1,       // BOM 0xef 0xbb 0xbf
        UTF16_LE = 2,   // BOM 0xff 0xfe
        UTF16_BE = 4,   // BOM 0xfe 0xff
        UTF32_LE = 8,   // BOM 0xff 0xfe 0x00 0x00
        UTF32_BE = 16,   // BOM 0x00 0x00 0xfe 0xff

        unspecified = 0
};

/** Byte order of multi-byte code units. */
enum endianness {
        LITTLE,
        BIG
};

/** Declared here, defined elsewhere; by its name, tells whether `e` is the byte order of the system. */
bool match_system(endianness e);

/** Declared here, defined elsewhere; returns a textual form of the encoding `bom`. */
std::string to_string(encoding_type bom);

// Note that BOM for UTF8 is discouraged.
namespace BOM {

/**
 * Checks whether the buffer starts with a byte-order mark (BOM).
 * @param byte          the buffer to inspect
 * @param length        the length of the buffer in words (for these
 *                      byte-sized element types, one word is one byte)
 * @return the corresponding encoding, or unspecified when there is no BOM
 */

encoding_type check_bom(const uint8_t* byte, size_t length);
encoding_type check_bom(const char* byte, size_t length);
/**
 * Returns the size, in bytes, of the BOM for a given encoding type.
 * Note that UTF8 BOM are discouraged.
 * @param bom         the encoding type
 * @return the size in bytes of the corresponding BOM
 */
size_t bom_byte_size(encoding_type bom);

} // BOM namespace
} // simdutf namespace
533 /* end file include/simdutf/encoding_types.h */
534 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/error.h
535 /* begin file include/simdutf/error.h */
536 #ifndef ERROR_H
537 #define ERROR_H
538 namespace simdutf {
539 
540 enum error_code {
541   SUCCESS = 0,
542   HEADER_BITS,  // Any byte must have fewer than 5 header bits.
543   TOO_SHORT,    // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length
544                 // This is also the error when the input is truncated.
545   TOO_LONG,     // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
546   OVERLONG,     // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
547                 // and U+FFFF for four-byte characters.
548   TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF OR less than or equal than U+7F for ASCII.
549   SURROGATE,    // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR
550                 // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16)
551   OTHER         // Not related to validation/transcoding.
552 };
553 
554 struct result {
555   error_code error;
556   size_t count;     // In case of error, indicates the position of the error. In case of success, indicates the number of words validated/written.
557 
558   simdutf_really_inline result();
559 
560   simdutf_really_inline result(error_code, size_t);
561 };
562 
563 }
564 #endif
565 /* end file include/simdutf/error.h */
566 
567 SIMDUTF_PUSH_DISABLE_WARNINGS
568 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
569 
570 // Public API
571 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h
572 /* begin file include/simdutf/simdutf_version.h */
573 // /include/simdutf/simdutf_version.h automatically generated by release.py,
574 // do not change by hand
575 #ifndef SIMDUTF_SIMDUTF_VERSION_H
576 #define SIMDUTF_SIMDUTF_VERSION_H
577 
578 /** The version of simdutf being used (major.minor.revision) */
579 #define SIMDUTF_VERSION "3.2.14"
580 
581 namespace simdutf {
582 enum {
583   /**
584    * The major version (MAJOR.minor.revision) of simdutf being used.
585    */
586   SIMDUTF_VERSION_MAJOR = 3,
587   /**
588    * The minor version (major.MINOR.revision) of simdutf being used.
589    */
590   SIMDUTF_VERSION_MINOR = 2,
591   /**
592    * The revision (major.minor.REVISION) of simdutf being used.
593    */
594   SIMDUTF_VERSION_REVISION = 14
595 };
596 } // namespace simdutf
597 
598 #endif // SIMDUTF_SIMDUTF_VERSION_H
599 /* end file include/simdutf/simdutf_version.h */
600 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/implementation.h
601 /* begin file include/simdutf/implementation.h */
602 #ifndef SIMDUTF_IMPLEMENTATION_H
603 #define SIMDUTF_IMPLEMENTATION_H
604 #include <string>
605 #if !defined(SIMDUTF_NO_THREADS)
606 #include <atomic>
607 #endif
608 #include <vector>
609 #include <tuple>
610 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h
611 /* begin file include/simdutf/internal/isadetection.h */
612 /* From
613 https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
614 Highly modified.
615 
616 Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
617 Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
618 Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
619 Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
620 Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
621 Copyright (c) 2011-2013 NYU                      (Clement Farabet)
622 Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
623 Iain Melvin, Jason Weston) Copyright (c) 2006      Idiap Research Institute
624 (Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
625 Samy Bengio, Johnny Mariethoz)
626 
627 All rights reserved.
628 
629 Redistribution and use in source and binary forms, with or without
630 modification, are permitted provided that the following conditions are met:
631 
632 1. Redistributions of source code must retain the above copyright
633    notice, this list of conditions and the following disclaimer.
634 
635 2. Redistributions in binary form must reproduce the above copyright
636    notice, this list of conditions and the following disclaimer in the
637    documentation and/or other materials provided with the distribution.
638 
639 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
640 America and IDIAP Research Institute nor the names of its contributors may be
641    used to endorse or promote products derived from this software without
642    specific prior written permission.
643 
644 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
645 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
646 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
647 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
648 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
649 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
650 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
651 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
652 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
653 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
654 POSSIBILITY OF SUCH DAMAGE.
655 */
656 
657 #ifndef SIMDutf_INTERNAL_ISADETECTION_H
658 #define SIMDutf_INTERNAL_ISADETECTION_H
659 
660 #include <cstdint>
661 #include <cstdlib>
662 #if defined(_MSC_VER)
663 #include <intrin.h>
664 #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
665 #include <cpuid.h>
666 #endif
667 
668 namespace simdutf {
669 namespace internal {
670 
/**
 * Instruction-set extensions that detect_supported_architectures() can
 * report. Every non-zero enumerator is a distinct single bit, so several of
 * them can be OR-ed into one uint32_t; DEFAULT means "no extension".
 */
enum instruction_set {
  DEFAULT = 0x0,
  NEON = 0x1,
  AVX2 = 0x4,
  SSE42 = 0x8,
  PCLMULQDQ = 0x10,
  BMI1 = 0x20,
  BMI2 = 0x40,
  ALTIVEC = 0x80,
  AVX512F = 0x100,
  AVX512DQ = 0x200,
  AVX512IFMA = 0x400,
  AVX512PF = 0x800,
  AVX512ER = 0x1000,
  AVX512CD = 0x2000,
  AVX512BW = 0x4000,
  AVX512VL = 0x8000,
  AVX512VBMI2 = 0x10000
};
690 
691 #if defined(__PPC64__)
692 
detect_supported_architectures()693 static inline uint32_t detect_supported_architectures() {
694   return instruction_set::ALTIVEC;
695 }
696 
697 #elif defined(__aarch64__) || defined(_M_ARM64)
698 
detect_supported_architectures()699 static inline uint32_t detect_supported_architectures() {
700   return instruction_set::NEON;
701 }
702 
703 #elif defined(__x86_64__) || defined(_M_AMD64) // x64
704 
705 
namespace {
namespace cpuid_bit {
  // Bit masks for the registers returned by CPUID and for XCR0.
  // Can be found on Intel ISA Reference for CPUID

  // EAX = 0x01
  constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit  1 of ECX for EAX=0x1
  constexpr uint32_t sse42 = uint32_t(1) << 20;    ///< @private bit 20 of ECX for EAX=0x1
  constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1

  // EAX = 0x07 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
  // See: "Table 3-8. Information Returned by CPUID Instruction"
  // Bits of EBX for that leaf.
  namespace ebx {
    constexpr uint32_t bmi1 = uint32_t(1) << 3;
    constexpr uint32_t avx2 = uint32_t(1) << 5;
    constexpr uint32_t bmi2 = uint32_t(1) << 8;
    constexpr uint32_t avx512f = uint32_t(1) << 16;
    constexpr uint32_t avx512dq = uint32_t(1) << 17;
    constexpr uint32_t avx512ifma = uint32_t(1) << 21;
    constexpr uint32_t avx512cd = uint32_t(1) << 28;
    constexpr uint32_t avx512bw = uint32_t(1) << 30;
    constexpr uint32_t avx512vl = uint32_t(1) << 31;
  }

  // Bits of ECX for that leaf.
  namespace ecx {
    constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
    constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
    constexpr uint32_t avx512vnni = uint32_t(1) << 11;
    constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
    constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
  }
  // Bits of EDX for that leaf.
  namespace edx {
    constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
  }
  // Bits of XCR0 (the value returned by xgetbv).
  namespace xcr0_bit {
    constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
    constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
  }
}
}
745 
746 
747 
/**
 * Executes the CPUID instruction.
 *
 * On entry *eax selects the leaf and *ecx the sub-leaf; both must be
 * initialised by the caller. On return the four pointees hold the EAX, EBX,
 * ECX and EDX values produced by the instruction.
 *
 * @param eax in: leaf,     out: EAX
 * @param ebx               out: EBX
 * @param ecx in: sub-leaf, out: ECX
 * @param edx               out: EDX
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, static_cast<int>(*eax), static_cast<int>(*ecx));
  *eax = static_cast<uint32_t>(cpu_info[0]);
  *ebx = static_cast<uint32_t>(cpu_info[1]);
  *ecx = static_cast<uint32_t>(cpu_info[2]);
  *edx = static_cast<uint32_t>(cpu_info[3]);
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  const uint32_t leaf = *eax;
  const uint32_t subleaf = *ecx;
  // __get_cpuid() does not load the sub-leaf into ECX, which leaf 0x7 needs;
  // __get_cpuid_count() does. It returns 0 when the leaf is unsupported and
  // then leaves the outputs untouched, so clear them rather than let the
  // caller read stale values as feature bits.
  if (!__get_cpuid_count(leaf, subleaf, eax, ebx, ecx, edx)) {
    *eax = 0;
    *ebx = 0;
    *ecx = 0;
    *edx = 0;
  }
#else
  uint32_t a = *eax, b, c = *ecx, d;
  // EAX and ECX are both inputs and outputs ("+"); EBX and EDX are outputs.
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
#endif
}
769 
/**
 * Reads extended control register 0 (XCR0) with the XGETBV instruction.
 * In this file it is only called after CPUID has reported the OSXSAVE bits.
 *
 * @return the 64-bit register value (EDX in the high half, EAX in the low half)
 */
static inline uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t low_half, high_half;
  // ECX = 0 selects XCR0; the result comes back split across EDX:EAX.
  asm volatile("xgetbv\n\t" : "=a" (low_half), "=d" (high_half) : "c" (0));
  return (static_cast<uint64_t>(high_half) << 32) | low_half;
#endif
}
779 
detect_supported_architectures()780 static inline uint32_t detect_supported_architectures() {
781   uint32_t eax;
782   uint32_t ebx = 0;
783   uint32_t ecx = 0;
784   uint32_t edx = 0;
785   uint32_t host_isa = 0x0;
786 
787   // EBX for EAX=0x1
788   eax = 0x1;
789   cpuid(&eax, &ebx, &ecx, &edx);
790 
791   if (ecx & cpuid_bit::sse42) {
792     host_isa |= instruction_set::SSE42;
793   }
794 
795   if (ecx & cpuid_bit::pclmulqdq) {
796     host_isa |= instruction_set::PCLMULQDQ;
797   }
798 
799   if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
800     return host_isa;
801   }
802 
803   // xgetbv for checking if the OS saves registers
804   uint64_t xcr0 = xgetbv();
805 
806   if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
807     return host_isa;
808   }
809   // ECX for EAX=0x7
810   eax = 0x7;
811   ecx = 0x0; // Sub-leaf = 0
812   cpuid(&eax, &ebx, &ecx, &edx);
813   if (ebx & cpuid_bit::ebx::avx2) {
814     host_isa |= instruction_set::AVX2;
815   }
816   if (ebx & cpuid_bit::ebx::bmi1) {
817     host_isa |= instruction_set::BMI1;
818   }
819   if (ebx & cpuid_bit::ebx::bmi2) {
820     host_isa |= instruction_set::BMI2;
821   }
822   if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
823     return host_isa;
824   }
825   if (ebx & cpuid_bit::ebx::avx512f) {
826     host_isa |= instruction_set::AVX512F;
827   }
828   if (ebx & cpuid_bit::ebx::avx512bw) {
829     host_isa |= instruction_set::AVX512BW;
830   }
831   if (ebx & cpuid_bit::ebx::avx512cd) {
832     host_isa |= instruction_set::AVX512CD;
833   }
834   if (ebx & cpuid_bit::ebx::avx512dq) {
835     host_isa |= instruction_set::AVX512DQ;
836   }
837   if (ebx & cpuid_bit::ebx::avx512vl) {
838     host_isa |= instruction_set::AVX512VL;
839   }
840   if (ecx & cpuid_bit::ecx::avx512vbmi2) {
841     host_isa |= instruction_set::AVX512VBMI2;
842   }
843   return host_isa;
844 }
845 #else // fallback
846 
847 // includes 32-bit ARM.
detect_supported_architectures()848 static inline uint32_t detect_supported_architectures() {
849   return instruction_set::DEFAULT;
850 }
851 
852 
853 #endif // end SIMD extension detection code
854 
855 } // namespace internal
856 } // namespace simdutf
857 
858 #endif // SIMDutf_INTERNAL_ISADETECTION_H
859 /* end file include/simdutf/internal/isadetection.h */
860 
861 
862 namespace simdutf {
863 
864 /**
865  * Autodetect the encoding of the input, a single encoding is recommended.
866  * E.g., the function might return simdutf::encoding_type::UTF8,
867  * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
868  * simdutf::encoding_type::UTF32_LE.
869  *
870  * @param input the string to analyze.
871  * @param length the length of the string in bytes.
872  * @return the detected encoding type
873  */
874 simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept;
autodetect_encoding(const uint8_t * input,size_t length)875 simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept {
876   return autodetect_encoding(reinterpret_cast<const char *>(input), length);
877 }
878 
879 /**
880  * Autodetect the possible encodings of the input in one pass.
881  * E.g., if the input might be UTF-16LE or UTF-8, this function returns
882  * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
883  *
884  * Overridden by each implementation.
885  *
886  * @param input the string to analyze.
887  * @param length the length of the string in bytes.
888  * @return the detected encoding type
889  */
890 simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept;
detect_encodings(const uint8_t * input,size_t length)891 simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept {
892   return detect_encodings(reinterpret_cast<const char *>(input), length);
893 }
894 
895 
896 /**
897  * Validate the UTF-8 string. This function may be best when you expect
898  * the input to be almost always valid. Otherwise, consider using
899  * validate_utf8_with_errors.
900  *
901  * Overridden by each implementation.
902  *
903  * @param buf the UTF-8 string to validate.
904  * @param len the length of the string in bytes.
905  * @return true if and only if the string is valid UTF-8.
906  */
907 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
908 
909 /**
910  * Validate the UTF-8 string and stop on error.
911  *
912  * Overridden by each implementation.
913  *
914  * @param buf the UTF-8 string to validate.
915  * @param len the length of the string in bytes.
916  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
917  */
918 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept;
919 
920 /**
921  * Validate the ASCII string.
922  *
923  * Overridden by each implementation.
924  *
925  * @param buf the ASCII string to validate.
926  * @param len the length of the string in bytes.
927  * @return true if and only if the string is valid ASCII.
928  */
929 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
930 
931 /**
932  * Validate the ASCII string and stop on error. It might be faster than
933  * validate_ascii when an error is expected to occur early.
934  *
935  * Overridden by each implementation.
936  *
937  * @param buf the ASCII string to validate.
938  * @param len the length of the string in bytes.
939  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
940  */
941 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept;
942 
943 /**
944  * Using native endianness; Validate the UTF-16 string.
945  * This function may be best when you expect the input to be almost always valid.
946  * Otherwise, consider using validate_utf16_with_errors.
947  *
948  * Overridden by each implementation.
949  *
950  * This function is not BOM-aware.
951  *
952  * @param buf the UTF-16 string to validate.
953  * @param len the length of the string in number of 2-byte words (char16_t).
954  * @return true if and only if the string is valid UTF-16.
955  */
956 simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept;
957 
958 /**
959  * Validate the UTF-16LE string. This function may be best when you expect
960  * the input to be almost always valid. Otherwise, consider using
961  * validate_utf16le_with_errors.
962  *
963  * Overridden by each implementation.
964  *
965  * This function is not BOM-aware.
966  *
967  * @param buf the UTF-16LE string to validate.
968  * @param len the length of the string in number of 2-byte words (char16_t).
969  * @return true if and only if the string is valid UTF-16LE.
970  */
971 simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept;
972 
973 /**
974  * Validate the UTF-16BE string. This function may be best when you expect
975  * the input to be almost always valid. Otherwise, consider using
976  * validate_utf16be_with_errors.
977  *
978  * Overridden by each implementation.
979  *
980  * This function is not BOM-aware.
981  *
982  * @param buf the UTF-16BE string to validate.
983  * @param len the length of the string in number of 2-byte words (char16_t).
984  * @return true if and only if the string is valid UTF-16BE.
985  */
986 simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept;
987 
988 /**
989  * Using native endianness; Validate the UTF-16 string and stop on error.
990  * It might be faster than validate_utf16 when an error is expected to occur early.
991  *
992  * Overridden by each implementation.
993  *
994  * This function is not BOM-aware.
995  *
996  * @param buf the UTF-16 string to validate.
997  * @param len the length of the string in number of 2-byte words (char16_t).
998  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
999  */
1000 simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept;
1001 
1002 /**
1003  * Validate the UTF-16LE string and stop on error. It might be faster than
1004  * validate_utf16le when an error is expected to occur early.
1005  *
1006  * Overridden by each implementation.
1007  *
1008  * This function is not BOM-aware.
1009  *
1010  * @param buf the UTF-16LE string to validate.
1011  * @param len the length of the string in number of 2-byte words (char16_t).
1012  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1013  */
1014 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept;
1015 
1016 /**
1017  * Validate the UTF-16BE string and stop on error. It might be faster than
1018  * validate_utf16be when an error is expected to occur early.
1019  *
1020  * Overridden by each implementation.
1021  *
1022  * This function is not BOM-aware.
1023  *
1024  * @param buf the UTF-16BE string to validate.
1025  * @param len the length of the string in number of 2-byte words (char16_t).
1026  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1027  */
1028 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept;
1029 
1030 /**
1031  * Validate the UTF-32 string. This function may be best when you expect
1032  * the input to be almost always valid. Otherwise, consider using
1033  * validate_utf32_with_errors.
1034  *
1035  * Overridden by each implementation.
1036  *
1037  * This function is not BOM-aware.
1038  *
1039  * @param buf the UTF-32 string to validate.
1040  * @param len the length of the string in number of 4-byte words (char32_t).
1041  * @return true if and only if the string is valid UTF-32.
1042  */
1043 simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept;
1044 
1045 /**
1046  * Validate the UTF-32 string and stop on error. It might be faster than
1047  * validate_utf32 when an error is expected to occur early.
1048  *
1049  * Overridden by each implementation.
1050  *
1051  * This function is not BOM-aware.
1052  *
1053  * @param buf the UTF-32 string to validate.
1054  * @param len the length of the string in number of 4-byte words (char32_t).
1055  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1056  */
1057 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
1058 
1059 /**
1060  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string.
1061  *
1062  * During the conversion also validation of the input string is done.
1063  * This function is suitable to work with inputs from untrusted sources.
1064  *
1065  * @param input         the UTF-8 string to convert
1066  * @param length        the length of the string in bytes
1067  * @param utf16_output  the pointer to buffer that can hold conversion result
1068  * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1069  */
1070 simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
1071 
1072 /**
1073  * Convert possibly broken UTF-8 string into UTF-16LE string.
1074  *
1075  * During the conversion also validation of the input string is done.
1076  * This function is suitable to work with inputs from untrusted sources.
1077  *
1078  * @param input         the UTF-8 string to convert
1079  * @param length        the length of the string in bytes
1080  * @param utf16_output  the pointer to buffer that can hold conversion result
1081  * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1082  */
1083 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
1084 
1085 /**
1086  * Convert possibly broken UTF-8 string into UTF-16BE string.
1087  *
1088  * During the conversion also validation of the input string is done.
1089  * This function is suitable to work with inputs from untrusted sources.
1090  *
1091  * @param input         the UTF-8 string to convert
1092  * @param length        the length of the string in bytes
1093  * @param utf16_output  the pointer to buffer that can hold conversion result
1094  * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1095  */
1096 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
1097 
1098 /**
1099  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16
1100  * string and stop on error.
1101  *
1102  * During the conversion also validation of the input string is done.
1103  * This function is suitable to work with inputs from untrusted sources.
1104  *
1105  * @param input         the UTF-8 string to convert
1106  * @param length        the length of the string in bytes
1107  * @param utf16_output  the pointer to buffer that can hold conversion result
1108  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1109  */
1110 simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1111 
1112 /**
1113  * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1114  *
1115  * During the conversion also validation of the input string is done.
1116  * This function is suitable to work with inputs from untrusted sources.
1117  *
1118  * @param input         the UTF-8 string to convert
1119  * @param length        the length of the string in bytes
1120  * @param utf16_output  the pointer to buffer that can hold conversion result
1121  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1122  */
1123 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1124 
1125 /**
1126  * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1127  *
1128  * During the conversion also validation of the input string is done.
1129  * This function is suitable to work with inputs from untrusted sources.
1130  *
1131  * @param input         the UTF-8 string to convert
1132  * @param length        the length of the string in bytes
1133  * @param utf16_output  the pointer to buffer that can hold conversion result
1134  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1135  */
1136 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1137 
1138 /**
1139  * Convert possibly broken UTF-8 string into UTF-32 string.
1140  *
1141  * During the conversion also validation of the input string is done.
1142  * This function is suitable to work with inputs from untrusted sources.
1143  *
1144  * @param input         the UTF-8 string to convert
1145  * @param length        the length of the string in bytes
1146  * @param utf32_output  the pointer to buffer that can hold conversion result
1147  * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
1148  */
1149 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept;
1150 
1151 /**
1152  * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1153  *
1154  * During the conversion also validation of the input string is done.
1155  * This function is suitable to work with inputs from untrusted sources.
1156  *
1157  * @param input         the UTF-8 string to convert
1158  * @param length        the length of the string in bytes
1159  * @param utf32_output  the pointer to buffer that can hold conversion result
1160  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1161  */
1162 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
1163 
1164 /**
1165  * Using native endianness; Convert valid UTF-8 string into UTF-16 string.
1166  *
1167  * This function assumes that the input string is valid UTF-8.
1168  *
1169  * @param input         the UTF-8 string to convert
1170  * @param length        the length of the string in bytes
1171  * @param utf16_buffer  the pointer to buffer that can hold conversion result
1172  * @return the number of written char16_t
1173  */
1174 simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1175 
1176 /**
1177  * Convert valid UTF-8 string into UTF-16LE string.
1178  *
1179  * This function assumes that the input string is valid UTF-8.
1180  *
1181  * @param input         the UTF-8 string to convert
1182  * @param length        the length of the string in bytes
1183  * @param utf16_buffer  the pointer to buffer that can hold conversion result
1184  * @return the number of written char16_t
1185  */
1186 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1187 
1188 /**
1189  * Convert valid UTF-8 string into UTF-16BE string.
1190  *
1191  * This function assumes that the input string is valid UTF-8.
1192  *
1193  * @param input         the UTF-8 string to convert
1194  * @param length        the length of the string in bytes
1195  * @param utf16_buffer  the pointer to buffer that can hold conversion result
1196  * @return the number of written char16_t
1197  */
1198 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1199 
1200 /**
1201  * Convert valid UTF-8 string into UTF-32 string.
1202  *
1203  * This function assumes that the input string is valid UTF-8.
1204  *
1205  * @param input         the UTF-8 string to convert
1206  * @param length        the length of the string in bytes
1207  * @param utf32_buffer  the pointer to buffer that can hold conversion result
1208  * @return the number of written char32_t
1209  */
1210 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
1211 
1212 /**
1213  * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
1214  *
1215  * This function does not validate the input.
1216  *
1217  * This function is not BOM-aware.
1218  *
1219  * @param input         the UTF-8 string to process
1220  * @param length        the length of the string in bytes
1221  * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
1222  */
1223 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept;
1224 
1225 /**
1226  * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format.
1227  *
1228  * This function is equivalent to count_utf8
1229  *
1230  * This function does not validate the input.
1231  *
1232  * This function is not BOM-aware.
1233  *
1234  * @param input         the UTF-8 string to process
1235  * @param length        the length of the string in bytes
1236  * @return the number of char32_t words required to encode the UTF-8 string as UTF-32
1237  */
1238 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept;
1239 
1240 /**
1241  * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string.
1242  *
1243  * During the conversion also validation of the input string is done.
1244  * This function is suitable to work with inputs from untrusted sources.
1245  *
1246  * This function is not BOM-aware.
1247  *
1248  * @param input         the UTF-16 string to convert
1249  * @param length        the length of the string in 2-byte words (char16_t)
1250  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1251  * @return number of written words; 0 if input is not a valid UTF-16 string
1252  */
1253 simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1254 
1255 /**
1256  * Convert possibly broken UTF-16LE string into UTF-8 string.
1257  *
1258  * During the conversion also validation of the input string is done.
1259  * This function is suitable to work with inputs from untrusted sources.
1260  *
1261  * This function is not BOM-aware.
1262  *
1263  * @param input         the UTF-16LE string to convert
1264  * @param length        the length of the string in 2-byte words (char16_t)
1265  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1266  * @return number of written words; 0 if input is not a valid UTF-16LE string
1267  */
1268 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1269 
1270 /**
1271  * Convert possibly broken UTF-16BE string into UTF-8 string.
1272  *
1273  * During the conversion also validation of the input string is done.
1274  * This function is suitable to work with inputs from untrusted sources.
1275  *
1276  * This function is not BOM-aware.
1277  *
1278  * @param input         the UTF-16BE string to convert
1279  * @param length        the length of the string in 2-byte words (char16_t)
1280  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1281  * @return number of written words; 0 if input is not a valid UTF-16BE string
1282  */
1283 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1284 
1285 /**
1286  * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error.
1287  *
1288  * During the conversion also validation of the input string is done.
1289  * This function is suitable to work with inputs from untrusted sources.
1290  *
1291  * This function is not BOM-aware.
1292  *
1293  * @param input         the UTF-16 string to convert
1294  * @param length        the length of the string in 2-byte words (char16_t)
1295  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1296  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1297  */
1298 simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1299 
1300 /**
1301  * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1302  *
1303  * During the conversion also validation of the input string is done.
1304  * This function is suitable to work with inputs from untrusted sources.
1305  *
1306  * This function is not BOM-aware.
1307  *
1308  * @param input         the UTF-16LE string to convert
1309  * @param length        the length of the string in 2-byte words (char16_t)
1310  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1311  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1312  */
1313 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1314 
1315 /**
1316  * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1317  *
1318  * During the conversion also validation of the input string is done.
1319  * This function is suitable to work with inputs from untrusted sources.
1320  *
1321  * This function is not BOM-aware.
1322  *
1323  * @param input         the UTF-16BE string to convert
1324  * @param length        the length of the string in 2-byte words (char16_t)
1325  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1326  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1327  */
1328 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1329 
1330 /**
1331  * Using native endianness; Convert valid UTF-16 string into UTF-8 string.
1332  *
1333  * This function assumes that the input string is valid UTF-16 (native endianness).
1334  *
1335  * This function is not BOM-aware.
1336  *
1337  * @param input         the UTF-16 string to convert
1338  * @param length        the length of the string in 2-byte words (char16_t)
1339  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1340  * @return number of written words; 0 if conversion is not possible
1341  */
1342 simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1343 
1344 /**
1345  * Convert valid UTF-16LE string into UTF-8 string.
1346  *
1347  * This function assumes that the input string is valid UTF-16LE.
1348  *
1349  * This function is not BOM-aware.
1350  *
1351  * @param input         the UTF-16LE string to convert
1352  * @param length        the length of the string in 2-byte words (char16_t)
1353  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1354  * @return number of written words; 0 if conversion is not possible
1355  */
1356 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1357 
1358 /**
1359  * Convert valid UTF-16BE string into UTF-8 string.
1360  *
1361  * This function assumes that the input string is valid UTF-16BE.
1362  *
1363  * This function is not BOM-aware.
1364  *
1365  * @param input         the UTF-16BE string to convert
1366  * @param length        the length of the string in 2-byte words (char16_t)
1367  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1368  * @return number of written words; 0 if conversion is not possible
1369  */
1370 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1371 
1372 /**
1373  * Using native endianness; Convert possibly broken UTF-16 string into UTF-32 string.
1374  *
1375  * During the conversion also validation of the input string is done.
1376  * This function is suitable to work with inputs from untrusted sources.
1377  *
1378  * This function is not BOM-aware.
1379  *
1380  * @param input         the UTF-16 string to convert
1381  * @param length        the length of the string in 2-byte words (char16_t)
1382  * @param utf32_buffer   the pointer to buffer that can hold conversion result
1383  * @return number of written words; 0 if input is not a valid UTF-16 string
1384  */
1385 simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1386 
1387 /**
1388  * Convert possibly broken UTF-16LE string into UTF-32 string.
1389  *
1390  * During the conversion also validation of the input string is done.
1391  * This function is suitable to work with inputs from untrusted sources.
1392  *
1393  * This function is not BOM-aware.
1394  *
1395  * @param input         the UTF-16LE string to convert
1396  * @param length        the length of the string in 2-byte words (char16_t)
1397  * @param utf32_buffer   the pointer to buffer that can hold conversion result
1398  * @return number of written words; 0 if input is not a valid UTF-16LE string
1399  */
1400 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1401 
1402 /**
1403  * Convert possibly broken UTF-16BE string into UTF-32 string.
1404  *
1405  * During the conversion also validation of the input string is done.
1406  * This function is suitable to work with inputs from untrusted sources.
1407  *
1408  * This function is not BOM-aware.
1409  *
1410  * @param input         the UTF-16BE string to convert
1411  * @param length        the length of the string in 2-byte words (char16_t)
1412  * @param utf32_buffer   the pointer to buffer that can hold conversion result
1413  * @return number of written words; 0 if input is not a valid UTF-16BE string
1414  */
1415 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1416 
1417 /**
1418  * Using native endianness; Convert possibly broken UTF-16 string into
1419  * UTF-32 string and stop on error.
1420  *
1421  * During the conversion also validation of the input string is done.
1422  * This function is suitable to work with inputs from untrusted sources.
1423  *
1424  * This function is not BOM-aware.
1425  *
1426  * @param input         the UTF-16 string to convert
1427  * @param length        the length of the string in 2-byte words (char16_t)
1428  * @param utf32_buffer   the pointer to buffer that can hold conversion result
1429  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1430  */
1431 simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1432 
1433 /**
1434  * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1435  *
1436  * During the conversion also validation of the input string is done.
1437  * This function is suitable to work with inputs from untrusted sources.
1438  *
1439  * This function is not BOM-aware.
1440  *
1441  * @param input         the UTF-16LE string to convert
1442  * @param length        the length of the string in 2-byte words (char16_t)
1443  * @param utf32_buffer   the pointer to buffer that can hold conversion result
1444  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1445  */
1446 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1447 
1448 /**
1449  * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1450  *
1451  * During the conversion also validation of the input string is done.
1452  * This function is suitable to work with inputs from untrusted sources.
1453  *
1454  * This function is not BOM-aware.
1455  *
1456  * @param input         the UTF-16BE string to convert
1457  * @param length        the length of the string in 2-byte words (char16_t)
1458  * @param utf32_buffer   the pointer to buffer that can hold conversion result
1459  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1460  */
1461 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1462 
1463 /**
1464  * Using native endianness; Convert valid UTF-16 string into UTF-32 string.
1465  *
1466  * This function assumes that the input string is valid UTF-16 (native endianness).
1467  *
1468  * This function is not BOM-aware.
1469  *
1470  * @param input         the UTF-16 string to convert
1471  * @param length        the length of the string in 2-byte words (char16_t)
1472  * @param utf32_buffer   the pointer to buffer that can hold the conversion result
1473  * @return number of written words; 0 if conversion is not possible
1474  */
1475 simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1476 
1477 /**
1478  * Convert valid UTF-16LE string into UTF-32 string.
1479  *
1480  * This function assumes that the input string is valid UTF-16LE.
1481  *
1482  * This function is not BOM-aware.
1483  *
1484  * @param input         the UTF-16LE string to convert
1485  * @param length        the length of the string in 2-byte words (char16_t)
1486  * @param utf32_buffer   the pointer to buffer that can hold the conversion result
1487  * @return number of written words; 0 if conversion is not possible
1488  */
1489 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1490 
1491 /**
1492  * Convert valid UTF-16BE string into UTF-32 string.
1493  *
1494  * This function assumes that the input string is valid UTF-16BE.
1495  *
1496  * This function is not BOM-aware.
1497  *
1498  * @param input         the UTF-16BE string to convert
1499  * @param length        the length of the string in 2-byte words (char16_t)
1500  * @param utf32_buffer   the pointer to buffer that can hold the conversion result
1501  * @return number of written words; 0 if conversion is not possible
1502  */
1503 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1504 
1505 /**
1506  * Using native endianness; Compute the number of bytes that this UTF-16
1507  * string would require in UTF-8 format.
1508  *
1509  * This function does not validate the input.
1510  *
1511  * @param input         the UTF-16 string to convert
1512  * @param length        the length of the string in 2-byte words (char16_t)
1513  * @return the number of bytes required to encode the UTF-16 string as UTF-8
1514  */
1515 simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept;
1516 
1517 /**
1518  * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
1519  *
1520  * This function does not validate the input.
1521  *
1522  * @param input         the UTF-16LE string to convert
1523  * @param length        the length of the string in 2-byte words (char16_t)
1524  * @return the number of bytes required to encode the UTF-16LE string as UTF-8
1525  */
1526 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept;
1527 
1528 /**
1529  * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
1530  *
1531  * This function does not validate the input.
1532  *
1533  * @param input         the UTF-16BE string to convert
1534  * @param length        the length of the string in 2-byte words (char16_t)
1535  * @return the number of bytes required to encode the UTF-16BE string as UTF-8
1536  */
1537 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept;
1538 
1539 /**
1540  * Convert possibly broken UTF-32 string into UTF-8 string.
1541  *
1542  * During the conversion also validation of the input string is done.
1543  * This function is suitable to work with inputs from untrusted sources.
1544  *
1545  * This function is not BOM-aware.
1546  *
1547  * @param input         the UTF-32 string to convert
1548  * @param length        the length of the string in 4-byte words (char32_t)
1549  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1550  * @return number of written bytes (char); 0 if input is not a valid UTF-32 string
1551  */
1552 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1553 
1554 /**
1555  * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
1556  *
1557  * During the conversion also validation of the input string is done.
1558  * This function is suitable to work with inputs from untrusted sources.
1559  *
1560  * This function is not BOM-aware.
1561  *
1562  * @param input         the UTF-32 string to convert
1563  * @param length        the length of the string in 4-byte words (char32_t)
1564  * @param utf8_buffer   the pointer to buffer that can hold conversion result
1565  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1566  */
1567 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1568 
1569 /**
1570  * Convert valid UTF-32 string into UTF-8 string.
1571  *
1572  * This function assumes that the input string is valid UTF-32.
1573  *
1574  * This function is not BOM-aware.
1575  *
1576  * @param input         the UTF-32 string to convert
1577  * @param length        the length of the string in 4-byte words (char32_t)
1578  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1579  * @return number of written bytes (char); 0 if conversion is not possible
1580  */
1581 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1582 
1583 /**
1584  * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 string.
1585  *
1586  * During the conversion also validation of the input string is done.
1587  * This function is suitable to work with inputs from untrusted sources.
1588  *
1589  * This function is not BOM-aware.
1590  *
1591  * @param input         the UTF-32 string to convert
1592  * @param length        the length of the string in 4-byte words (char32_t)
1593  * @param utf16_buffer   the pointer to buffer that can hold conversion result
1594  * @return number of written words; 0 if input is not a valid UTF-32 string
1595  */
1596 simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1597 
1598 /**
1599  * Convert possibly broken UTF-32 string into UTF-16LE string.
1600  *
1601  * During the conversion also validation of the input string is done.
1602  * This function is suitable to work with inputs from untrusted sources.
1603  *
1604  * This function is not BOM-aware.
1605  *
1606  * @param input         the UTF-32 string to convert
1607  * @param length        the length of the string in 4-byte words (char32_t)
1608  * @param utf16_buffer   the pointer to buffer that can hold conversion result
1609  * @return number of written words; 0 if input is not a valid UTF-32 string
1610  */
1611 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1612 
1613 /**
1614  * Convert possibly broken UTF-32 string into UTF-16BE string.
1615  *
1616  * During the conversion also validation of the input string is done.
1617  * This function is suitable to work with inputs from untrusted sources.
1618  *
1619  * This function is not BOM-aware.
1620  *
1621  * @param input         the UTF-32 string to convert
1622  * @param length        the length of the string in 4-byte words (char32_t)
1623  * @param utf16_buffer   the pointer to buffer that can hold conversion result
1624  * @return number of written words; 0 if input is not a valid UTF-32 string
1625  */
1626 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1627 
1628 /**
1629  * Using native endianness; Convert possibly broken UTF-32 string into UTF-16
1630  * string and stop on error.
1631  *
1632  * During the conversion also validation of the input string is done.
1633  * This function is suitable to work with inputs from untrusted sources.
1634  *
1635  * This function is not BOM-aware.
1636  *
1637  * @param input         the UTF-32 string to convert
1638  * @param length        the length of the string in 4-byte words (char32_t)
1639  * @param utf16_buffer   the pointer to buffer that can hold conversion result
1640  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1641  */
1642 simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1643 
1644 /**
1645  * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
1646  *
1647  * During the conversion also validation of the input string is done.
1648  * This function is suitable to work with inputs from untrusted sources.
1649  *
1650  * This function is not BOM-aware.
1651  *
1652  * @param input         the UTF-32 string to convert
1653  * @param length        the length of the string in 4-byte words (char32_t)
1654  * @param utf16_buffer   the pointer to buffer that can hold conversion result
1655  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1656  */
1657 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1658 
1659 /**
1660  * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
1661  *
1662  * During the conversion also validation of the input string is done.
1663  * This function is suitable to work with inputs from untrusted sources.
1664  *
1665  * This function is not BOM-aware.
1666  *
1667  * @param input         the UTF-32 string to convert
1668  * @param length        the length of the string in 4-byte words (char32_t)
1669  * @param utf16_buffer   the pointer to buffer that can hold conversion result
1670  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1671  */
1672 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1673 
1674 /**
1675  * Using native endianness; Convert valid UTF-32 string into UTF-16 string.
1676  *
1677  * This function assumes that the input string is valid UTF-32.
1678  *
1679  * This function is not BOM-aware.
1680  *
1681  * @param input         the UTF-32 string to convert
1682  * @param length        the length of the string in 4-byte words (char32_t)
1683  * @param utf16_buffer   the pointer to buffer that can hold the conversion result
1684  * @return number of written words; 0 if conversion is not possible
1685  */
1686 simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1687 
1688 /**
1689  * Convert valid UTF-32 string into UTF-16LE string.
1690  *
1691  * This function assumes that the input string is valid UTF-32.
1692  *
1693  * This function is not BOM-aware.
1694  *
1695  * @param input         the UTF-32 string to convert
1696  * @param length        the length of the string in 4-byte words (char32_t)
1697  * @param utf16_buffer   the pointer to buffer that can hold the conversion result
1698  * @return number of written words; 0 if conversion is not possible
1699  */
1700 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1701 
1702 /**
1703  * Convert valid UTF-32 string into UTF-16BE string.
1704  *
1705  * This function assumes that the input string is valid UTF-32.
1706  *
1707  * This function is not BOM-aware.
1708  *
1709  * @param input         the UTF-32 string to convert
1710  * @param length        the length of the string in 4-byte words (char32_t)
1711  * @param utf16_buffer   the pointer to buffer that can hold the conversion result
1712  * @return number of written words; 0 if conversion is not possible
1713  */
1714 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1715 
1716 /**
1717  * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
1718  * from UTF-16BE to UTF-16LE.
1719  *
1720  * This function does not validate the input.
1721  *
1722  * This function is not BOM-aware.
1723  *
1724  * @param input         the UTF-16 string to process
1725  * @param length        the length of the string in 2-byte words (char16_t)
1726  * @param output        the pointer to buffer that can hold the conversion result
1727  */
1728 void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept;
1729 
1730 /**
1731  * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
1732  *
1733  * This function does not validate the input.
1734  *
1735  * @param input         the UTF-32 string to convert
1736  * @param length        the length of the string in 4-byte words (char32_t)
1737  * @return the number of bytes required to encode the UTF-32 string as UTF-8
1738  */
1739 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept;
1740 
1741 /**
1742  * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
1743  *
1744  * This function does not validate the input.
1745  *
1746  * @param input         the UTF-32 string to convert
1747  * @param length        the length of the string in 4-byte words (char32_t)
1748  * @return the number of two-byte words (char16_t) required to encode the UTF-32 string as UTF-16
1749  */
1750 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept;
1751 
1752 /**
1753  * Using native endianness; Compute the number of 4-byte words (char32_t) that
1754  * this UTF-16 string would require in UTF-32 format.
1755  *
1756  * This function is equivalent to count_utf16.
1757  *
1758  * This function does not validate the input.
1759  *
1760  * This function is not BOM-aware.
1761  *
1762  * @param input         the UTF-16 string to convert
1763  * @param length        the length of the string in 2-byte words (char16_t)
1764  * @return the number of 4-byte words (char32_t) required to encode the UTF-16 string as UTF-32
1765  */
1766 simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept;
1767 
1768 /**
1769  * Compute the number of 4-byte words (char32_t) that this UTF-16LE string would require in UTF-32 format.
1770  *
1771  * This function is equivalent to count_utf16le.
1772  *
1773  * This function does not validate the input.
1774  *
1775  * This function is not BOM-aware.
1776  *
1777  * @param input         the UTF-16LE string to convert
1778  * @param length        the length of the string in 2-byte words (char16_t)
1779  * @return the number of 4-byte words (char32_t) required to encode the UTF-16LE string as UTF-32
1780  */
1781 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept;
1782 
1783 /**
1784  * Compute the number of 4-byte words (char32_t) that this UTF-16BE string would require in UTF-32 format.
1785  *
1786  * This function is equivalent to count_utf16be.
1787  *
1788  * This function does not validate the input.
1789  *
1790  * This function is not BOM-aware.
1791  *
1792  * @param input         the UTF-16BE string to convert
1793  * @param length        the length of the string in 2-byte words (char16_t)
1794  * @return the number of 4-byte words (char32_t) required to encode the UTF-16BE string as UTF-32
1795  */
1796 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept;
1797 
1798 /**
1799  * Count the number of code points (characters) in the string assuming that
1800  * it is valid.
1801  *
1802  * This function assumes that the input string is valid UTF-16 (native endianness).
1803  *
1804  * This function is not BOM-aware.
1805  *
1806  * @param input         the UTF-16 string to process
1807  * @param length        the length of the string in 2-byte words (char16_t)
1808  * @return number of code points
1809  */
1810 simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept;
1811 
1812 /**
1813  * Count the number of code points (characters) in the string assuming that
1814  * it is valid.
1815  *
1816  * This function assumes that the input string is valid UTF-16LE.
1817  *
1818  * This function is not BOM-aware.
1819  *
1820  * @param input         the UTF-16LE string to process
1821  * @param length        the length of the string in 2-byte words (char16_t)
1822  * @return number of code points
1823  */
1824 simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept;
1825 
1826 /**
1827  * Count the number of code points (characters) in the string assuming that
1828  * it is valid.
1829  *
1830  * This function assumes that the input string is valid UTF-16BE.
1831  *
1832  * This function is not BOM-aware.
1833  *
1834  * @param input         the UTF-16BE string to process
1835  * @param length        the length of the string in 2-byte words (char16_t)
1836  * @return number of code points
1837  */
1838 simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept;
1839 
1840 /**
1841  * Count the number of code points (characters) in the string assuming that
1842  * it is valid.
1843  *
1844  * This function assumes that the input string is valid UTF-8.
1845  *
1846  * @param input         the UTF-8 string to process
1847  * @param length        the length of the string in bytes
1848  * @return number of code points
1849  */
1850 simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept;
1851 
1852 /**
1853  * An implementation of simdutf for a particular CPU architecture.
1854  *
1855  * Also used to maintain the currently active implementation. The active implementation is
1856  * automatically initialized on first use to the most advanced implementation supported by the host.
1857  */
1858 class implementation {
1859 public:
1860 
1861   /**
1862    * The name of this implementation. Example usage:
1863    *
1864    *     const implementation *impl = simdutf::active_implementation;
1865    *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
1866    *
1867    * @return a reference to the stored name (_name) of the implementation, e.g. "haswell", "westmere", "arm64"
1868    */
name()1869   virtual const std::string &name() const { return _name; }
1870 
1871   /**
1872    * The description of this implementation. Example usage:
1873    *
1874    *     const implementation *impl = simdutf::active_implementation;
1875    *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
1876    *
1877    * @return a reference to the stored description (_description) of the implementation
1878    */
description()1879   virtual const std::string &description() const { return _description; }
1880 
1881   /**
1882    * Check whether the instruction sets this implementation is compiled against
1883    * are available on the current CPU. This function may poll the current CPU/system
1884    * and should therefore not be called too often if performance is a concern.
1885    *
1886    *
1887    * @return true if the implementation can be safely used on the current system (determined at runtime)
1888    */
1889   bool supported_by_runtime_system() const;
1890 
1891   /**
1892    * This function will try to detect the encoding
1893    * @param input the string to identify
1894    * @param length the length of the string in bytes.
1895    * @return the encoding type detected
1896    */
1897   virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept;
1898 
1899   /**
1900    * This function will try to detect the possible encodings in one pass
1901    * @param input the string to identify
1902    * @param length the length of the string in bytes.
1903    * @return the encoding type detected
1904    */
1905   virtual int detect_encodings(const char * input, size_t length) const noexcept = 0;
1906 
1907   /**
1908    * @private For internal implementation use
1909    *
1910    * The instruction sets this implementation is compiled against.
1911    *
1912    * @return a mask of all required `internal::instruction_set::` values (the stored _required_instruction_sets member)
1913    */
required_instruction_sets()1914   virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }
1915 
1916 
1917   /**
1918    * Validate the UTF-8 string.
1919    *
1920    * Overridden by each implementation.
1921    *
1922    * @param buf the UTF-8 string to validate.
1923    * @param len the length of the string in bytes.
1924    * @return true if and only if the string is valid UTF-8.
1925    */
1926   simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
1927 
1928   /**
1929    * Validate the UTF-8 string and stop on errors.
1930    *
1931    * Overridden by each implementation.
1932    *
1933    * @param buf the UTF-8 string to validate.
1934    * @param len the length of the string in bytes.
1935    * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1936    */
1937   simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
1938 
1939   /**
1940    * Validate the ASCII string.
1941    *
1942    * Overridden by each implementation.
1943    *
1944    * @param buf the ASCII string to validate.
1945    * @param len the length of the string in bytes.
1946    * @return true if and only if the string is valid ASCII.
1947    */
1948   simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0;
1949 
1950   /**
1951    * Validate the ASCII string and stop on error.
1952    *
1953    * Overridden by each implementation.
1954    *
1955    * @param buf the ASCII string to validate.
1956    * @param len the length of the string in bytes.
1957    * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1958    */
1959   simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
1960 
1961   /**
1962    * Validate the UTF-16LE string. This function may be best when you expect
1963    * the input to be almost always valid. Otherwise, consider using
1964    * validate_utf16le_with_errors.
1965    *
1966    * Overridden by each implementation.
1967    *
1968    * This function is not BOM-aware.
1969    *
1970    * @param buf the UTF-16LE string to validate.
1971    * @param len the length of the string in number of 2-byte words (char16_t).
1972    * @return true if and only if the string is valid UTF-16LE.
1973    */
1974   simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
1975 
1976   /**
1977    * Validate the UTF-16BE string. This function may be best when you expect
1978    * the input to be almost always valid. Otherwise, consider using
1979    * validate_utf16be_with_errors.
1980    *
1981    * Overridden by each implementation.
1982    *
1983    * This function is not BOM-aware.
1984    *
1985    * @param buf the UTF-16BE string to validate.
1986    * @param len the length of the string in number of 2-byte words (char16_t).
1987    * @return true if and only if the string is valid UTF-16BE.
1988    */
1989   simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
1990 
1991   /**
1992    * Validate the UTF-16LE string and stop on error. It might be faster than
1993    * validate_utf16le when an error is expected to occur early.
1994    *
1995    * Overridden by each implementation.
1996    *
1997    * This function is not BOM-aware.
1998    *
1999    * @param buf the UTF-16LE string to validate.
2000    * @param len the length of the string in number of 2-byte words (char16_t).
2001    * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
2002    */
2003   simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2004 
2005   /**
2006    * Validate the UTF-16BE string and stop on error. It might be faster than
2007    * validate_utf16be when an error is expected to occur early.
2008    *
2009    * Overridden by each implementation.
2010    *
2011    * This function is not BOM-aware.
2012    *
2013    * @param buf the UTF-16BE string to validate.
2014    * @param len the length of the string in number of 2-byte words (char16_t).
2015    * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
2016    */
2017   simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2018 
2019   /**
2020    * Validate the UTF-32 string.
2021    *
2022    * Overridden by each implementation.
2023    *
2024    * This function is not BOM-aware.
2025    *
2026    * @param buf the UTF-32 string to validate.
2027    * @param len the length of the string in number of 4-byte words (char32_t).
2028    * @return true if and only if the string is valid UTF-32.
2029    */
2030   simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
2031 
2032   /**
2033    * Validate the UTF-32 string and stop on error.
2034    *
2035    * Overridden by each implementation.
2036    *
2037    * This function is not BOM-aware.
2038    *
2039    * @param buf the UTF-32 string to validate.
2040    * @param len the length of the string in number of 4-byte words (char32_t).
2041    * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
2042    */
2043   simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0;
2044 
2045   /**
2046    * Convert possibly broken UTF-8 string into UTF-16LE string.
2047    *
2048    * During the conversion also validation of the input string is done.
2049    * This function is suitable to work with inputs from untrusted sources.
2050    *
2051    * @param input         the UTF-8 string to convert
2052    * @param length        the length of the string in bytes
2053    * @param utf16_output  the pointer to buffer that can hold conversion result
2054    * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2055    */
2056   simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2057 
2058   /**
2059    * Convert possibly broken UTF-8 string into UTF-16BE string.
2060    *
2061    * During the conversion also validation of the input string is done.
2062    * This function is suitable to work with inputs from untrusted sources.
2063    *
2064    * @param input         the UTF-8 string to convert
2065    * @param length        the length of the string in bytes
2066    * @param utf16_output  the pointer to buffer that can hold conversion result
2067    * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2068    */
2069   simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2070 
2071   /**
2072    * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
2073    *
2074    * During the conversion also validation of the input string is done.
2075    * This function is suitable to work with inputs from untrusted sources.
2076    *
2077    * @param input         the UTF-8 string to convert
2078    * @param length        the length of the string in bytes
2079    * @param utf16_output  the pointer to buffer that can hold conversion result
2080    * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
2081    */
2082   simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2083 
2084   /**
2085    * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
2086    *
2087    * During the conversion also validation of the input string is done.
2088    * This function is suitable to work with inputs from untrusted sources.
2089    *
2090    * @param input         the UTF-8 string to convert
2091    * @param length        the length of the string in bytes
2092    * @param utf16_output  the pointer to buffer that can hold conversion result
2093    * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
2094    */
2095   simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2096 
2097   /**
2098    * Convert possibly broken UTF-8 string into UTF-32 string.
2099    *
2100    * During the conversion also validation of the input string is done.
2101    * This function is suitable to work with inputs from untrusted sources.
2102    *
2103    * @param input         the UTF-8 string to convert
2104    * @param length        the length of the string in bytes
2105    * @param utf32_output  the pointer to buffer that can hold conversion result
2106    * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
2107    */
2108   simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2109 
2110   /**
2111    * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
2112    *
2113    * During the conversion also validation of the input string is done.
2114    * This function is suitable to work with inputs from untrusted sources.
2115    *
2116    * @param input         the UTF-8 string to convert
2117    * @param length        the length of the string in bytes
2118    * @param utf32_output  the pointer to buffer that can hold conversion result
2119    * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
2120    */
2121   simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2122 
2123   /**
2124    * Convert valid UTF-8 string into UTF-16LE string.
2125    *
2126    * This function assumes that the input string is valid UTF-8.
2127    *
2128    * @param input         the UTF-8 string to convert
2129    * @param length        the length of the string in bytes
2130    * @param utf16_buffer  the pointer to buffer that can hold conversion result
2131    * @return the number of written char16_t
2132    */
2133   simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2134 
2135 /**
2136    * Convert valid UTF-8 string into UTF-16BE string.
2137    *
2138    * This function assumes that the input string is valid UTF-8.
2139    *
2140    * @param input         the UTF-8 string to convert
2141    * @param length        the length of the string in bytes
2142    * @param utf16_buffer  the pointer to buffer that can hold conversion result
2143    * @return the number of written char16_t
2144    */
2145   simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2146 
2147   /**
2148    * Convert valid UTF-8 string into UTF-32 string.
2149    *
2150    * This function assumes that the input string is valid UTF-8.
2151    *
2152    * @param input         the UTF-8 string to convert
2153    * @param length        the length of the string in bytes
2154    * @param utf32_buffer  the pointer to buffer that can hold conversion result
2155    * @return the number of written char32_t
2156    */
2157   simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2158 
2159   /**
2160    * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
2161    *
2162    * This function does not validate the input.
2163    *
2164    * @param input         the UTF-8 string to process
2165    * @param length        the length of the string in bytes
2166    * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
2167    */
2168   simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2169 
2170    /**
2171    * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format.
2172    *
2173    * This function is equivalent to count_utf8.
2174    *
2175    * This function does not validate the input.
2176    *
2177    * @param input         the UTF-8 string to process
2178    * @param length        the length of the string in bytes
2179    * @return the number of char32_t words required to encode the UTF-8 string as UTF-32
2180    */
2181   simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2182 
2183   /**
2184    * Convert possibly broken UTF-16LE string into UTF-8 string.
2185    *
2186    * During the conversion also validation of the input string is done.
2187    * This function is suitable to work with inputs from untrusted sources.
2188    *
2189    * This function is not BOM-aware.
2190    *
2191    * @param input         the UTF-16LE string to convert
2192    * @param length        the length of the string in 2-byte words (char16_t)
2193    * @param utf8_buffer   the pointer to buffer that can hold conversion result
2194    * @return number of written bytes (char); 0 if input is not a valid UTF-16LE string
2195    */
2196   simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2197 
2198   /**
2199    * Convert possibly broken UTF-16BE string into UTF-8 string.
2200    *
2201    * During the conversion also validation of the input string is done.
2202    * This function is suitable to work with inputs from untrusted sources.
2203    *
2204    * This function is not BOM-aware.
2205    *
2206    * @param input         the UTF-16BE string to convert
2207    * @param length        the length of the string in 2-byte words (char16_t)
2208    * @param utf8_buffer   the pointer to buffer that can hold conversion result
2209    * @return number of written bytes (char); 0 if input is not a valid UTF-16BE string
2210    */
2211   simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2212 
2213   /**
2214    * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2215    *
2216    * During the conversion also validation of the input string is done.
2217    * This function is suitable to work with inputs from untrusted sources.
2218    *
2219    * This function is not BOM-aware.
2220    *
2221    * @param input         the UTF-16LE string to convert
2222    * @param length        the length of the string in 2-byte words (char16_t)
2223    * @param utf8_buffer   the pointer to buffer that can hold conversion result
2224    * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
2225    */
2226   simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2227 
2228   /**
2229    * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2230    *
2231    * During the conversion also validation of the input string is done.
2232    * This function is suitable to work with inputs from untrusted sources.
2233    *
2234    * This function is not BOM-aware.
2235    *
2236    * @param input         the UTF-16BE string to convert
2237    * @param length        the length of the string in 2-byte words (char16_t)
2238    * @param utf8_buffer   the pointer to buffer that can hold conversion result
2239    * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
2240    */
2241   simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2242 
2243   /**
2244    * Convert valid UTF-16LE string into UTF-8 string.
2245    *
2246    * This function assumes that the input string is valid UTF-16LE.
2247    *
2248    * This function is not BOM-aware.
2249    *
2250    * @param input         the UTF-16LE string to convert
2251    * @param length        the length of the string in 2-byte words (char16_t)
2252    * @param utf8_buffer   the pointer to buffer that can hold the conversion result
2253    * @return number of written code units (char); 0 if conversion is not possible
2254    */
2255   simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2256 
2257   /**
2258    * Convert valid UTF-16BE string into UTF-8 string.
2259    *
2260    * This function assumes that the input string is valid UTF-16BE.
2261    *
2262    * This function is not BOM-aware.
2263    *
2264    * @param input         the UTF-16BE string to convert
2265    * @param length        the length of the string in 2-byte words (char16_t)
2266    * @param utf8_buffer   the pointer to buffer that can hold the conversion result
2267    * @return number of written code units (char); 0 if conversion is not possible
2268    */
2269   simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2270 
2271   /**
2272    * Convert possibly broken UTF-16LE string into UTF-32 string.
2273    *
2274    * During the conversion also validation of the input string is done.
2275    * This function is suitable to work with inputs from untrusted sources.
2276    *
2277    * This function is not BOM-aware.
2278    *
2279    * @param input         the UTF-16LE string to convert
2280    * @param length        the length of the string in 2-byte words (char16_t)
2281    * @param utf32_buffer   the pointer to buffer that can hold conversion result
2282    * @return number of written words; 0 if input is not a valid UTF-16LE string
2283    */
2284   simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2285 
2286   /**
2287    * Convert possibly broken UTF-16BE string into UTF-32 string.
2288    *
2289    * During the conversion also validation of the input string is done.
2290    * This function is suitable to work with inputs from untrusted sources.
2291    *
2292    * This function is not BOM-aware.
2293    *
2294    * @param input         the UTF-16BE string to convert
2295    * @param length        the length of the string in 2-byte words (char16_t)
2296    * @param utf32_buffer   the pointer to buffer that can hold conversion result
2297    * @return number of written words; 0 if input is not a valid UTF-16BE string
2298    */
2299   simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2300 
2301   /**
2302    * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2303    *
2304    * During the conversion also validation of the input string is done.
2305    * This function is suitable to work with inputs from untrusted sources.
2306    *
2307    * This function is not BOM-aware.
2308    *
2309    * @param input         the UTF-16LE string to convert
2310    * @param length        the length of the string in 2-byte words (char16_t)
2311    * @param utf32_buffer   the pointer to buffer that can hold conversion result
2312    * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
2313    */
2314   simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2315 
2316   /**
2317    * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2318    *
2319    * During the conversion also validation of the input string is done.
2320    * This function is suitable to work with inputs from untrusted sources.
2321    *
2322    * This function is not BOM-aware.
2323    *
2324    * @param input         the UTF-16BE string to convert
2325    * @param length        the length of the string in 2-byte words (char16_t)
2326    * @param utf32_buffer   the pointer to buffer that can hold conversion result
2327    * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
2328    */
2329   simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2330 
2331   /**
2332    * Convert valid UTF-16LE string into UTF-32 string.
2333    *
2334    * This function assumes that the input string is valid UTF-16LE.
2335    *
2336    * This function is not BOM-aware.
2337    *
2338    * @param input         the UTF-16LE string to convert
2339    * @param length        the length of the string in 2-byte words (char16_t)
2340    * @param utf32_buffer   the pointer to buffer that can hold the conversion result
2341    * @return number of written words; 0 if conversion is not possible
2342    */
2343   simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2344 
2345   /**
2346    * Convert valid UTF-16BE string into UTF-32 string.
2347    *
2348    * This function assumes that the input string is valid UTF-16BE.
2349    *
2350    * This function is not BOM-aware.
2351    *
2352    * @param input         the UTF-16BE string to convert
2353    * @param length        the length of the string in 2-byte words (char16_t)
2354    * @param utf32_buffer   the pointer to buffer that can hold the conversion result
2355    * @return number of written words; 0 if conversion is not possible
2356    */
2357   simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2358 
2359   /**
2360    * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
2361    *
2362    * This function does not validate the input.
2363    *
2364    * This function is not BOM-aware.
2365    *
2366    * @param input         the UTF-16LE string to convert
2367    * @param length        the length of the string in 2-byte words (char16_t)
2368    * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2369    */
2370   simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
2371 
2372   /**
2373    * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
2374    *
2375    * This function does not validate the input.
2376    *
2377    * This function is not BOM-aware.
2378    *
2379    * @param input         the UTF-16BE string to convert
2380    * @param length        the length of the string in 2-byte words (char16_t)
2381    * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2382    */
2383   simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
2384 
2385   /**
2386    * Convert possibly broken UTF-32 string into UTF-8 string.
2387    *
2388    * During the conversion also validation of the input string is done.
2389    * This function is suitable to work with inputs from untrusted sources.
2390    *
2391    * This function is not BOM-aware.
2392    *
2393    * @param input         the UTF-32 string to convert
2394    * @param length        the length of the string in 4-byte words (char32_t)
2395    * @param utf8_buffer   the pointer to buffer that can hold conversion result
2396    * @return number of written code units (char); 0 if input is not a valid UTF-32 string
2397    */
2398   simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2399 
2400   /**
2401    * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2402    *
2403    * During the conversion also validation of the input string is done.
2404    * This function is suitable to work with inputs from untrusted sources.
2405    *
2406    * This function is not BOM-aware.
2407    *
2408    * @param input         the UTF-32 string to convert
2409    * @param length        the length of the string in 4-byte words (char32_t)
2410    * @param utf8_buffer   the pointer to buffer that can hold conversion result
2411    * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
2412    */
2413   simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2414 
2415   /**
2416    * Convert valid UTF-32 string into UTF-8 string.
2417    *
2418    * This function assumes that the input string is valid UTF-32.
2419    *
2420    * This function is not BOM-aware.
2421    *
2422    * @param input         the UTF-32 string to convert
2423    * @param length        the length of the string in 4-byte words (char32_t)
2424    * @param utf8_buffer   the pointer to buffer that can hold the conversion result
2425    * @return number of written code units (char); 0 if conversion is not possible
2426    */
2427   simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2428 
2429   /**
2430    * Convert possibly broken UTF-32 string into UTF-16LE string.
2431    *
2432    * During the conversion also validation of the input string is done.
2433    * This function is suitable to work with inputs from untrusted sources.
2434    *
2435    * This function is not BOM-aware.
2436    *
2437    * @param input         the UTF-32 string to convert
2438    * @param length        the length of the string in 4-byte words (char32_t)
2439    * @param utf16_buffer   the pointer to buffer that can hold conversion result
2440    * @return number of written words; 0 if input is not a valid UTF-32 string
2441    */
2442   simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2443 
2444   /**
2445    * Convert possibly broken UTF-32 string into UTF-16BE string.
2446    *
2447    * During the conversion also validation of the input string is done.
2448    * This function is suitable to work with inputs from untrusted sources.
2449    *
2450    * This function is not BOM-aware.
2451    *
2452    * @param input         the UTF-32 string to convert
2453    * @param length        the length of the string in 4-byte words (char32_t)
2454    * @param utf16_buffer   the pointer to buffer that can hold conversion result
2455    * @return number of written words; 0 if input is not a valid UTF-32 string
2456    */
2457   simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2458 
2459   /**
2460    * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2461    *
2462    * During the conversion also validation of the input string is done.
2463    * This function is suitable to work with inputs from untrusted sources.
2464    *
2465    * This function is not BOM-aware.
2466    *
2467    * @param input         the UTF-32 string to convert
2468    * @param length        the length of the string in 4-byte words (char32_t)
2469    * @param utf16_buffer   the pointer to buffer that can hold conversion result
2470    * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
2471    */
2472   simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2473 
2474   /**
2475    * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2476    *
2477    * During the conversion also validation of the input string is done.
2478    * This function is suitable to work with inputs from untrusted sources.
2479    *
2480    * This function is not BOM-aware.
2481    *
2482    * @param input         the UTF-32 string to convert
2483    * @param length        the length of the string in 4-byte words (char32_t)
2484    * @param utf16_buffer   the pointer to buffer that can hold conversion result
2485    * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
2486    */
2487   simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2488 
2489   /**
2490    * Convert valid UTF-32 string into UTF-16LE string.
2491    *
2492    * This function assumes that the input string is valid UTF-32.
2493    *
2494    * This function is not BOM-aware.
2495    *
2496    * @param input         the UTF-32 string to convert
2497    * @param length        the length of the string in 4-byte words (char32_t)
2498    * @param utf16_buffer   the pointer to buffer that can hold the conversion result
2499    * @return number of written words; 0 if conversion is not possible
2500    */
2501   simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2502 
2503   /**
2504    * Convert valid UTF-32 string into UTF-16BE string.
2505    *
2506    * This function assumes that the input string is valid UTF-32.
2507    *
2508    * This function is not BOM-aware.
2509    *
2510    * @param input         the UTF-32 string to convert
2511    * @param length        the length of the string in 4-byte words (char32_t)
2512    * @param utf16_buffer   the pointer to buffer that can hold the conversion result
2513    * @return number of written words; 0 if conversion is not possible
2514    */
2515   simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2516 
2517   /**
2518    * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
2519    * from UTF-16BE to UTF-16LE.
2520    *
2521    * This function does not validate the input.
2522    *
2523    * This function is not BOM-aware.
2524    *
2525    * @param input         the UTF-16 string to process
2526    * @param length        the length of the string in 2-byte words (char16_t)
2527    * @param output        the pointer to buffer that can hold the conversion result
2528    */
2529   virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0;
2530 
2531   /**
2532    * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
2533    *
2534    * This function does not validate the input.
2535    *
2536    * @param input         the UTF-32 string to convert
2537    * @param length        the length of the string in 4-byte words (char32_t)
2538    * @return the number of bytes required to encode the UTF-32 string as UTF-8
2539    */
2540   simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
2541 
2542   /**
2543    * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
2544    *
2545    * This function does not validate the input.
2546    *
2547    * @param input         the UTF-32 string to convert
2548    * @param length        the length of the string in 4-byte words (char32_t)
2549    * @return the number of two-byte words (char16_t) required to encode the UTF-32 string as UTF-16
2550    */
2551   simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
2552 
2553   /**
2554    * Compute the number of char32_t words that this UTF-16LE string would require in UTF-32 format.
2555    *
2556    * This function is equivalent to count_utf16le.
2557    *
2558    * This function does not validate the input.
2559    *
2560    * This function is not BOM-aware.
2561    *
2562    * @param input         the UTF-16LE string to convert
2563    * @param length        the length of the string in 2-byte words (char16_t)
2564    * @return the number of char32_t words required to encode the UTF-16LE string as UTF-32
2565    */
2566   simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
2567 
2568   /**
2569    * Compute the number of char32_t words that this UTF-16BE string would require in UTF-32 format.
2570    *
2571    * This function is equivalent to count_utf16be.
2572    *
2573    * This function does not validate the input.
2574    *
2575    * This function is not BOM-aware.
2576    *
2577    * @param input         the UTF-16BE string to convert
2578    * @param length        the length of the string in 2-byte words (char16_t)
2579    * @return the number of char32_t words required to encode the UTF-16BE string as UTF-32
2580    */
2581   simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
2582 
2583   /**
2584    * Count the number of code points (characters) in the string assuming that
2585    * it is valid.
2586    *
2587    * This function assumes that the input string is valid UTF-16LE.
2588    *
2589    * This function is not BOM-aware.
2590    *
2591    * @param input         the UTF-16LE string to process
2592    * @param length        the length of the string in 2-byte words (char16_t)
2593    * @return number of code points
2594    */
2595   simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0;
2596 
2597   /**
2598    * Count the number of code points (characters) in the string assuming that
2599    * it is valid.
2600    *
2601    * This function assumes that the input string is valid UTF-16BE.
2602    *
2603    * This function is not BOM-aware.
2604    *
2605    * @param input         the UTF-16BE string to process
2606    * @param length        the length of the string in 2-byte words (char16_t)
2607    * @return number of code points
2608    */
2609   simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0;
2610 
2611 
2612   /**
2613    * Count the number of code points (characters) in the string assuming that
2614    * it is valid.
2615    *
2616    * This function assumes that the input string is valid UTF-8.
2617    *
2618    * @param input         the UTF-8 string to process
2619    * @param length        the length of the string in bytes
2620    * @return number of code points
2621    */
2622   simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0;
2623 
2624 
2625 
2626 protected:
2627   /**
        * @private Construct an implementation with the given name and description. For subclasses.
        *
        * Each argument is copied into the matching const member; the constructor body is empty.
        *
        * NOTE(review): name and description are received by value and then copied again into
        * the members; moving them into place would avoid the second copy.
        *
        * @param name                       stored in _name
        * @param description                stored in _description
        * @param required_instruction_sets  stored in _required_instruction_sets
        */
implementation(std::string name,std::string description,uint32_t required_instruction_sets)2628   simdutf_really_inline implementation(
2629     std::string name,
2630     std::string description,
2631     uint32_t required_instruction_sets
2632   ) :
2633     _name(name),
2634     _description(description),
2635     _required_instruction_sets(required_instruction_sets)
2636   {
2637   }
       /** Defaulted virtual destructor; declared in the protected section, like the constructor. */
2638   virtual ~implementation()=default;
2639 
2640 private:
2641   /**
2642    * The name of this implementation.
2643    */
2644   const std::string _name;
2645 
2646   /**
2647    * The description of this implementation.
2648    */
2649   const std::string _description;
2650 
2651   /**
2652    * Instruction sets required for this implementation.
2653    */
2654   const uint32_t _required_instruction_sets;
2655 };
2656 
2657 /** @private */
2658 namespace internal {
2659 
2660 /**
2661  * The list of available implementations compiled into simdutf.
      *
      * Provides size(), begin() and end(), so the list can be traversed with a range-based
      * for loop (operator[] does so), plus lookup by name and detection of the best
      * implementation for the current host.
2662  */
2663 class available_implementation_list {
2664 public:
2665   /** Construct the list object; the constructor body is empty. */
available_implementation_list()2666   simdutf_really_inline available_implementation_list() {}
2667   /** Number of implementations */
2668   size_t size() const noexcept;
2669   /** STL const begin() iterator */
2670   const implementation * const *begin() const noexcept;
2671   /** STL const end() iterator */
2672   const implementation * const *end() const noexcept;
2673 
2674   /**
2675    * Get the implementation with the given name.
2676    *
2677    * Case sensitive.
2678    *
2679    *     const implementation *impl = simdutf::get_available_implementations()["westmere"];
2680    *     if (!impl) { exit(1); }
2681    *     if (!impl->supported_by_runtime_system()) { exit(1); }
2682    *     simdutf::get_active_implementation() = impl;
2683    *
2684    * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
2685    * @return the implementation, or nullptr if no implementation in the list has that name.
2686    */
2687   const implementation * operator[](const std::string &name) const noexcept {
         // Linear scan over begin()..end(); the first entry whose name() equals `name` wins.
2688     for (const implementation * impl : *this) {
2689       if (impl->name() == name) { return impl; }
2690     }
2691     return nullptr;
2692   }
2693 
2694   /**
2695    * Detect the most advanced implementation supported by the current host.
2696    *
2697    * This is used to initialize the implementation on startup.
2698    *
2699    *     const implementation *impl = simdutf::get_available_implementations().detect_best_supported();
2700    *     simdutf::get_active_implementation() = impl;
2701    *
2702    * @return the most advanced supported implementation for the current host, or an
2703    *         implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
2704    *         implementation. Will never return nullptr.
2705    */
2706   const implementation *detect_best_supported() const noexcept;
2707 };
2708 
/**
 * Pointer holder whose storage depends on SIMDUTF_NO_THREADS: a plain T* when the macro
 * is defined, a std::atomic<T*> otherwise. Both variants expose the same interface:
 * implicit conversion to T* / const T*, dereference, member access, and assignment from
 * a raw pointer.
 *
 * The pointee is not owned: no destructor is declared and nothing here frees it.
 */
2709 template<typename T>
2710 class atomic_ptr {
2711 public:
       /** Wrap _ptr. Not explicit, so a T* converts implicitly to atomic_ptr<T>. */
atomic_ptr(T * _ptr)2712   atomic_ptr(T *_ptr) : ptr{_ptr} {}
2713 
2714 #if defined(SIMDUTF_NO_THREADS)
       // Single-threaded build: every accessor reads or writes the raw pointer directly.
2715   operator const T*() const { return ptr; }
2716   const T& operator*() const { return *ptr; }
2717   const T* operator->() const { return ptr; }
2718 
2719   operator T*() { return ptr; }
2720   T& operator*() { return *ptr; }
2721   T* operator->() { return ptr; }
       /** Replace the held pointer and return *this. */
2722   atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
2723 
2724 #else
       // Threaded build: conversions and operator-> call ptr.load() with its default
       // memory order. operator* writes `*ptr`, which relies on std::atomic<T*>'s implicit
       // conversion to T* to obtain the pointer before dereferencing.
2725   operator const T*() const { return ptr.load(); }
2726   const T& operator*() const { return *ptr; }
2727   const T* operator->() const { return ptr.load(); }
2728 
2729   operator T*() { return ptr.load(); }
2730   T& operator*() { return *ptr; }
2731   T* operator->() { return ptr.load(); }
       /** Store a new pointer through std::atomic's assignment operator and return *this. */
2732   atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
2733 
2734 #endif
2735 
2736 private:
       // Storage mirrors the accessor selection above.
2737 #if defined(SIMDUTF_NO_THREADS)
2738   T* ptr;
2739 #else
2740   std::atomic<T*> ptr;
2741 #endif
2742 };
2743 
2744 class detect_best_supported_implementation_on_first_use;
2745 
2746 } // namespace internal
2747 
2748 /**
2749  * The list of available implementations compiled into simdutf.
2750  */
2751 extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations();
2752 
2753 /**
2754   * The active implementation.
2755   *
2756   * Automatically initialized on first use to the most advanced implementation supported by this hardware.
2757   */
2758 extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation();
2759 
2760 
2761 } // namespace simdutf
2762 
2763 #endif // SIMDUTF_IMPLEMENTATION_H
2764 /* end file include/simdutf/implementation.h */
2765 
2766 
2767 // Implementation-internal files (must be included before the implementations themselves, to keep
2768 // amalgamation working--otherwise, the first time a file is included, it might be put inside the
2769 // #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't
2770 // compile unless that implementation is turned on).
2771 
2772 
2773 SIMDUTF_POP_DISABLE_WARNINGS
2774 
2775 #endif // SIMDUTF_H
2776 /* end file include/simdutf.h */
2777