1 /* auto-generated on 2023-06-05 08:58:28 -0400. Do not edit! */
2 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf.h
3 /* begin file include/simdutf.h */
4 #ifndef SIMDUTF_H
5 #define SIMDUTF_H
6 #include <cstring>
7
8 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h
9 /* begin file include/simdutf/compiler_check.h */
// Compiler sanity checks: require C++, compute the language-standard version
// in SIMDUTF_CPLUSPLUS, derive SIMDUTF_CPLUSPLUS11/14/17 from it, and reject
// anything older than C++11.
#ifndef SIMDUTF_COMPILER_CHECK_H
#define SIMDUTF_COMPILER_CHECK_H

// simdutf cannot be compiled as C.
#ifndef __cplusplus
#error simdutf requires a C++ compiler
#endif

// SIMDUTF_CPLUSPLUS may be predefined by the user, in which case it is left
// untouched.
#ifndef SIMDUTF_CPLUSPLUS
#if defined(_MSVC_LANG) && !defined(__clang__)
// _MSVC_LANG defined and not clang: use _MSVC_LANG, except when _MSC_VER is
// exactly 1900, which is treated as C++11 (201103L).
#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
#else
#define SIMDUTF_CPLUSPLUS __cplusplus
#endif
#endif

// C++ 17
#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
#define SIMDUTF_CPLUSPLUS17 1
#endif

// C++ 14
#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
#define SIMDUTF_CPLUSPLUS14 1
#endif

// C++ 11
#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
#define SIMDUTF_CPLUSPLUS11 1
#endif

// C++11 is the minimum: stop the build if it was not detected above.
#ifndef SIMDUTF_CPLUSPLUS11
#error simdutf requires a compiler compliant with the C++11 standard
#endif

#endif // SIMDUTF_COMPILER_CHECK_H
45 /* end file include/simdutf/compiler_check.h */
46 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h
47 /* begin file include/simdutf/common_defs.h */
48 #ifndef SIMDUTF_COMMON_DEFS_H
49 #define SIMDUTF_COMMON_DEFS_H
50
51 #include <cassert>
52 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/portability.h
53 /* begin file include/simdutf/portability.h */
54 #ifndef SIMDUTF_PORTABILITY_H
55 #define SIMDUTF_PORTABILITY_H
56
57 #include <cstddef>
58 #include <cstdint>
59 #include <cstdlib>
60 #include <cfloat>
61 #include <cassert>
62 #ifndef _WIN32
63 // strcasecmp, strncasecmp
64 #include <strings.h>
65 #endif
66
/**
 * Compile-time byte-order detection.
 *
 * Defines SIMDUTF_IS_BIG_ENDIAN to a value usable in #if: non-zero on a
 * big-endian target, zero otherwise. Detection order:
 *  1. the compiler-provided __BYTE_ORDER__ / __ORDER_BIG_ENDIAN__ macros;
 *  2. _WIN32 targets are taken to be little endian;
 *  3. otherwise a platform endian header is included and __BYTE_ORDER__ is
 *     consulted again; if it is still unavailable, little endian is assumed.
 */

#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#elif defined(_WIN32)
#define SIMDUTF_IS_BIG_ENDIAN 0
#else
#if defined(__APPLE__) || defined(__FreeBSD__)
#include <machine/endian.h>
#elif defined(sun) || defined(__sun)
#include <sys/byteorder.h>
#else // neither Apple/FreeBSD nor Solaris

#ifdef __has_include
#if __has_include(<endian.h>)
#include <endian.h>
#endif //__has_include(<endian.h>)
#endif //__has_include

#endif // platform endian header

// A single #if/#elif/#else chain so that SIMDUTF_IS_BIG_ENDIAN is defined
// exactly once and the comparison is only evaluated when both macros exist.
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
// Nothing to compare against: assume little endian.
#define SIMDUTF_IS_BIG_ENDIAN 0
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define SIMDUTF_IS_BIG_ENDIAN 0
#else
#define SIMDUTF_IS_BIG_ENDIAN 1
#endif

#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__


/**
 * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined.
 */
108
// Visual Studio detection. SIMDUTF_VISUAL_STUDIO is set whenever _MSC_VER is
// defined; exactly one of SIMDUTF_CLANG_VISUAL_STUDIO and
// SIMDUTF_REGULAR_VISUAL_STUDIO is then set depending on __clang__.
#ifdef _MSC_VER
#define SIMDUTF_VISUAL_STUDIO 1
/**
 * We want to differentiate carefully between
 * clang under visual studio and regular visual
 * studio.
 *
 * Under clang for Windows, we enable:
 *  * target pragmas so that part and only part of the
 *     code gets compiled for advanced instructions.
 *
 */
#ifdef __clang__
// clang under visual studio
#define SIMDUTF_CLANG_VISUAL_STUDIO 1
#else
// just regular visual studio (best guess)
#define SIMDUTF_REGULAR_VISUAL_STUDIO 1
#endif // __clang__
#endif // _MSC_VER

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
// https://en.wikipedia.org/wiki/C_alternative_tokens
// This header should have no effect, except maybe
// under Visual Studio.
#include <iso646.h>
#endif
136
// Target-architecture classification. Exactly one branch of the chain below
// is taken; targets that match none of the known 64-bit tests are flagged
// with SIMDUTF_IS_32BITS.
#if defined(__x86_64__) || defined(_M_AMD64)
#define SIMDUTF_IS_X86_64 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#define SIMDUTF_IS_ARM64 1
#elif defined(__PPC64__) || defined(_M_PPC64)
//#define SIMDUTF_IS_PPC64 1
// The simdutf library does not yet support SIMD acceleration under
// POWER processors. Please see https://github.com/lemire/simdutf/issues/51
#elif defined(__s390__)
// s390 IBM system. Big endian.
#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
// RISC-V 64-bit
#else
// The simdutf library is designed
// for 64-bit processors and it seems that you are not
// compiling for a known 64-bit platform. Please
// use a 64-bit target such as x64 or 64-bit ARM for best performance.
#define SIMDUTF_IS_32BITS 1

// We do not support 32-bit platforms, but it can be
// handy to identify them.
#if defined(_M_IX86) || defined(__i386__)
#define SIMDUTF_IS_X86_32BITS 1
#elif defined(__arm__) || defined(_M_ARM)
#define SIMDUTF_IS_ARM_32BITS 1
#elif defined(__PPC__) || defined(_M_PPC)
#define SIMDUTF_IS_PPC_32BITS 1
#endif

#endif // defined(__x86_64__) || defined(_M_AMD64)

// Emit a build-time note on unrecognized targets; define
// SIMDUTF_NO_PORTABILITY_WARNING to silence it.
#ifdef SIMDUTF_IS_32BITS
#ifndef SIMDUTF_NO_PORTABILITY_WARNING
#pragma message("The simdutf library is designed \
for 64-bit processors and it seems that you are not \
compiling for a known 64-bit platform. All fast kernels \
will be disabled and performance may be poor. Please \
use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
#endif // SIMDUTF_NO_PORTABILITY_WARNING
#endif // SIMDUTF_IS_32BITS
177
// Two-level stringification: SIMDUTF_STRINGIFY macro-expands its argument
// before handing it to the inner macro, which applies the # operator.
#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(tokens) #tokens
#define SIMDUTF_STRINGIFY(tokens) SIMDUTF_STRINGIFY_IMPLEMENTATION_(tokens)
181
// Our fast kernels require 64-bit systems.
//
// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
// Furthermore, the number of SIMD registers is reduced.
//
// On 32-bit ARM, we would have smaller registers.
//
// The simdutf users should still have the fallback kernel. It is
// slower, but it should run everywhere.

//
// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION
//

// We are going to use runtime dispatch.
// SIMDUTF_TARGET_REGION(T) opens a region whose functions are compiled for
// target T (a string such as would be passed to __attribute__((target)));
// SIMDUTF_UNTARGET_REGION closes it. They are only given a real definition on
// x64 with clang or GCC.
#ifdef SIMDUTF_IS_X86_64
#ifdef __clang__
// clang does not have GCC push pop
// warning: clang attribute push can't be used within a namespace in clang up
// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a
// namespace.
#define SIMDUTF_TARGET_REGION(T)                                                       \
  _Pragma(SIMDUTF_STRINGIFY(                                                           \
      clang attribute push(__attribute__((target(T))), apply_to = function)))
#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(__GNUC__)
// GCC is easier
#define SIMDUTF_TARGET_REGION(T)                                                       \
  _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
#endif // clang then gcc

#endif // x86

// Default target region macros don't do anything.
// (Reached on non-x64 targets and on x64 compilers that are neither clang nor GCC.)
#ifndef SIMDUTF_TARGET_REGION
#define SIMDUTF_TARGET_REGION(T)
#define SIMDUTF_UNTARGET_REGION
#endif
221
// Is threading enabled?
// SIMDUTF_THREADS_ENABLED is defined when the toolchain advertises a
// multithreaded build through _REENTRANT or _MT (unless already defined).
#if defined(_REENTRANT) || defined(_MT)
#ifndef SIMDUTF_THREADS_ENABLED
#define SIMDUTF_THREADS_ENABLED
#endif
#endif

// workaround for large stack sizes under -O0.
// https://github.com/simdutf/simdutf/issues/691
#ifdef __APPLE__
#ifndef __OPTIMIZE__
// Apple systems have small stack sizes in secondary threads.
// Lack of compiler optimization may generate high stack usage.
// Users may want to disable threads for safety, but only when
// in debug mode which we detect by the fact that the __OPTIMIZE__
// macro is not defined.
// Note: this #undef also removes a SIMDUTF_THREADS_ENABLED that the user
// defined explicitly.
#undef SIMDUTF_THREADS_ENABLED
#endif
#endif
241
// Portable names for the case-insensitive string comparison functions:
// simdutf_strcasecmp / simdutf_strncasecmp map to the platform's spelling.
#ifdef SIMDUTF_VISUAL_STUDIO
// This is one case where we do not distinguish between
// regular visual studio and clang under visual studio.
// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has)
#define simdutf_strcasecmp _stricmp
#define simdutf_strncasecmp _strnicmp
#else
// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8).
// So they are only useful for ASCII in our context.
// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
#define simdutf_strcasecmp strcasecmp
#define simdutf_strncasecmp strncasecmp
#endif
255
// SIMDUTF_UNREACHABLE() marks a point that control flow never reaches and
// SIMDUTF_ASSUME(COND) states a condition that always holds. With NDEBUG they
// become optimizer hints; without NDEBUG they are checked with assert().
//
// NOTE(review): the __builtin_unreachable and assert variants of
// SIMDUTF_UNREACHABLE() carry a trailing ';' while the __assume variant does
// not, so `SIMDUTF_UNREACHABLE();` expands to two statements on some
// configurations (a problem in an unbraced if/else). Left as is because the
// call sites are not visible from here.
#ifdef NDEBUG

#ifdef SIMDUTF_VISUAL_STUDIO
#define SIMDUTF_UNREACHABLE() __assume(0)
#define SIMDUTF_ASSUME(COND) __assume(COND)
#else
#define SIMDUTF_UNREACHABLE() __builtin_unreachable();
#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0)
#endif

#else // NDEBUG

#define SIMDUTF_UNREACHABLE() assert(0);
#define SIMDUTF_ASSUME(COND) assert(COND)

#endif
272
273
// SIMDUTF_GCC11ORMORE is defined for genuine GCC (clang also defines __GNUC__
// and is excluded) at major version 11 or later.
#if defined(__GNUC__) && !defined(__clang__)
#if __GNUC__ >= 11
#define SIMDUTF_GCC11ORMORE 1
#endif // __GNUC__ >= 11
#endif // defined(__GNUC__) && !defined(__clang__)
279
280
281 #endif // SIMDUTF_PORTABILITY_H
282 /* end file include/simdutf/portability.h */
283 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/avx512.h
284 /* begin file include/simdutf/avx512.h */
285 #ifndef SIMDUTF_AVX512_H_
286 #define SIMDUTF_AVX512_H_
287
/*
    It's possible to override AVX512 settings with cmake DCMAKE_CXX_FLAGS.

    All preprocessor definitions have the form `SIMDUTF_HAS_AVX512{feature}`,
    where a feature is a code name for an extension.

    Each macro below is defined to 1 when the compiler predefines the matching
    `__AVX512{feature}__` macro with the value 1, unless the SIMDUTF_HAS_*
    macro was already defined by the user.

    Please see the listing below to find which are supported.
*/

#ifndef SIMDUTF_HAS_AVX512F
# if defined(__AVX512F__) && __AVX512F__ == 1
#   define SIMDUTF_HAS_AVX512F 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512DQ
# if defined(__AVX512DQ__) && __AVX512DQ__ == 1
#   define SIMDUTF_HAS_AVX512DQ 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512IFMA
# if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
#   define SIMDUTF_HAS_AVX512IFMA 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512CD
# if defined(__AVX512CD__) && __AVX512CD__ == 1
#   define SIMDUTF_HAS_AVX512CD 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512BW
# if defined(__AVX512BW__) && __AVX512BW__ == 1
#   define SIMDUTF_HAS_AVX512BW 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VL
# if defined(__AVX512VL__) && __AVX512VL__ == 1
#   define SIMDUTF_HAS_AVX512VL 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VBMI
# if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
#   define SIMDUTF_HAS_AVX512VBMI 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VBMI2
# if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
#   define SIMDUTF_HAS_AVX512VBMI2 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VNNI
# if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
#   define SIMDUTF_HAS_AVX512VNNI 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512BITALG
# if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
#   define SIMDUTF_HAS_AVX512BITALG 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
# if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
#   define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
# endif
#endif
362
363 #endif // SIMDUTF_AVX512_H_
364 /* end file include/simdutf/avx512.h */
365
366
// Helpers that bracket a piece of code with named LLVM-MCA region markers.
//
//   SIMDUTF_BEGIN_DEBUG_BLOCK(name) / SIMDUTF_END_DEBUG_BLOCK(name)
//       emit the begin/end marker for region `name`;
//   SIMDUTF_DEBUG_BLOCK(name, block)
//       runs `block` between the two markers.
//
// On compilers that do not define __GNUC__ the markers expand to nothing and
// SIMDUTF_DEBUG_BLOCK still executes `block`, so wrapped code behaves the same
// everywhere.
#if defined(__GNUC__)
// Marks a block with a name so that MCA analysis can see it.
#define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
#define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
// Uses the SIMDUTF_-prefixed markers: unprefixed BEGIN_DEBUG_BLOCK /
// END_DEBUG_BLOCK macros do not exist in this header.
#define SIMDUTF_DEBUG_BLOCK(name, block) SIMDUTF_BEGIN_DEBUG_BLOCK(name); block; SIMDUTF_END_DEBUG_BLOCK(name);
#else
#define SIMDUTF_BEGIN_DEBUG_BLOCK(name)
#define SIMDUTF_END_DEBUG_BLOCK(name)
// No markers available, but the wrapped code must still run.
#define SIMDUTF_DEBUG_BLOCK(name, block) block;
#endif
377
// Alignment helpers. All three rely on masking with (alignment - 1), so the
// results are only meaningful when `alignment` is a power of two.
//   SIMDUTF_ROUNDUP_N   - smallest multiple of `alignment` that is >= value
//   SIMDUTF_ROUNDDOWN_N - largest multiple of `alignment` that is <= value
#define SIMDUTF_ROUNDUP_N(value, alignment) (((value) + ((alignment)-1)) & ~((alignment)-1))
#define SIMDUTF_ROUNDDOWN_N(value, alignment) ((value) & ~((alignment)-1))

//   SIMDUTF_ISALIGNED_N - true when the address is a multiple of `alignment`
#define SIMDUTF_ISALIGNED_N(address, alignment) (((uintptr_t)(address) & ((alignment)-1)) == 0)
383
// Compiler-specific attribute and warning-control macros. The first branch is
// for regular Visual Studio; the second is for everything else (GCC, clang,
// and clang under Visual Studio) and uses GNU-style attributes and pragmas.
#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)

  // Inlining control.
  #define simdutf_really_inline __forceinline
  #define simdutf_never_inline __declspec(noinline)

  // No equivalent is provided here for regular Visual Studio: both expand to nothing.
  #define simdutf_unused
  #define simdutf_warn_unused

  // Branch-prediction hints are no-ops here.
  #ifndef simdutf_likely
  #define simdutf_likely(x) x
  #endif
  #ifndef simdutf_unlikely
  #define simdutf_unlikely(x) x
  #endif

  #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push ))
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 ))
  #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER ))
  // Get rid of Intellisense-only warnings (Code Analysis)
  // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
  #ifdef __has_include
  #if __has_include(<CppCoreCheck\Warnings.h>)
  #include <CppCoreCheck\Warnings.h>
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
  #endif
  #endif

  // Fallback when the CppCoreCheck header is not available.
  #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
  #endif

  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
  #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))

#else // SIMDUTF_REGULAR_VISUAL_STUDIO

  // Inlining control.
  #define simdutf_really_inline inline __attribute__((always_inline))
  #define simdutf_never_inline inline __attribute__((noinline))

  #define simdutf_unused __attribute__((unused))
  #define simdutf_warn_unused __attribute__((warn_unused_result))

  // Branch-prediction hints; !!(x) normalizes the condition to 0 or 1.
  #ifndef simdutf_likely
  #define simdutf_likely(x) __builtin_expect(!!(x), 1)
  #endif
  #ifndef simdutf_unlikely
  #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
  #endif

  #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
  // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
  // (SIMDUTF_DISABLE_GCC_WARNING is defined just below; that is fine because
  // macros are only expanded where they are used.)
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \
    SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
  #define SIMDUTF_PRAGMA(P) _Pragma(#P)
  #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
  #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
  #else
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
  #endif
  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
  #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")



#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
462
// SIMDUTF_DLLIMPORTEXPORT: symbol import/export decoration. Under Visual
// Studio it is __declspec(dllimport) when SIMDUTF_USING_LIBRARY evaluates to
// non-zero and __declspec(dllexport) otherwise; elsewhere it is empty. A
// definition supplied by the user takes precedence.
#ifndef SIMDUTF_DLLIMPORTEXPORT
    #if defined(SIMDUTF_VISUAL_STUDIO)
      /**
       * It does not matter here whether you are using
       * the regular visual studio or clang under visual
       * studio.
       */
      // An undefined SIMDUTF_USING_LIBRARY evaluates to 0 here, i.e. dllexport.
      #if SIMDUTF_USING_LIBRARY
      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
      #else
      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
      #endif
    #else
      #define SIMDUTF_DLLIMPORTEXPORT
    #endif
#endif
479
/// Evaluates EXPR exactly once. If the value converts to true (an error), it
/// is returned from the enclosing function; otherwise execution continues.
#define SIMDUTF_TRY(EXPR)                          \
  {                                                \
    auto simdutf_try_outcome_ = (EXPR);            \
    if (simdutf_try_outcome_) {                    \
      return simdutf_try_outcome_;                 \
    }                                              \
  }
482
483
484 #endif // SIMDUTF_COMMON_DEFS_H
485 /* end file include/simdutf/common_defs.h */
486 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h
487 /* begin file include/simdutf/encoding_types.h */
488 #include <string>
489
490 namespace simdutf {
491
/**
 * Text encodings known to the library. Every named encoding occupies its own
 * bit; `unspecified` is the zero value. The trailing comments give the byte
 * order mark (BOM) of each encoding.
 */
enum encoding_type {
        unspecified = 0,

        UTF8     = 1 << 0,  // BOM 0xef 0xbb 0xbf
        UTF16_LE = 1 << 1,  // BOM 0xff 0xfe
        UTF16_BE = 1 << 2,  // BOM 0xfe 0xff
        UTF32_LE = 1 << 3,  // BOM 0xff 0xfe 0x00 0x00
        UTF32_BE = 1 << 4   // BOM 0x00 0x00 0xfe 0xff
};
501
/// Byte order selector.
enum endianness {
        LITTLE = 0,
        BIG = 1
};
506
// NOTE(review): presumably reports whether `e` is the byte order of the
// system the code runs on; the definition is not visible here -- confirm.
bool match_system(endianness e);

// Returns a string for the given encoding type (definition not visible here).
std::string to_string(encoding_type bom);
510
// Note that BOM for UTF8 is discouraged.
namespace BOM {

/**
 * Checks for a BOM. If none is found, returns unspecified.
 * Two overloads are provided, for uint8_t and for char input.
 * @param byte the string to process
 * @param length the length of the string in words
 * @return the corresponding encoding
 */

encoding_type check_bom(const uint8_t* byte, size_t length);
encoding_type check_bom(const char* byte, size_t length);
/**
 * Returns the size, in bytes, of the BOM for a given encoding type.
 * Note that UTF8 BOM are discouraged.
 * @param bom the encoding type
 * @return the size in bytes of the corresponding BOM
 */
size_t bom_byte_size(encoding_type bom);

} // BOM namespace
532 } // simdutf namespace
533 /* end file include/simdutf/encoding_types.h */
534 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/error.h
535 /* begin file include/simdutf/error.h */
536 #ifndef ERROR_H
537 #define ERROR_H
538 namespace simdutf {
539
/// Reasons a validation or transcoding call can fail; SUCCESS is zero.
enum error_code {
  SUCCESS = 0,
  // Any byte must have fewer than 5 header bits.
  HEADER_BITS = 1,
  // The leading byte must be followed by N-1 continuation bytes, where N is
  // the UTF-8 character length. This is also the error when the input is
  // truncated.
  TOO_SHORT = 2,
  // We either have too many consecutive continuation bytes or the string
  // starts with a continuation byte.
  TOO_LONG = 3,
  // The decoded character must be above U+7F for two-byte characters, U+7FF
  // for three-byte characters, and U+FFFF for four-byte characters.
  OVERLONG = 4,
  // The decoded character must be less than or equal to U+10FFFF OR less than
  // or equal to U+7F for ASCII.
  TOO_LARGE = 5,
  // The decoded character must not be in U+D800...DFFF (UTF-8 or UTF-32) OR
  // a high surrogate must be followed by a low surrogate and a low surrogate
  // must be preceded by a high surrogate (UTF-16).
  SURROGATE = 6,
  // Not related to validation/transcoding.
  OTHER = 7
};
553
554 struct result {
555 error_code error;
556 size_t count; // In case of error, indicates the position of the error. In case of success, indicates the number of words validated/written.
557
558 simdutf_really_inline result();
559
560 simdutf_really_inline result(error_code, size_t);
561 };
562
563 }
564 #endif
565 /* end file include/simdutf/error.h */
566
// Save the compiler's warning state, then switch off the warnings this header
// treats as undesired. NOTE(review): the matching SIMDUTF_POP_DISABLE_WARNINGS
// is not visible in this part of the file -- confirm it follows the API.
SIMDUTF_PUSH_DISABLE_WARNINGS
SIMDUTF_DISABLE_UNDESIRED_WARNINGS
569
570 // Public API
571 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h
572 /* begin file include/simdutf/simdutf_version.h */
573 // /include/simdutf/simdutf_version.h automatically generated by release.py,
574 // do not change by hand
// Version information, available both as a string macro and as integer
// enumerators in namespace simdutf. The code is generated (see the note above
// this guard), so keep the two forms in sync through the generator.
#ifndef SIMDUTF_SIMDUTF_VERSION_H
#define SIMDUTF_SIMDUTF_VERSION_H

/** The version of simdutf being used (major.minor.revision) */
#define SIMDUTF_VERSION "3.2.14"

namespace simdutf {
enum {
  /**
   * The major version (MAJOR.minor.revision) of simdutf being used.
   */
  SIMDUTF_VERSION_MAJOR = 3,
  /**
   * The minor version (major.MINOR.revision) of simdutf being used.
   */
  SIMDUTF_VERSION_MINOR = 2,
  /**
   * The revision (major.minor.REVISION) of simdutf being used.
   */
  SIMDUTF_VERSION_REVISION = 14
};
} // namespace simdutf

#endif // SIMDUTF_SIMDUTF_VERSION_H
599 /* end file include/simdutf/simdutf_version.h */
600 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/implementation.h
601 /* begin file include/simdutf/implementation.h */
602 #ifndef SIMDUTF_IMPLEMENTATION_H
603 #define SIMDUTF_IMPLEMENTATION_H
604 #include <string>
605 #if !defined(SIMDUTF_NO_THREADS)
606 #include <atomic>
607 #endif
608 #include <vector>
609 #include <tuple>
610 // dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h
611 /* begin file include/simdutf/internal/isadetection.h */
612 /* From
613 https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
614 Highly modified.
615
616 Copyright (c) 2016- Facebook, Inc (Adam Paszke)
617 Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
618 Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
619 Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
620 Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
621 Copyright (c) 2011-2013 NYU (Clement Farabet)
622 Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
623 Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
624 (Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
625 Samy Bengio, Johnny Mariethoz)
626
627 All rights reserved.
628
629 Redistribution and use in source and binary forms, with or without
630 modification, are permitted provided that the following conditions are met:
631
632 1. Redistributions of source code must retain the above copyright
633 notice, this list of conditions and the following disclaimer.
634
635 2. Redistributions in binary form must reproduce the above copyright
636 notice, this list of conditions and the following disclaimer in the
637 documentation and/or other materials provided with the distribution.
638
639 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
640 America and IDIAP Research Institute nor the names of its contributors may be
641 used to endorse or promote products derived from this software without
642 specific prior written permission.
643
644 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
645 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
646 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
647 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
648 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
649 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
650 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
651 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
652 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
653 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
654 POSSIBILITY OF SUCH DAMAGE.
655 */
656
657 #ifndef SIMDutf_INTERNAL_ISADETECTION_H
658 #define SIMDutf_INTERNAL_ISADETECTION_H
659
660 #include <cstdint>
661 #include <cstdlib>
662 #if defined(_MSC_VER)
663 #include <intrin.h>
664 #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
665 #include <cpuid.h>
666 #endif
667
668 namespace simdutf {
669 namespace internal {
670
/// Bit flags naming the instruction-set extensions that
/// detect_supported_architectures() can report; values are OR-ed together.
/// Bit 1 (0x2) is not assigned.
enum instruction_set {
  DEFAULT     = 0,
  NEON        = 1 << 0,
  AVX2        = 1 << 2,
  SSE42       = 1 << 3,
  PCLMULQDQ   = 1 << 4,
  BMI1        = 1 << 5,
  BMI2        = 1 << 6,
  ALTIVEC     = 1 << 7,
  AVX512F     = 1 << 8,
  AVX512DQ    = 1 << 9,
  AVX512IFMA  = 1 << 10,
  AVX512PF    = 1 << 11,
  AVX512ER    = 1 << 12,
  AVX512CD    = 1 << 13,
  AVX512BW    = 1 << 14,
  AVX512VL    = 1 << 15,
  AVX512VBMI2 = 1 << 16
};
690
691 #if defined(__PPC64__)
692
detect_supported_architectures()693 static inline uint32_t detect_supported_architectures() {
694 return instruction_set::ALTIVEC;
695 }
696
697 #elif defined(__aarch64__) || defined(_M_ARM64)
698
detect_supported_architectures()699 static inline uint32_t detect_supported_architectures() {
700 return instruction_set::NEON;
701 }
702
703 #elif defined(__x86_64__) || defined(_M_AMD64) // x64
704
705
namespace {
/// Bit masks for the registers returned by CPUID and for XCR0.
namespace cpuid_bit {
    // Can be found on Intel ISA Reference for CPUID

    // EAX = 0x01: feature bits reported in ECX
    constexpr uint32_t pclmulqdq = uint32_t{1} << 1;  ///< @private bit  1 of ECX for EAX=0x1
    constexpr uint32_t sse42 = uint32_t{1} << 20;     ///< @private bit 20 of ECX for EAX=0x1
    constexpr uint32_t osxsave = uint32_t{3} << 26;   ///< @private bits 26+27 of ECX for EAX=0x1

    // EAX = 0x7 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
    // See: "Table 3-8. Information Returned by CPUID Instruction"
    namespace ebx {
      constexpr uint32_t bmi1 = uint32_t{1} << 3;
      constexpr uint32_t avx2 = uint32_t{1} << 5;
      constexpr uint32_t bmi2 = uint32_t{1} << 8;
      constexpr uint32_t avx512f = uint32_t{1} << 16;
      constexpr uint32_t avx512dq = uint32_t{1} << 17;
      constexpr uint32_t avx512ifma = uint32_t{1} << 21;
      constexpr uint32_t avx512cd = uint32_t{1} << 28;
      constexpr uint32_t avx512bw = uint32_t{1} << 30;
      constexpr uint32_t avx512vl = uint32_t{1} << 31;
    }

    namespace ecx {
      constexpr uint32_t avx512vbmi = uint32_t{1} << 1;
      constexpr uint32_t avx512vbmi2 = uint32_t{1} << 6;
      constexpr uint32_t avx512vnni = uint32_t{1} << 11;
      constexpr uint32_t avx512bitalg = uint32_t{1} << 12;
      constexpr uint32_t avx512vpopcnt = uint32_t{1} << 14;
    }
    namespace edx {
      constexpr uint32_t avx512vp2intersect = uint32_t{1} << 8;
    }
    // XCR0 state-component bits (value read with xgetbv).
    namespace xcr0_bit {
      constexpr uint64_t avx256_saved = uint64_t{1} << 2; ///< @private bit 2 = AVX
      constexpr uint64_t avx512_saved = uint64_t{7} << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
    }
  }
}
745
746
747
/**
 * Executes the CPUID instruction.
 *
 * On entry *eax holds the leaf and *ecx the sub-leaf; on return the four
 * pointed-to values hold the EAX, EBX, ECX and EDX results. All three
 * implementations below pass the sub-leaf to the instruction, which leaf 0x7
 * requires.
 *
 * @param eax in: leaf, out: EAX result
 * @param ebx out: EBX result
 * @param ecx in: sub-leaf, out: ECX result
 * @param edx out: EDX result
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, static_cast<int>(*eax), static_cast<int>(*ecx));
  *eax = static_cast<uint32_t>(cpu_info[0]);
  *ebx = static_cast<uint32_t>(cpu_info[1]);
  *ecx = static_cast<uint32_t>(cpu_info[2]);
  *edx = static_cast<uint32_t>(cpu_info[3]);
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  // __get_cpuid() has no sub-leaf parameter, so the ECX value chosen by the
  // caller would be ignored. __cpuid_count() from <cpuid.h> takes both the
  // leaf and the sub-leaf and always writes the four outputs.
  const uint32_t leaf = *eax;
  const uint32_t subleaf = *ecx;
  __cpuid_count(leaf, subleaf, *eax, *ebx, *ecx, *edx);
#else
  // "+a"/"+c": EAX and ECX are both inputs (leaf, sub-leaf) and outputs.
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
#endif
}
769
/**
 * Reads extended control register 0 (XCR0) with the XGETBV instruction.
 *
 * @return the 64-bit register value (EDX:EAX as returned by the instruction)
 */
static inline uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  // ECX = 0 selects XCR0; the result arrives split across EAX (low) and EDX (high).
  uint32_t low_half, high_half;
  asm volatile("xgetbv\n\t" : "=a" (low_half), "=d" (high_half) : "c" (0));
  const uint64_t upper = static_cast<uint64_t>(high_half) << 32;
  return upper | low_half;
#endif
}
779
detect_supported_architectures()780 static inline uint32_t detect_supported_architectures() {
781 uint32_t eax;
782 uint32_t ebx = 0;
783 uint32_t ecx = 0;
784 uint32_t edx = 0;
785 uint32_t host_isa = 0x0;
786
787 // EBX for EAX=0x1
788 eax = 0x1;
789 cpuid(&eax, &ebx, &ecx, &edx);
790
791 if (ecx & cpuid_bit::sse42) {
792 host_isa |= instruction_set::SSE42;
793 }
794
795 if (ecx & cpuid_bit::pclmulqdq) {
796 host_isa |= instruction_set::PCLMULQDQ;
797 }
798
799 if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
800 return host_isa;
801 }
802
803 // xgetbv for checking if the OS saves registers
804 uint64_t xcr0 = xgetbv();
805
806 if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
807 return host_isa;
808 }
809 // ECX for EAX=0x7
810 eax = 0x7;
811 ecx = 0x0; // Sub-leaf = 0
812 cpuid(&eax, &ebx, &ecx, &edx);
813 if (ebx & cpuid_bit::ebx::avx2) {
814 host_isa |= instruction_set::AVX2;
815 }
816 if (ebx & cpuid_bit::ebx::bmi1) {
817 host_isa |= instruction_set::BMI1;
818 }
819 if (ebx & cpuid_bit::ebx::bmi2) {
820 host_isa |= instruction_set::BMI2;
821 }
822 if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
823 return host_isa;
824 }
825 if (ebx & cpuid_bit::ebx::avx512f) {
826 host_isa |= instruction_set::AVX512F;
827 }
828 if (ebx & cpuid_bit::ebx::avx512bw) {
829 host_isa |= instruction_set::AVX512BW;
830 }
831 if (ebx & cpuid_bit::ebx::avx512cd) {
832 host_isa |= instruction_set::AVX512CD;
833 }
834 if (ebx & cpuid_bit::ebx::avx512dq) {
835 host_isa |= instruction_set::AVX512DQ;
836 }
837 if (ebx & cpuid_bit::ebx::avx512vl) {
838 host_isa |= instruction_set::AVX512VL;
839 }
840 if (ecx & cpuid_bit::ecx::avx512vbmi2) {
841 host_isa |= instruction_set::AVX512VBMI2;
842 }
843 return host_isa;
844 }
845 #else // fallback
846
847 // includes 32-bit ARM.
detect_supported_architectures()848 static inline uint32_t detect_supported_architectures() {
849 return instruction_set::DEFAULT;
850 }
851
852
853 #endif // end SIMD extension detection code
854
855 } // namespace internal
856 } // namespace simdutf
857
858 #endif // SIMDutf_INTERNAL_ISADETECTION_H
859 /* end file include/simdutf/internal/isadetection.h */
860
861
862 namespace simdutf {
863
864 /**
865 * Autodetect the encoding of the input, a single encoding is recommended.
866 * E.g., the function might return simdutf::encoding_type::UTF8,
867 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
868 * simdutf::encoding_type::UTF32_LE.
869 *
870 * @param input the string to analyze.
871 * @param length the length of the string in bytes.
872 * @return the detected encoding type
873 */
874 simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept;
autodetect_encoding(const uint8_t * input,size_t length)875 simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept {
876 return autodetect_encoding(reinterpret_cast<const char *>(input), length);
877 }
878
879 /**
880 * Autodetect the possible encodings of the input in one pass.
881 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
882 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
883 *
884 * Overriden by each implementation.
885 *
886 * @param input the string to analyze.
887 * @param length the length of the string in bytes.
888 * @return the detected encoding type
889 */
890 simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept;
detect_encodings(const uint8_t * input,size_t length)891 simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept {
892 return detect_encodings(reinterpret_cast<const char *>(input), length);
893 }
894
895
/**
 * Validate the UTF-8 string. This function may be best when you expect
 * the input to be almost always valid. Otherwise, consider using
 * validate_utf8_with_errors.
 *
 * Overridden by each implementation.
 *
 * Declared noexcept: the outcome is reported through the return value only.
 *
 * @param buf the UTF-8 string to validate.
 * @param len the length of the string in bytes.
 * @return true if and only if the string is valid UTF-8.
 */
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
908
/**
 * Validate the UTF-8 string and stop on error.
 *
 * Overridden by each implementation.
 *
 * @param buf the UTF-8 string to validate.
 * @param len the length of the string in bytes.
 * @return a result pair struct with an error code (result::error) and, in
 *         result::count, either the position of the error if any or the
 *         number of words validated if successful.
 */
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept;
919
920 /**
921 * Validate the ASCII string.
922 *
923 * Overridden by each implementation.
924 *
925 * @param buf the ASCII string to validate.
926 * @param len the length of the string in bytes.
927 * @return true if and only if the string is valid ASCII.
928 */
929 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
930
931 /**
932 * Validate the ASCII string and stop on error. It might be faster than
933 * validate_ascii when an error is expected to occur early.
934 *
935 * Overridden by each implementation.
936 *
937 * @param buf the ASCII string to validate.
938 * @param len the length of the string in bytes.
939 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
940 */
941 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept;
942
943 /**
944 * Using native endianness; Validate the UTF-16 string.
945 * This function may be best when you expect the input to be almost always valid.
946 * Otherwise, consider using validate_utf16_with_errors.
947 *
948 * Overridden by each implementation.
949 *
950 * This function is not BOM-aware.
951 *
952 * @param buf the UTF-16 string to validate.
953 * @param len the length of the string in number of 2-byte words (char16_t).
954 * @return true if and only if the string is valid UTF-16.
955 */
956 simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept;
957
958 /**
959 * Validate the UTF-16LE string. This function may be best when you expect
960 * the input to be almost always valid. Otherwise, consider using
961 * validate_utf16le_with_errors.
962 *
963 * Overridden by each implementation.
964 *
965 * This function is not BOM-aware.
966 *
967 * @param buf the UTF-16LE string to validate.
968 * @param len the length of the string in number of 2-byte words (char16_t).
969 * @return true if and only if the string is valid UTF-16LE.
970 */
971 simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept;
972
973 /**
974 * Validate the UTF-16BE string. This function may be best when you expect
975 * the input to be almost always valid. Otherwise, consider using
976 * validate_utf16be_with_errors.
977 *
978 * Overridden by each implementation.
979 *
980 * This function is not BOM-aware.
981 *
982 * @param buf the UTF-16BE string to validate.
983 * @param len the length of the string in number of 2-byte words (char16_t).
984 * @return true if and only if the string is valid UTF-16BE.
985 */
986 simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept;
987
988 /**
989 * Using native endianness; Validate the UTF-16 string and stop on error.
990 * It might be faster than validate_utf16 when an error is expected to occur early.
991 *
992 * Overridden by each implementation.
993 *
994 * This function is not BOM-aware.
995 *
996 * @param buf the UTF-16 string to validate.
997 * @param len the length of the string in number of 2-byte words (char16_t).
998 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
999 */
1000 simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept;
1001
1002 /**
1003 * Validate the UTF-16LE string and stop on error. It might be faster than
1004 * validate_utf16le when an error is expected to occur early.
1005 *
1006 * Overridden by each implementation.
1007 *
1008 * This function is not BOM-aware.
1009 *
1010 * @param buf the UTF-16LE string to validate.
1011 * @param len the length of the string in number of 2-byte words (char16_t).
1012 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1013 */
1014 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept;
1015
1016 /**
1017 * Validate the UTF-16BE string and stop on error. It might be faster than
1018 * validate_utf16be when an error is expected to occur early.
1019 *
1020 * Overridden by each implementation.
1021 *
1022 * This function is not BOM-aware.
1023 *
1024 * @param buf the UTF-16BE string to validate.
1025 * @param len the length of the string in number of 2-byte words (char16_t).
1026 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1027 */
1028 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept;
1029
1030 /**
1031 * Validate the UTF-32 string. This function may be best when you expect
1032 * the input to be almost always valid. Otherwise, consider using
1033 * validate_utf32_with_errors.
1034 *
1035 * Overridden by each implementation.
1036 *
1037 * This function is not BOM-aware.
1038 *
1039 * @param buf the UTF-32 string to validate.
1040 * @param len the length of the string in number of 4-byte words (char32_t).
1041 * @return true if and only if the string is valid UTF-32.
1042 */
1043 simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept;
1044
1045 /**
1046 * Validate the UTF-32 string and stop on error. It might be faster than
1047 * validate_utf32 when an error is expected to occur early.
1048 *
1049 * Overridden by each implementation.
1050 *
1051 * This function is not BOM-aware.
1052 *
1053 * @param buf the UTF-32 string to validate.
1054 * @param len the length of the string in number of 4-byte words (char32_t).
1055 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1056 */
1057 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
1058
1059 /**
1060 * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string.
1061 *
1062 * During the conversion also validation of the input string is done.
1063 * This function is suitable to work with inputs from untrusted sources.
1064 *
1065 * @param input the UTF-8 string to convert
1066 * @param length the length of the string in bytes
1067 * @param utf16_output the pointer to buffer that can hold conversion result
1068 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1069 */
1070 simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
1071
1072 /**
1073 * Convert possibly broken UTF-8 string into UTF-16LE string.
1074 *
1075 * During the conversion also validation of the input string is done.
1076 * This function is suitable to work with inputs from untrusted sources.
1077 *
1078 * @param input the UTF-8 string to convert
1079 * @param length the length of the string in bytes
1080 * @param utf16_output the pointer to buffer that can hold conversion result
1081 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1082 */
1083 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
1084
1085 /**
1086 * Convert possibly broken UTF-8 string into UTF-16BE string.
1087 *
1088 * During the conversion also validation of the input string is done.
1089 * This function is suitable to work with inputs from untrusted sources.
1090 *
1091 * @param input the UTF-8 string to convert
1092 * @param length the length of the string in bytes
1093 * @param utf16_output the pointer to buffer that can hold conversion result
1094 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1095 */
1096 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
1097
1098 /**
1099 * Using native endianness; Convert possibly broken UTF-8 string into UTF-16
1100 * string and stop on error.
1101 *
1102 * During the conversion also validation of the input string is done.
1103 * This function is suitable to work with inputs from untrusted sources.
1104 *
1105 * @param input the UTF-8 string to convert
1106 * @param length the length of the string in bytes
1107 * @param utf16_output the pointer to buffer that can hold conversion result
1108 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1109 */
1110 simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1111
1112 /**
1113 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1114 *
1115 * During the conversion also validation of the input string is done.
1116 * This function is suitable to work with inputs from untrusted sources.
1117 *
1118 * @param input the UTF-8 string to convert
1119 * @param length the length of the string in bytes
1120 * @param utf16_output the pointer to buffer that can hold conversion result
1121 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1122 */
1123 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1124
1125 /**
1126 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1127 *
1128 * During the conversion also validation of the input string is done.
1129 * This function is suitable to work with inputs from untrusted sources.
1130 *
1131 * @param input the UTF-8 string to convert
1132 * @param length the length of the string in bytes
1133 * @param utf16_output the pointer to buffer that can hold conversion result
1134 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1135 */
1136 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1137
1138 /**
1139 * Convert possibly broken UTF-8 string into UTF-32 string.
1140 *
1141 * During the conversion also validation of the input string is done.
1142 * This function is suitable to work with inputs from untrusted sources.
1143 *
1144 * @param input the UTF-8 string to convert
1145 * @param length the length of the string in bytes
1146 * @param utf32_output the pointer to buffer that can hold conversion result
1147 * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
1148 */
1149 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept;
1150
1151 /**
1152 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1153 *
1154 * During the conversion also validation of the input string is done.
1155 * This function is suitable to work with inputs from untrusted sources.
1156 *
1157 * @param input the UTF-8 string to convert
1158 * @param length the length of the string in bytes
1159 * @param utf32_output the pointer to buffer that can hold conversion result
1160 * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1161 */
1162 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
1163
1164 /**
1165 * Using native endianness; Convert valid UTF-8 string into UTF-16 string.
1166 *
1167 * This function assumes that the input string is valid UTF-8.
1168 *
1169 * @param input the UTF-8 string to convert
1170 * @param length the length of the string in bytes
1171 * @param utf16_buffer the pointer to buffer that can hold conversion result
1172 * @return the number of written char16_t
1173 */
1174 simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1175
1176 /**
1177 * Convert valid UTF-8 string into UTF-16LE string.
1178 *
1179 * This function assumes that the input string is valid UTF-8.
1180 *
1181 * @param input the UTF-8 string to convert
1182 * @param length the length of the string in bytes
1183 * @param utf16_buffer the pointer to buffer that can hold conversion result
1184 * @return the number of written char16_t
1185 */
1186 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1187
1188 /**
1189 * Convert valid UTF-8 string into UTF-16BE string.
1190 *
1191 * This function assumes that the input string is valid UTF-8.
1192 *
1193 * @param input the UTF-8 string to convert
1194 * @param length the length of the string in bytes
1195 * @param utf16_buffer the pointer to buffer that can hold conversion result
1196 * @return the number of written char16_t
1197 */
1198 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1199
1200 /**
1201 * Convert valid UTF-8 string into UTF-32 string.
1202 *
1203 * This function assumes that the input string is valid UTF-8.
1204 *
1205 * @param input the UTF-8 string to convert
1206 * @param length the length of the string in bytes
1207 * @param utf32_buffer the pointer to buffer that can hold conversion result
1208 * @return the number of written char32_t
1209 */
1210 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
1211
1212 /**
1213 * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
1214 *
1215 * This function does not validate the input.
1216 *
1217 * This function is not BOM-aware.
1218 *
1219 * @param input the UTF-8 string to process
1220 * @param length the length of the string in bytes
1221 * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
1222 */
1223 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept;
1224
1225 /**
1226 * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format.
1227 *
1228 * This function is equivalent to count_utf8
1229 *
1230 * This function does not validate the input.
1231 *
1232 * This function is not BOM-aware.
1233 *
1234 * @param input the UTF-8 string to process
1235 * @param length the length of the string in bytes
1236 * @return the number of char32_t words required to encode the UTF-8 string as UTF-32
1237 */
1238 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept;
1239
1240 /**
1241 * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string.
1242 *
1243 * During the conversion also validation of the input string is done.
1244 * This function is suitable to work with inputs from untrusted sources.
1245 *
1246 * This function is not BOM-aware.
1247 *
1248 * @param input the UTF-16 string to convert
1249 * @param length the length of the string in 2-byte words (char16_t)
1250 * @param utf8_buffer the pointer to buffer that can hold conversion result
1251 * @return number of written words; 0 if input is not a valid UTF-16 string
1252 */
1253 simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1254
1255 /**
1256 * Convert possibly broken UTF-16LE string into UTF-8 string.
1257 *
1258 * During the conversion also validation of the input string is done.
1259 * This function is suitable to work with inputs from untrusted sources.
1260 *
1261 * This function is not BOM-aware.
1262 *
1263 * @param input the UTF-16LE string to convert
1264 * @param length the length of the string in 2-byte words (char16_t)
1265 * @param utf8_buffer the pointer to buffer that can hold conversion result
1266 * @return number of written words; 0 if input is not a valid UTF-16LE string
1267 */
1268 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1269
1270 /**
1271 * Convert possibly broken UTF-16BE string into UTF-8 string.
1272 *
1273 * During the conversion also validation of the input string is done.
1274 * This function is suitable to work with inputs from untrusted sources.
1275 *
1276 * This function is not BOM-aware.
1277 *
1278 * @param input the UTF-16BE string to convert
1279 * @param length the length of the string in 2-byte words (char16_t)
1280 * @param utf8_buffer the pointer to buffer that can hold conversion result
1281 * @return number of written words; 0 if input is not a valid UTF-16BE string
1282 */
1283 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1284
1285 /**
1286 * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error.
1287 *
1288 * During the conversion also validation of the input string is done.
1289 * This function is suitable to work with inputs from untrusted sources.
1290 *
1291 * This function is not BOM-aware.
1292 *
1293 * @param input the UTF-16 string to convert
1294 * @param length the length of the string in 2-byte words (char16_t)
1295 * @param utf8_buffer the pointer to buffer that can hold conversion result
1296 * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1297 */
1298 simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1299
1300 /**
1301 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1302 *
1303 * During the conversion also validation of the input string is done.
1304 * This function is suitable to work with inputs from untrusted sources.
1305 *
1306 * This function is not BOM-aware.
1307 *
1308 * @param input the UTF-16LE string to convert
1309 * @param length the length of the string in 2-byte words (char16_t)
1310 * @param utf8_buffer the pointer to buffer that can hold conversion result
1311 * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1312 */
1313 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1314
1315 /**
1316 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1317 *
1318 * During the conversion also validation of the input string is done.
1319 * This function is suitable to work with inputs from untrusted sources.
1320 *
1321 * This function is not BOM-aware.
1322 *
1323 * @param input the UTF-16BE string to convert
1324 * @param length the length of the string in 2-byte words (char16_t)
1325 * @param utf8_buffer the pointer to buffer that can hold conversion result
1326 * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1327 */
1328 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1329
1330 /**
1331 * Using native endianness; Convert valid UTF-16 string into UTF-8 string.
1332 *
1333 * This function assumes that the input string is valid UTF-16 (native endianness).
1334 *
1335 * This function is not BOM-aware.
1336 *
1337 * @param input the UTF-16 string to convert
1338 * @param length the length of the string in 2-byte words (char16_t)
1339 * @param utf8_buffer the pointer to buffer that can hold the conversion result
1340 * @return number of written words; 0 if conversion is not possible
1341 */
1342 simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1343
1344 /**
1345 * Convert valid UTF-16LE string into UTF-8 string.
1346 *
1347 * This function assumes that the input string is valid UTF-16LE.
1348 *
1349 * This function is not BOM-aware.
1350 *
1351 * @param input the UTF-16LE string to convert
1352 * @param length the length of the string in 2-byte words (char16_t)
1353 * @param utf8_buffer the pointer to buffer that can hold the conversion result
1354 * @return number of written words; 0 if conversion is not possible
1355 */
1356 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1357
1358 /**
1359 * Convert valid UTF-16BE string into UTF-8 string.
1360 *
1361 * This function assumes that the input string is valid UTF-16BE.
1362 *
1363 * This function is not BOM-aware.
1364 *
1365 * @param input the UTF-16BE string to convert
1366 * @param length the length of the string in 2-byte words (char16_t)
1367 * @param utf8_buffer the pointer to buffer that can hold the conversion result
1368 * @return number of written words; 0 if conversion is not possible
1369 */
1370 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1371
1372 /**
1373 * Using native endianness; Convert possibly broken UTF-16 string into UTF-32 string.
1374 *
1375 * During the conversion also validation of the input string is done.
1376 * This function is suitable to work with inputs from untrusted sources.
1377 *
1378 * This function is not BOM-aware.
1379 *
1380 * @param input the UTF-16 string to convert
1381 * @param length the length of the string in 2-byte words (char16_t)
1382 * @param utf32_buffer the pointer to buffer that can hold conversion result
1383 * @return number of written words; 0 if input is not a valid UTF-16 string
1384 */
1385 simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1386
1387 /**
1388 * Convert possibly broken UTF-16LE string into UTF-32 string.
1389 *
1390 * During the conversion also validation of the input string is done.
1391 * This function is suitable to work with inputs from untrusted sources.
1392 *
1393 * This function is not BOM-aware.
1394 *
1395 * @param input the UTF-16LE string to convert
1396 * @param length the length of the string in 2-byte words (char16_t)
1397 * @param utf32_buffer the pointer to buffer that can hold conversion result
1398 * @return number of written words; 0 if input is not a valid UTF-16LE string
1399 */
1400 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1401
1402 /**
1403 * Convert possibly broken UTF-16BE string into UTF-32 string.
1404 *
1405 * During the conversion also validation of the input string is done.
1406 * This function is suitable to work with inputs from untrusted sources.
1407 *
1408 * This function is not BOM-aware.
1409 *
1410 * @param input the UTF-16BE string to convert
1411 * @param length the length of the string in 2-byte words (char16_t)
1412 * @param utf32_buffer the pointer to buffer that can hold conversion result
1413 * @return number of written words; 0 if input is not a valid UTF-16BE string
1414 */
1415 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1416
1417 /**
1418 * Using native endianness; Convert possibly broken UTF-16 string into
1419 * UTF-32 string and stop on error.
1420 *
1421 * During the conversion also validation of the input string is done.
1422 * This function is suitable to work with inputs from untrusted sources.
1423 *
1424 * This function is not BOM-aware.
1425 *
1426 * @param input the UTF-16 string to convert
1427 * @param length the length of the string in 2-byte words (char16_t)
1428 * @param utf32_buffer the pointer to buffer that can hold conversion result
1429 * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1430 */
1431 simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1432
1433 /**
1434 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1435 *
1436 * During the conversion also validation of the input string is done.
1437 * This function is suitable to work with inputs from untrusted sources.
1438 *
1439 * This function is not BOM-aware.
1440 *
1441 * @param input the UTF-16LE string to convert
1442 * @param length the length of the string in 2-byte words (char16_t)
1443 * @param utf32_buffer the pointer to buffer that can hold conversion result
1444 * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1445 */
1446 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1447
1448 /**
1449 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1450 *
1451 * During the conversion also validation of the input string is done.
1452 * This function is suitable to work with inputs from untrusted sources.
1453 *
1454 * This function is not BOM-aware.
1455 *
1456 * @param input the UTF-16BE string to convert
1457 * @param length the length of the string in 2-byte words (char16_t)
1458 * @param utf32_buffer the pointer to buffer that can hold conversion result
1459 * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
1460 */
1461 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1462
1463 /**
1464 * Using native endianness; Convert valid UTF-16 string into UTF-32 string.
1465 *
1466 * This function assumes that the input string is valid UTF-16 (native endianness).
1467 *
1468 * This function is not BOM-aware.
1469 *
1470 * @param input the UTF-16 string to convert
1471 * @param length the length of the string in 2-byte words (char16_t)
1472 * @param utf32_buffer the pointer to buffer that can hold the conversion result
1473 * @return number of written words; 0 if conversion is not possible
1474 */
1475 simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1476
1477 /**
1478 * Convert valid UTF-16LE string into UTF-32 string.
1479 *
1480 * This function assumes that the input string is valid UTF-16LE.
1481 *
1482 * This function is not BOM-aware.
1483 *
1484 * @param input the UTF-16LE string to convert
1485 * @param length the length of the string in 2-byte words (char16_t)
1486 * @param utf32_buffer the pointer to buffer that can hold the conversion result
1487 * @return number of written words; 0 if conversion is not possible
1488 */
1489 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1490
1491 /**
1492 * Convert valid UTF-16BE string into UTF-32 string.
1493 *
1494 * This function assumes that the input string is valid UTF-16BE.
1495 *
1496 * This function is not BOM-aware.
1497 *
1498 * @param input the UTF-16BE string to convert
1499 * @param length the length of the string in 2-byte words (char16_t)
1500 * @param utf32_buffer the pointer to buffer that can hold the conversion result
1501 * @return number of written words; 0 if conversion is not possible
1502 */
1503 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1504
1505 /**
1506 * Using native endianness; Compute the number of bytes that this UTF-16
1507 * string would require in UTF-8 format.
1508 *
1509 * This function does not validate the input.
1510 *
1511 * @param input the UTF-16 string to convert
1512 * @param length the length of the string in 2-byte words (char16_t)
1513 * @return the number of bytes required to encode the UTF-16 string as UTF-8
1514 */
1515 simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept;
1516
1517 /**
1518 * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
1519 *
1520 * This function does not validate the input.
1521 *
1522 * @param input the UTF-16LE string to convert
1523 * @param length the length of the string in 2-byte words (char16_t)
1524 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
1525 */
1526 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept;
1527
1528 /**
1529 * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
1530 *
1531 * This function does not validate the input.
1532 *
1533 * @param input the UTF-16BE string to convert
1534 * @param length the length of the string in 2-byte words (char16_t)
1535 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
1536 */
1537 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept;
1538
1539 /**
1540 * Convert possibly broken UTF-32 string into UTF-8 string.
1541 *
1542 * During the conversion also validation of the input string is done.
1543 * This function is suitable to work with inputs from untrusted sources.
1544 *
1545 * This function is not BOM-aware.
1546 *
1547 * @param input the UTF-32 string to convert
1548 * @param length the length of the string in 4-byte words (char32_t)
1549 * @param utf8_buffer the pointer to buffer that can hold conversion result
1550 * @return number of written words; 0 if input is not a valid UTF-32 string
1551 */
1552 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1553
1554 /**
1555 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
1556 *
1557 * During the conversion also validation of the input string is done.
1558 * This function is suitable to work with inputs from untrusted sources.
1559 *
1560 * This function is not BOM-aware.
1561 *
1562 * @param input the UTF-32 string to convert
1563 * @param length the length of the string in 4-byte words (char32_t)
1564 * @param utf8_buffer the pointer to buffer that can hold conversion result
1565 * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
1566 */
1567 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1568
1569 /**
1570 * Convert valid UTF-32 string into UTF-8 string.
1571 *
1572 * This function assumes that the input string is valid UTF-32.
1573 *
1574 * This function is not BOM-aware.
1575 *
1576 * @param input the UTF-32 string to convert
1577 * @param length the length of the string in 4-byte words (char32_t)
1578 * @param utf8_buffer the pointer to buffer that can hold the conversion result
1579 * @return number of written words; 0 if conversion is not possible
1580 */
1581 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1582
1583 /**
1584 * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 string.
1585 *
1586 * During the conversion also validation of the input string is done.
1587 * This function is suitable to work with inputs from untrusted sources.
1588 *
1589 * This function is not BOM-aware.
1590 *
1591 * @param input the UTF-32 string to convert
1592 * @param length the length of the string in 4-byte words (char32_t)
1593 * @param utf16_buffer the pointer to buffer that can hold conversion result
1594 * @return number of written words; 0 if input is not a valid UTF-32 string
1595 */
1596 simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1597
1598 /**
1599 * Convert possibly broken UTF-32 string into UTF-16LE string.
1600 *
1601 * During the conversion also validation of the input string is done.
1602 * This function is suitable to work with inputs from untrusted sources.
1603 *
1604 * This function is not BOM-aware.
1605 *
1606 * @param input the UTF-32 string to convert
1607 * @param length the length of the string in 4-byte words (char32_t)
1608 * @param utf16_buffer the pointer to buffer that can hold the conversion result
1609 * @return number of written 2-byte words (char16_t); 0 if input is not a valid UTF-32 string
1610 */
1611 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1612
1613 /**
1614 * Convert possibly broken UTF-32 string into UTF-16BE string.
1615 *
1616 * During the conversion also validation of the input string is done.
1617 * This function is suitable to work with inputs from untrusted sources.
1618 *
1619 * This function is not BOM-aware.
1620 *
1621 * @param input the UTF-32 string to convert
1622 * @param length the length of the string in 4-byte words (char32_t)
1623 * @param utf16_buffer the pointer to buffer that can hold the conversion result
1624 * @return number of written 2-byte words (char16_t); 0 if input is not a valid UTF-32 string
1625 */
1626 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1627
1628 /**
1629 * Using native endianness; Convert possibly broken UTF-32 string into UTF-16
1630 * string and stop on error.
1631 *
1632 * During the conversion also validation of the input string is done.
1633 * This function is suitable to work with inputs from untrusted sources.
1634 *
1635 * This function is not BOM-aware.
1636 *
1637 * @param input the UTF-32 string to convert
1638 * @param length the length of the string in 4-byte words (char32_t)
1639 * @param utf16_buffer the pointer to buffer that can hold conversion result
1640 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1641 */
1642 simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1643
1644 /**
1645 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
1646 *
1647 * During the conversion also validation of the input string is done.
1648 * This function is suitable to work with inputs from untrusted sources.
1649 *
1650 * This function is not BOM-aware.
1651 *
1652 * @param input the UTF-32 string to convert
1653 * @param length the length of the string in 4-byte words (char32_t)
1654 * @param utf16_buffer the pointer to buffer that can hold conversion result
1655 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1656 */
1657 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1658
1659 /**
1660 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
1661 *
1662 * During the conversion also validation of the input string is done.
1663 * This function is suitable to work with inputs from untrusted sources.
1664 *
1665 * This function is not BOM-aware.
1666 *
1667 * @param input the UTF-32 string to convert
1668 * @param length the length of the string in 4-byte words (char32_t)
1669 * @param utf16_buffer the pointer to buffer that can hold conversion result
1670 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
1671 */
1672 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1673
1674 /**
1675 * Using native endianness; Convert valid UTF-32 string into UTF-16 string.
1676 *
1677 * This function assumes that the input string is valid UTF-32.
1678 *
1679 * This function is not BOM-aware.
1680 *
1681 * @param input the UTF-32 string to convert
1682 * @param length the length of the string in 4-byte words (char32_t)
1683 * @param utf16_buffer the pointer to buffer that can hold the conversion result
1684 * @return number of written 2-byte words (char16_t); 0 if conversion is not possible
1685 */
1686 simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1687
1688 /**
1689 * Convert valid UTF-32 string into UTF-16LE string.
1690 *
1691 * This function assumes that the input string is valid UTF-32.
1692 *
1693 * This function is not BOM-aware.
1694 *
1695 * @param input the UTF-32 string to convert
1696 * @param length the length of the string in 4-byte words (char32_t)
1697 * @param utf16_buffer the pointer to buffer that can hold the conversion result
1698 * @return number of written 2-byte words (char16_t); 0 if conversion is not possible
1699 */
1700 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1701
1702 /**
1703 * Convert valid UTF-32 string into UTF-16BE string.
1704 *
1705 * This function assumes that the input string is valid UTF-32.
1706 *
1707 * This function is not BOM-aware.
1708 *
1709 * @param input the UTF-32 string to convert
1710 * @param length the length of the string in 4-byte words (char32_t)
1711 * @param utf16_buffer the pointer to buffer that can hold the conversion result
1712 * @return number of written 2-byte words (char16_t); 0 if conversion is not possible
1713 */
1714 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1715
1716 /**
1717 * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
1718 * from UTF-16BE to UTF-16LE.
1719 *
1720 * This function does not validate the input.
1721 *
1722 * This function is not BOM-aware.
1723 *
1724 * @param input the UTF-16 string to process
1725 * @param length the length of the string in 2-byte words (char16_t)
1726 * @param output the pointer to buffer that can hold the conversion result
1727 */
1728 void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept;
1729
1730 /**
1731 * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
1732 *
1733 * This function does not validate the input.
1734 *
1735 * @param input the UTF-32 string to convert
1736 * @param length the length of the string in 4-byte words (char32_t)
1737 * @return the number of bytes required to encode the UTF-32 string as UTF-8
1738 */
1739 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept;
1740
1741 /**
1742 * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
1743 *
1744 * This function does not validate the input.
1745 *
1746 * @param input the UTF-32 string to convert
1747 * @param length the length of the string in 4-byte words (char32_t)
1748 * @return the number of two-byte words (char16_t) required to encode the UTF-32 string as UTF-16
1749 */
1750 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept;
1751
1752 /**
1753 * Using native endianness; Compute the number of 4-byte words (char32_t) that
1754 * this UTF-16 string would require in UTF-32 format.
1755 *
1756 * This function is equivalent to count_utf16.
1757 *
1758 * This function does not validate the input.
1759 *
1760 * This function is not BOM-aware.
1761 *
1762 * @param input the UTF-16 string to convert
1763 * @param length the length of the string in 2-byte words (char16_t)
1764 * @return the number of 4-byte words (char32_t) required to encode the UTF-16 string as UTF-32
1765 */
1766 simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept;
1767
1768 /**
1769 * Compute the number of 4-byte words (char32_t) that this UTF-16LE string would require in UTF-32 format.
1770 *
1771 * This function is equivalent to count_utf16le.
1772 *
1773 * This function does not validate the input.
1774 *
1775 * This function is not BOM-aware.
1776 *
1777 * @param input the UTF-16LE string to convert
1778 * @param length the length of the string in 2-byte words (char16_t)
1779 * @return the number of 4-byte words (char32_t) required to encode the UTF-16LE string as UTF-32
1780 */
1781 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept;
1782
1783 /**
1784 * Compute the number of 4-byte words (char32_t) that this UTF-16BE string would require in UTF-32 format.
1785 *
1786 * This function is equivalent to count_utf16be.
1787 *
1788 * This function does not validate the input.
1789 *
1790 * This function is not BOM-aware.
1791 *
1792 * @param input the UTF-16BE string to convert
1793 * @param length the length of the string in 2-byte words (char16_t)
1794 * @return the number of 4-byte words (char32_t) required to encode the UTF-16BE string as UTF-32
1795 */
1796 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept;
1797
1798 /**
1799 * Count the number of code points (characters) in the string assuming that
1800 * it is valid.
1801 *
1802 * This function assumes that the input string is valid UTF-16 (native endianness).
1803 *
1804 * This function is not BOM-aware.
1805 *
1806 * @param input the UTF-16 string to process
1807 * @param length the length of the string in 2-byte words (char16_t)
1808 * @return number of code points
1809 */
1810 simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept;
1811
1812 /**
1813 * Count the number of code points (characters) in the string assuming that
1814 * it is valid.
1815 *
1816 * This function assumes that the input string is valid UTF-16LE.
1817 *
1818 * This function is not BOM-aware.
1819 *
1820 * @param input the UTF-16LE string to process
1821 * @param length the length of the string in 2-byte words (char16_t)
1822 * @return number of code points
1823 */
1824 simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept;
1825
1826 /**
1827 * Count the number of code points (characters) in the string assuming that
1828 * it is valid.
1829 *
1830 * This function assumes that the input string is valid UTF-16BE.
1831 *
1832 * This function is not BOM-aware.
1833 *
1834 * @param input the UTF-16BE string to process
1835 * @param length the length of the string in 2-byte words (char16_t)
1836 * @return number of code points
1837 */
1838 simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept;
1839
1840 /**
1841 * Count the number of code points (characters) in the string assuming that
1842 * it is valid.
1843 *
1844 * This function assumes that the input string is valid UTF-8.
1845 *
1846 * @param input the UTF-8 string to process
1847 * @param length the length of the string in bytes
1848 * @return number of code points
1849 */
1850 simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept;
1851
1852 /**
1853 * An implementation of simdutf for a particular CPU architecture.
1854 *
1855 * Also used to maintain the currently active implementation. The active implementation is
1856 * automatically initialized on first use to the most advanced implementation supported by the host.
1857 */
1858 class implementation {
1859 public:
1860
1861 /**
1862 * The name of this implementation.
1863 *
1864 * const implementation *impl = simdutf::active_implementation;
1865 * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
1866 *
1867 * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
1868 */
name()1869 virtual const std::string &name() const { return _name; }
1870
1871 /**
1872 * The description of this implementation.
1873 *
1874 * const implementation *impl = simdutf::active_implementation;
1875 * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
1876 *
1877 * @return the description of the implementation (as opposed to its short name, which name() returns)
1878 */
description()1879 virtual const std::string &description() const { return _description; }
1880
1881 /**
1882 * Check whether the instruction sets this implementation is compiled against
1883 * match the current CPU. This function may poll the current CPU/system
1884 * and should therefore not be called too often if performance is a concern.
1885 *
1886 *
1887 * @return true if the implementation can be safely used on the current system (determined at runtime)
1888 */
1889 bool supported_by_runtime_system() const;
1890
1891 /**
1892 * This function will try to detect the encoding
1893 * @param input the string to identify
1894 * @param length the length of the string in bytes.
1895 * @return the encoding type detected
1896 */
1897 virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept;
1898
1899 /**
1900 * This function will try to detect the possible encodings in one pass
1901 * @param input the string to identify
1902 * @param length the length of the string in bytes.
1903 * @return the possible encodings detected, as an int (NOTE(review): presumably a combination of encoding_type values - confirm)
1904 */
1905 virtual int detect_encodings(const char * input, size_t length) const noexcept = 0;
1906
1907 /**
1908 * @private For internal implementation use
1909 *
1910 * The instruction sets this implementation is compiled against.
1911 *
1912 * @return a mask of all required `internal::instruction_set::` values
1913 */
required_instruction_sets()1914 virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }
1915
1916
1917 /**
1918 * Validate the UTF-8 string.
1919 *
1920 * Overridden by each implementation.
1921 *
1922 * @param buf the UTF-8 string to validate.
1923 * @param len the length of the string in bytes.
1924 * @return true if and only if the string is valid UTF-8.
1925 */
1926 simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
1927
1928 /**
1929 * Validate the UTF-8 string and stop on errors.
1930 *
1931 * Overridden by each implementation.
1932 *
1933 * @param buf the UTF-8 string to validate.
1934 * @param len the length of the string in bytes.
1935 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1936 */
1937 simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
1938
1939 /**
1940 * Validate the ASCII string.
1941 *
1942 * Overridden by each implementation.
1943 *
1944 * @param buf the ASCII string to validate.
1945 * @param len the length of the string in bytes.
1946 * @return true if and only if the string is valid ASCII.
1947 */
1948 simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0;
1949
1950 /**
1951 * Validate the ASCII string and stop on error.
1952 *
1953 * Overridden by each implementation.
1954 *
1955 * @param buf the ASCII string to validate.
1956 * @param len the length of the string in bytes.
1957 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
1958 */
1959 simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
1960
1961 /**
1962 * Validate the UTF-16LE string. This function may be best when you expect
1963 * the input to be almost always valid. Otherwise, consider using
1964 * validate_utf16le_with_errors.
1965 *
1966 * Overridden by each implementation.
1967 *
1968 * This function is not BOM-aware.
1969 *
1970 * @param buf the UTF-16LE string to validate.
1971 * @param len the length of the string in number of 2-byte words (char16_t).
1972 * @return true if and only if the string is valid UTF-16LE.
1973 */
1974 simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
1975
1976 /**
1977 * Validate the UTF-16BE string. This function may be best when you expect
1978 * the input to be almost always valid. Otherwise, consider using
1979 * validate_utf16be_with_errors.
1980 *
1981 * Overridden by each implementation.
1982 *
1983 * This function is not BOM-aware.
1984 *
1985 * @param buf the UTF-16BE string to validate.
1986 * @param len the length of the string in number of 2-byte words (char16_t).
1987 * @return true if and only if the string is valid UTF-16BE.
1988 */
1989 simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
1990
1991 /**
1992 * Validate the UTF-16LE string and stop on error. It might be faster than
1993 * validate_utf16le when an error is expected to occur early.
1994 *
1995 * Overridden by each implementation.
1996 *
1997 * This function is not BOM-aware.
1998 *
1999 * @param buf the UTF-16LE string to validate.
2000 * @param len the length of the string in number of 2-byte words (char16_t).
2001 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
2002 */
2003 simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2004
2005 /**
2006 * Validate the UTF-16BE string and stop on error. It might be faster than
2007 * validate_utf16be when an error is expected to occur early.
2008 *
2009 * Overridden by each implementation.
2010 *
2011 * This function is not BOM-aware.
2012 *
2013 * @param buf the UTF-16BE string to validate.
2014 * @param len the length of the string in number of 2-byte words (char16_t).
2015 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
2016 */
2017 simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2018
2019 /**
2020 * Validate the UTF-32 string.
2021 *
2022 * Overridden by each implementation.
2023 *
2024 * This function is not BOM-aware.
2025 *
2026 * @param buf the UTF-32 string to validate.
2027 * @param len the length of the string in number of 4-byte words (char32_t).
2028 * @return true if and only if the string is valid UTF-32.
2029 */
2030 simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
2031
2032 /**
2033 * Validate the UTF-32 string and stop on error.
2034 *
2035 * Overridden by each implementation.
2036 *
2037 * This function is not BOM-aware.
2038 *
2039 * @param buf the UTF-32 string to validate.
2040 * @param len the length of the string in number of 4-byte words (char32_t).
2041 * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
2042 */
2043 simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0;
2044
2045 /**
2046 * Convert possibly broken UTF-8 string into UTF-16LE string.
2047 *
2048 * During the conversion also validation of the input string is done.
2049 * This function is suitable to work with inputs from untrusted sources.
2050 *
2051 * @param input the UTF-8 string to convert
2052 * @param length the length of the string in bytes
2053 * @param utf16_output the pointer to buffer that can hold the conversion result
2054 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2055 */
2056 simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2057
2058 /**
2059 * Convert possibly broken UTF-8 string into UTF-16BE string.
2060 *
2061 * During the conversion also validation of the input string is done.
2062 * This function is suitable to work with inputs from untrusted sources.
2063 *
2064 * @param input the UTF-8 string to convert
2065 * @param length the length of the string in bytes
2066 * @param utf16_output the pointer to buffer that can hold the conversion result
2067 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2068 */
2069 simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2070
2071 /**
2072 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
2073 *
2074 * During the conversion also validation of the input string is done.
2075 * This function is suitable to work with inputs from untrusted sources.
2076 *
2077 * @param input the UTF-8 string to convert
2078 * @param length the length of the string in bytes
2079 * @param utf16_output the pointer to buffer that can hold the conversion result
2080 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
2081 */
2082 simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2083
2084 /**
2085 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
2086 *
2087 * During the conversion also validation of the input string is done.
2088 * This function is suitable to work with inputs from untrusted sources.
2089 *
2090 * @param input the UTF-8 string to convert
2091 * @param length the length of the string in bytes
2092 * @param utf16_output the pointer to buffer that can hold the conversion result
2093 * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
2094 */
2095 simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2096
2097 /**
2098 * Convert possibly broken UTF-8 string into UTF-32 string.
2099 *
2100 * During the conversion also validation of the input string is done.
2101 * This function is suitable to work with inputs from untrusted sources.
2102 *
2103 * @param input the UTF-8 string to convert
2104 * @param length the length of the string in bytes
2105 * @param utf32_output the pointer to buffer that can hold the conversion result
2106 * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
2107 */
2108 simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2109
2110 /**
2111 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
2112 *
2113 * During the conversion also validation of the input string is done.
2114 * This function is suitable to work with inputs from untrusted sources.
2115 *
2116 * @param input the UTF-8 string to convert
2117 * @param length the length of the string in bytes
2118 * @param utf32_output the pointer to buffer that can hold the conversion result
2119 * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
2120 */
2121 simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2122
2123 /**
2124 * Convert valid UTF-8 string into UTF-16LE string.
2125 *
2126 * This function assumes that the input string is valid UTF-8.
2127 *
2128 * @param input the UTF-8 string to convert
2129 * @param length the length of the string in bytes
2130 * @param utf16_buffer the pointer to buffer that can hold conversion result
2131 * @return the number of written char16_t
2132 */
2133 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2134
2135 /**
2136 * Convert valid UTF-8 string into UTF-16BE string.
2137 *
2138 * This function assumes that the input string is valid UTF-8.
2139 *
2140 * @param input the UTF-8 string to convert
2141 * @param length the length of the string in bytes
2142 * @param utf16_buffer the pointer to buffer that can hold conversion result
2143 * @return the number of written char16_t
2144 */
2145 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2146
2147 /**
2148 * Convert valid UTF-8 string into UTF-32 string.
2149 *
2150 * This function assumes that the input string is valid UTF-8.
2151 *
2152 * @param input the UTF-8 string to convert
2153 * @param length the length of the string in bytes
2154 * @param utf32_buffer the pointer to buffer that can hold the conversion result
2155 * @return the number of written char32_t
2156 */
2157 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2158
2159 /**
2160 * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
2161 *
2162 * This function does not validate the input.
2163 *
2164 * @param input the UTF-8 string to process
2165 * @param length the length of the string in bytes
2166 * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
2167 */
2168 simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2169
2170 /**
2171 * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format.
2172 *
2173 * This function is equivalent to count_utf8.
2174 *
2175 * This function does not validate the input.
2176 *
2177 * @param input the UTF-8 string to process
2178 * @param length the length of the string in bytes
2179 * @return the number of char32_t words required to encode the UTF-8 string as UTF-32
2180 */
2181 simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2182
2183 /**
2184 * Convert possibly broken UTF-16LE string into UTF-8 string.
2185 *
2186 * During the conversion also validation of the input string is done.
2187 * This function is suitable to work with inputs from untrusted sources.
2188 *
2189 * This function is not BOM-aware.
2190 *
2191 * @param input the UTF-16LE string to convert
2192 * @param length the length of the string in 2-byte words (char16_t)
2193 * @param utf8_buffer the pointer to buffer that can hold the conversion result
2194 * @return number of written words (char); 0 if input is not a valid UTF-16LE string
2195 */
2196 simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2197
2198 /**
2199 * Convert possibly broken UTF-16BE string into UTF-8 string.
2200 *
2201 * During the conversion also validation of the input string is done.
2202 * This function is suitable to work with inputs from untrusted sources.
2203 *
2204 * This function is not BOM-aware.
2205 *
2206 * @param input the UTF-16BE string to convert
2207 * @param length the length of the string in 2-byte words (char16_t)
2208 * @param utf8_buffer the pointer to buffer that can hold the conversion result
2209 * @return number of written words (char); 0 if input is not a valid UTF-16BE string
2210 */
2211 simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2212
2213 /**
2214 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2215 *
2216 * During the conversion also validation of the input string is done.
2217 * This function is suitable to work with inputs from untrusted sources.
2218 *
2219 * This function is not BOM-aware.
2220 *
2221 * @param input the UTF-16LE string to convert
2222 * @param length the length of the string in 2-byte words (char16_t)
2223 * @param utf8_buffer the pointer to buffer that can hold conversion result
2224 * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
2225 */
2226 simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2227
2228 /**
2229 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2230 *
2231 * During the conversion also validation of the input string is done.
2232 * This function is suitable to work with inputs from untrusted sources.
2233 *
2234 * This function is not BOM-aware.
2235 *
2236 * @param input the UTF-16BE string to convert
2237 * @param length the length of the string in 2-byte words (char16_t)
2238 * @param utf8_buffer the pointer to buffer that can hold conversion result
2239 * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
2240 */
2241 simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2242
2243 /**
2244 * Convert valid UTF-16LE string into UTF-8 string.
2245 *
2246 * This function assumes that the input string is valid UTF-16LE.
2247 *
2248 * This function is not BOM-aware.
2249 *
2250 * @param input the UTF-16LE string to convert
2251 * @param length the length of the string in 2-byte words (char16_t)
2252 * @param utf8_buffer the pointer to buffer that can hold the conversion result
2253 * @return number of written words (char); 0 if conversion is not possible
2254 */
2255 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2256
2257 /**
2258 * Convert valid UTF-16BE string into UTF-8 string.
2259 *
2260 * This function assumes that the input string is valid UTF-16BE.
2261 *
2262 * This function is not BOM-aware.
2263 *
2264 * @param input the UTF-16BE string to convert
2265 * @param length the length of the string in 2-byte words (char16_t)
2266 * @param utf8_buffer the pointer to buffer that can hold the conversion result
2267 * @return number of written words (char); 0 if conversion is not possible
2268 */
2269 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2270
2271 /**
2272 * Convert possibly broken UTF-16LE string into UTF-32 string.
2273 *
2274 * During the conversion also validation of the input string is done.
2275 * This function is suitable to work with inputs from untrusted sources.
2276 *
2277 * This function is not BOM-aware.
2278 *
2279 * @param input the UTF-16LE string to convert
2280 * @param length the length of the string in 2-byte words (char16_t)
2281 * @param utf32_buffer the pointer to buffer that can hold the conversion result
2282 * @return number of written 4-byte words (char32_t); 0 if input is not a valid UTF-16LE string
2283 */
2284 simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2285
2286 /**
2287 * Convert possibly broken UTF-16BE string into UTF-32 string.
2288 *
2289 * During the conversion also validation of the input string is done.
2290 * This function is suitable to work with inputs from untrusted sources.
2291 *
2292 * This function is not BOM-aware.
2293 *
2294 * @param input the UTF-16BE string to convert
2295 * @param length the length of the string in 2-byte words (char16_t)
2296 * @param utf32_buffer the pointer to buffer that can hold the conversion result
2297 * @return number of written 4-byte words (char32_t); 0 if input is not a valid UTF-16BE string
2298 */
2299 simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2300
2301 /**
2302 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2303 *
2304 * The input string is also validated during the conversion.
2305 * This function is suitable to work with inputs from untrusted sources.
2306 *
2307 * This function is not BOM-aware.
2308 *
2309 * @param input the UTF-16LE string to convert
2310 * @param length the length of the string in 2-byte words (char16_t)
2311 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
2312 * @return a result pair struct with an error code and either the position of the error, if any, or the number of char32_t written if successful.
2313 */
2314 simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2315
2316 /**
2317 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2318 *
2319 * The input string is also validated during the conversion.
2320 * This function is suitable to work with inputs from untrusted sources.
2321 *
2322 * This function is not BOM-aware.
2323 *
2324 * @param input the UTF-16BE string to convert
2325 * @param length the length of the string in 2-byte words (char16_t)
2326 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
2327 * @return a result pair struct with an error code and either the position of the error, if any, or the number of char32_t written if successful.
2328 */
2329 simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2330
2331 /**
2332 * Convert valid UTF-16LE string into UTF-32 string.
2333 *
2334 * This function assumes that the input string is valid UTF-16LE.
2335 *
2336 * This function is not BOM-aware.
2337 *
2338 * @param input the UTF-16LE string to convert
2339 * @param length the length of the string in 2-byte words (char16_t)
2340 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
2341 * @return number of written 4-byte words (char32_t); 0 if conversion is not possible
2342 */
2343 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2344
2345 /**
2346 * Convert valid UTF-16BE string into UTF-32 string.
2347 *
2348 * This function assumes that the input string is valid UTF-16BE.
2349 *
2350 * This function is not BOM-aware.
2351 *
2352 * @param input the UTF-16BE string to convert
2353 * @param length the length of the string in 2-byte words (char16_t)
2354 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
2355 * @return number of written 4-byte words (char32_t); 0 if conversion is not possible
2356 */
2357 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2358
2359 /**
2360 * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
2361 *
2362 * This function does not validate the input.
2363 *
2364 * This function is not BOM-aware.
2365 *
2366 * @param input the UTF-16LE string to measure (it is not converted)
2367 * @param length the length of the string in 2-byte words (char16_t)
2368 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2369 */
2370 simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
2371
2372 /**
2373 * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
2374 *
2375 * This function does not validate the input.
2376 *
2377 * This function is not BOM-aware.
2378 *
2379 * @param input the UTF-16BE string to measure (it is not converted)
2380 * @param length the length of the string in 2-byte words (char16_t)
2381 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2382 */
2383 simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
2384
2385 /**
2386 * Convert possibly broken UTF-32 string into UTF-8 string.
2387 *
2388 * The input string is also validated during the conversion.
2389 * This function is suitable to work with inputs from untrusted sources.
2390 *
2391 * This function is not BOM-aware.
2392 *
2393 * @param input the UTF-32 string to convert
2394 * @param length the length of the string in 4-byte words (char32_t)
2395 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
2396 * @return number of written bytes (char); 0 if input is not a valid UTF-32 string
2397 */
2398 simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2399
2400 /**
2401 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2402 *
2403 * The input string is also validated during the conversion.
2404 * This function is suitable to work with inputs from untrusted sources.
2405 *
2406 * This function is not BOM-aware.
2407 *
2408 * @param input the UTF-32 string to convert
2409 * @param length the length of the string in 4-byte words (char32_t)
2410 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
2411 * @return a result pair struct with an error code and either the position of the error, if any, or the number of char written if successful.
2412 */
2413 simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2414
2415 /**
2416 * Convert valid UTF-32 string into UTF-8 string.
2417 *
2418 * This function assumes that the input string is valid UTF-32.
2419 *
2420 * This function is not BOM-aware.
2421 *
2422 * @param input the UTF-32 string to convert
2423 * @param length the length of the string in 4-byte words (char32_t)
2424 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
2425 * @return number of written bytes (char); 0 if conversion is not possible
2426 */
2427 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2428
2429 /**
2430 * Convert possibly broken UTF-32 string into UTF-16LE string.
2431 *
2432 * The input string is also validated during the conversion.
2433 * This function is suitable to work with inputs from untrusted sources.
2434 *
2435 * This function is not BOM-aware.
2436 *
2437 * @param input the UTF-32 string to convert
2438 * @param length the length of the string in 4-byte words (char32_t)
2439 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2440 * @return number of written 2-byte words (char16_t); 0 if input is not a valid UTF-32 string
2441 */
2442 simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2443
2444 /**
2445 * Convert possibly broken UTF-32 string into UTF-16BE string.
2446 *
2447 * The input string is also validated during the conversion.
2448 * This function is suitable to work with inputs from untrusted sources.
2449 *
2450 * This function is not BOM-aware.
2451 *
2452 * @param input the UTF-32 string to convert
2453 * @param length the length of the string in 4-byte words (char32_t)
2454 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2455 * @return number of written 2-byte words (char16_t); 0 if input is not a valid UTF-32 string
2456 */
2457 simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2458
2459 /**
2460 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2461 *
2462 * The input string is also validated during the conversion.
2463 * This function is suitable to work with inputs from untrusted sources.
2464 *
2465 * This function is not BOM-aware.
2466 *
2467 * @param input the UTF-32 string to convert
2468 * @param length the length of the string in 4-byte words (char32_t)
2469 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2470 * @return a result pair struct with an error code and either the position of the error, if any, or the number of char16_t written if successful.
2471 */
2472 simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2473
2474 /**
2475 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2476 *
2477 * The input string is also validated during the conversion.
2478 * This function is suitable to work with inputs from untrusted sources.
2479 *
2480 * This function is not BOM-aware.
2481 *
2482 * @param input the UTF-32 string to convert
2483 * @param length the length of the string in 4-byte words (char32_t)
2484 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2485 * @return a result pair struct with an error code and either the position of the error, if any, or the number of char16_t written if successful.
2486 */
2487 simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2488
2489 /**
2490 * Convert valid UTF-32 string into UTF-16LE string.
2491 *
2492 * This function assumes that the input string is valid UTF-32.
2493 *
2494 * This function is not BOM-aware.
2495 *
2496 * @param input the UTF-32 string to convert
2497 * @param length the length of the string in 4-byte words (char32_t)
2498 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2499 * @return number of written 2-byte words (char16_t); 0 if conversion is not possible
2500 */
2501 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2502
2503 /**
2504 * Convert valid UTF-32 string into UTF-16BE string.
2505 *
2506 * This function assumes that the input string is valid UTF-32.
2507 *
2508 * This function is not BOM-aware.
2509 *
2510 * @param input the UTF-32 string to convert
2511 * @param length the length of the string in 4-byte words (char32_t)
2512 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2513 * @return number of written 2-byte words (char16_t); 0 if conversion is not possible
2514 */
2515 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2516
2517 /**
2518 * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
2519 * from UTF-16BE to UTF-16LE. The result is written to a separate output buffer.
2520 *
2521 * This function does not validate the input.
2522 *
2523 * This function is not BOM-aware.
2524 *
2525 * @param input the UTF-16 string to process
2526 * @param length the length of the string in 2-byte words (char16_t)
2527 * @param output the pointer to a buffer that can hold the result
2528 */
2529 virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0;
2530
2531 /**
2532 * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
2533 *
2534 * This function does not validate the input.
2535 *
2536 * @param input the UTF-32 string to measure (it is not converted)
2537 * @param length the length of the string in 4-byte words (char32_t)
2538 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2539 */
2540 simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
2541
2542 /**
2543 * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
2544 *
2545 * This function does not validate the input.
2546 *
2547 * @param input the UTF-32 string to measure (it is not converted)
2548 * @param length the length of the string in 4-byte words (char32_t)
2549 * @return the number of two-byte words (char16_t) required to encode the UTF-32 string as UTF-16
2550 */
2551 simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
2552
2553 /**
2554 * Compute the number of 4-byte words (char32_t) that this UTF-16LE string would require in UTF-32 format.
2555 *
2556 * This function is equivalent to count_utf16le, i.e. the result is a count of code points, not of bytes.
2557 *
2558 * This function does not validate the input.
2559 *
2560 * This function is not BOM-aware.
2561 *
2562 * @param input the UTF-16LE string to measure (it is not converted)
2563 * @param length the length of the string in 2-byte words (char16_t)
2564 * @return the number of 4-byte words (char32_t) required to encode the UTF-16LE string as UTF-32
2565 */
2566 simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
2567
2568 /**
2569 * Compute the number of 4-byte words (char32_t) that this UTF-16BE string would require in UTF-32 format.
2570 *
2571 * This function is equivalent to count_utf16be, i.e. the result is a count of code points, not of bytes.
2572 *
2573 * This function does not validate the input.
2574 *
2575 * This function is not BOM-aware.
2576 *
2577 * @param input the UTF-16BE string to measure (it is not converted)
2578 * @param length the length of the string in 2-byte words (char16_t)
2579 * @return the number of 4-byte words (char32_t) required to encode the UTF-16BE string as UTF-32
2580 */
2581 simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
2582
2583 /**
2584 * Count the number of code points (characters) in the string, assuming that
2585 * it is valid.
2586 *
2587 * This function assumes that the input string is valid UTF-16LE.
2588 *
2589 * This function is not BOM-aware.
2590 *
2591 * @param input the UTF-16LE string to process
2592 * @param length the length of the string in 2-byte words (char16_t)
2593 * @return number of code points in the input
2594 */
2595 simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0;
2596
2597 /**
2598 * Count the number of code points (characters) in the string, assuming that
2599 * it is valid.
2600 *
2601 * This function assumes that the input string is valid UTF-16BE.
2602 *
2603 * This function is not BOM-aware.
2604 *
2605 * @param input the UTF-16BE string to process
2606 * @param length the length of the string in 2-byte words (char16_t)
2607 * @return number of code points in the input
2608 */
2609 simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0;
2610
2611
2612 /**
2613 * Count the number of code points (characters) in the string, assuming that
2614 * it is valid.
2615 *
2616 * This function assumes that the input string is valid UTF-8.
2617 *
2618 * @param input the UTF-8 string to process
2619 * @param length the length of the string in bytes
2620 * @return number of code points in the input
2621 */
2622 simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0;
2623
2624
2625
2626 protected:
2627 /** @private Construct an implementation with the given name and description. For subclasses. */
implementation(std::string name,std::string description,uint32_t required_instruction_sets)2628 simdutf_really_inline implementation(
2629 std::string name,
2630 std::string description,
2631 uint32_t required_instruction_sets
2632 ) :
2633 _name(name),
2634 _description(description),
2635 _required_instruction_sets(required_instruction_sets)
2636 {
2637 }
/** Defaulted virtual destructor; it is declared in the protected section of the class. */
2638 virtual ~implementation()=default;
2639
2640 private:
2641 /**
2642 * The name of this implementation. Const: set by the constructor and never modified afterwards.
2643 */
2644 const std::string _name;
2645
2646 /**
2647 * The description of this implementation. Const: set by the constructor and never modified afterwards.
2648 */
2649 const std::string _description;
2650
2651 /**
2652 * Instruction sets required for this implementation, as passed to the constructor.
2653 * NOTE(review): presumably a bit set of instruction-set flags -- confirm against the flag definitions.
2654 */
2654 const uint32_t _required_instruction_sets;
2655 };
2656
2657 /** @private */
2658 namespace internal {
2659
2660 /**
2661 * The list of available implementations compiled into simdutf.
2662 */
2663 class available_implementation_list {
2664 public:
2665 /** Get the list of available implementations compiled into simdutf */
available_implementation_list()2666 simdutf_really_inline available_implementation_list() {}
2667 /** Number of implementations */
2668 size_t size() const noexcept;
2669 /** STL const begin() iterator */
2670 const implementation * const *begin() const noexcept;
2671 /** STL const end() iterator */
2672 const implementation * const *end() const noexcept;
2673
2674 /**
2675 * Get the implementation with the given name.
2676 *
2677 * Case sensitive.
2678 *
2679 * const implementation *impl = simdutf::available_implementations["westmere"];
2680 * if (!impl) { exit(1); }
2681 * if (!imp->supported_by_runtime_system()) { exit(1); }
2682 * simdutf::active_implementation = impl;
2683 *
2684 * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
2685 * @return the implementation, or nullptr if the parse failed.
2686 */
2687 const implementation * operator[](const std::string &name) const noexcept {
2688 for (const implementation * impl : *this) {
2689 if (impl->name() == name) { return impl; }
2690 }
2691 return nullptr;
2692 }
2693
2694 /**
2695 * Detect the most advanced implementation supported by the current host.
2696 *
2697 * This is used to initialize the implementation on startup.
2698 *
2699 * const implementation *impl = simdutf::available_implementation::detect_best_supported();
2700 * simdutf::active_implementation = impl;
2701 *
2702 * @return the most advanced supported implementation for the current host, or an
2703 * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
2704 * implementation. Will never return nullptr.
2705 */
2706 const implementation *detect_best_supported() const noexcept;
2707 };
2708
/**
 * Thin holder for a T* that can be read and reassigned.
 *
 * When SIMDUTF_NO_THREADS is defined the pointer is stored as a plain T*;
 * otherwise it is stored in a std::atomic<T*> and every read goes through
 * load() and every assignment through the atomic's assignment operator
 * (both with the default memory order).
 *
 * All pointer reads are funnelled through the private current() accessor so
 * that the public operators are written once for both configurations.
 */
template<typename T>
class atomic_ptr {
public:
  /** Wrap the given raw pointer (implicit conversion from T* is intentional). */
  atomic_ptr(T *_ptr) noexcept : ptr{_ptr} {}

  /** Read-only access to the currently stored pointer. */
  operator const T*() const noexcept { return current(); }
  const T& operator*() const noexcept { return *current(); }
  const T* operator->() const noexcept { return current(); }

  /** Mutable access to the currently stored pointer. */
  operator T*() noexcept { return current(); }
  T& operator*() noexcept { return *current(); }
  T* operator->() noexcept { return current(); }

  /** Replace the stored pointer and return *this. */
  atomic_ptr& operator=(T *_ptr) noexcept { ptr = _ptr; return *this; }

private:
  /** Single point where the stored pointer is read. */
  T* current() const noexcept {
#if defined(SIMDUTF_NO_THREADS)
    return ptr;
#else
    return ptr.load();
#endif
  }

#if defined(SIMDUTF_NO_THREADS)
  T* ptr;
#else
  std::atomic<T*> ptr;
#endif
};
2743
2744 class detect_best_supported_implementation_on_first_use;
2745
2746 } // namespace internal
2747
2748 /**
2749 * Get the list of available implementations compiled into simdutf (returned by const reference).
2750 */
2751 extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations();
2752
2753 /**
2754 * Get the active implementation, returned as a reference to the atomic_ptr that holds it.
2755 *
2756 * Automatically initialized on first use to the most advanced implementation supported by this hardware.
2757 */
2758 extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation();
2759
2760
2761 } // namespace simdutf
2762
2763 #endif // SIMDUTF_IMPLEMENTATION_H
2764 /* end file include/simdutf/implementation.h */
2765
2766
2767 // Implementation-internal files (must be included before the implementations themselves, to keep
2768 // amalgamation working--otherwise, the first time a file is included, it might be put inside the
2769 // #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't
2770 // compile unless that implementation is turned on).
2771
2772
2773 SIMDUTF_POP_DISABLE_WARNINGS
2774
2775 #endif // SIMDUTF_H
2776 /* end file include/simdutf.h */
2777