1 /* SwapBytes.c -- Byte Swap conversion filter
2 2023-04-07 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #include "Compiler.h"
7 #include "CpuArch.h"
8 #include "RotateDefs.h"
9 #include "SwapBytes.h"
10 
11 typedef UInt16 CSwapUInt16;
12 typedef UInt32 CSwapUInt32;
13 
14 // #define k_SwapBytes_Mode_BASE   0
15 
16 #ifdef MY_CPU_X86_OR_AMD64
17 
18 #define k_SwapBytes_Mode_SSE2   1
19 #define k_SwapBytes_Mode_SSSE3  2
20 #define k_SwapBytes_Mode_AVX2   3
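// Larger mode values correspond to newer instruction-set extensions; the runtime
// init code below (z7_SwapBytesPrepare) selects the highest mode the CPU supports.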
21 
22   // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
23   #if defined(__clang__) && (__clang_major__ >= 4) \
24       || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
25       #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_AVX2
26       #define SWAP_ATTRIB_SSE2  __attribute__((__target__("sse2")))
27       #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3")))
28       #define SWAP_ATTRIB_AVX2  __attribute__((__target__("avx2")))
29   #elif defined(_MSC_VER)
30     #if (_MSC_VER == 1900)
31       #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
32     #endif
33     #if (_MSC_VER >= 1900)
34       #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_AVX2
35     #elif (_MSC_VER >= 1500)  // (VS2008)
36       #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_SSSE3
37     #elif (_MSC_VER >= 1310)  // (VS2003)
38       #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_SSE2
39     #endif
40   #endif // _MSC_VER
41 
42 /*
43 // for debug
44 #ifdef k_SwapBytes_Mode_MAX
45 #undef k_SwapBytes_Mode_MAX
46 #endif
47 */
48 
49 #ifndef k_SwapBytes_Mode_MAX
50 #define k_SwapBytes_Mode_MAX 0
51 #endif
52 
53 #if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64)
54   #define k_SwapBytes_Mode_MIN  k_SwapBytes_Mode_SSE2
55 #else
56   #define k_SwapBytes_Mode_MIN  0
57 #endif
58 
59 #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2)
60   #define USE_SWAP_AVX2
61 #endif
62 #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3)
63   #define USE_SWAP_SSSE3
64 #endif
65 #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2)
66   #define USE_SWAP_128
67 #endif
68 
69 #if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128)
70 #define FORCE_SWAP_MODE
71 #endif
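// When FORCE_SWAP_MODE is defined, only a single implementation is compiled in,
// so no runtime CPU-feature dispatch (g_SwapBytes_Mode) is needed.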
72 
73 
74 #ifdef USE_SWAP_128
75 /*
76  <mmintrin.h> MMX
77 <xmmintrin.h> SSE
78 <emmintrin.h> SSE2
79 <pmmintrin.h> SSE3
80 <tmmintrin.h> SSSE3
81 <smmintrin.h> SSE4.1
82 <nmmintrin.h> SSE4.2
83 <ammintrin.h> SSE4A
84 <wmmintrin.h> AES
85 <immintrin.h> AVX, AVX2, FMA
86 */
87 
88 #include <emmintrin.h> // sse2
89 // typedef __m128i v128;
90 
91 #define SWAP2_128(i) { \
92   const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \
93                     *(      __m128i *)(      void *)(items + (i) * 8) = \
94     _mm_or_si128( \
95       _mm_slli_epi16(v, 8), \
96       _mm_srli_epi16(v, 8)); }
97 // _mm_or_si128() can execute on more ports than _mm_add_epi16().
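// SWAP2_128(i) byte-swaps eight 16-bit items in one 16-byte vector: the vector is
// shifted left and right by 8 bits within each 16-bit lane and the halves are OR-ed.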
98 
99 static
100 #ifdef SWAP_ATTRIB_SSE2
101 SWAP_ATTRIB_SSE2
102 #endif
103 void
104 Z7_FASTCALL
105 SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
106 {
107   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
108   do
109   {
110     SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
111     SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
112   }
113   while (items != lim);
114 }
115 
116 /*
117 // sse2
118 #define SWAP4_128_pack(i) { \
119   __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
120   __m128i v0 = _mm_unpacklo_epi8(v, mask); \
121   __m128i v1 = _mm_unpackhi_epi8(v, mask); \
122   v0 = _mm_shufflelo_epi16(v0, 0x1b); \
123   v1 = _mm_shufflelo_epi16(v1, 0x1b); \
124   v0 = _mm_shufflehi_epi16(v0, 0x1b); \
125   v1 = _mm_shufflehi_epi16(v1, 0x1b); \
126   *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); }
127 
128 static
129 #ifdef SWAP_ATTRIB_SSE2
130 SWAP_ATTRIB_SSE2
131 #endif
132 void
133 Z7_FASTCALL
134 SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim)
135 {
136   const __m128i mask = _mm_setzero_si128();
137   // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
138   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
139   do
140   {
141     SWAP4_128_pack(0); items += 1 * 4;
142     // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4;
143   }
144   while (items != lim);
145 }
146 
147 // sse2
148 #define SWAP4_128_shift(i) { \
149   __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
150   __m128i v2; \
151   v2 = _mm_or_si128( \
152         _mm_slli_si128(_mm_and_si128(v, mask), 1), \
153         _mm_and_si128(_mm_srli_si128(v, 1), mask)); \
154   v = _mm_or_si128( \
155         _mm_slli_epi32(v, 24), \
156         _mm_srli_epi32(v, 24)); \
157   *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); }
158 
159 static
160 #ifdef SWAP_ATTRIB_SSE2
161 SWAP_ATTRIB_SSE2
162 #endif
163 void
164 Z7_FASTCALL
165 SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim)
166 {
167   #define M1 0xff00
168   const __m128i mask = _mm_set_epi32(M1, M1, M1, M1);
169   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
170   do
171   {
172     // SWAP4_128_shift(0)  SWAP4_128_shift(1)  items += 2 * 4;
173     // SWAP4_128_shift(0)  SWAP4_128_shift(1)  items += 2 * 4;
174     SWAP4_128_shift(0); items += 1 * 4;
175   }
176   while (items != lim);
177 }
178 */
179 
180 
181 #if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2)
182 
183 #define SWAP_SHUF_REV_SEQ_2_VALS(v)                (v)+1, (v)
184 #define SWAP_SHUF_REV_SEQ_4_VALS(v)  (v)+3, (v)+2, (v)+1, (v)
185 
186 #define SWAP2_SHUF_MASK_16_BYTES \
187     SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \
188     SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \
189     SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \
190     SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \
191     SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \
192     SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \
193     SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \
194     SWAP_SHUF_REV_SEQ_2_VALS (7 * 2)
195 
196 #define SWAP4_SHUF_MASK_16_BYTES \
197     SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \
198     SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \
199     SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \
200     SWAP_SHUF_REV_SEQ_4_VALS (3 * 4)
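// These macros expand to 16-byte shuffle masks (for pshufb / vpshufb) that reverse
// the byte order within every 2-byte (Swap2) or 4-byte (Swap4) lane of a vector.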
201 
202 #if defined(USE_SWAP_AVX2)
203 /* if we use SWAP_USE_256_BIT_INIT_MASK, each static mask array will be 16 bytes larger */
204 // #define SWAP_USE_256_BIT_INIT_MASK
205 #endif
206 
207 #if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2)
208 #define SWAP_MASK_INIT_SIZE 32
209 #else
210 #define SWAP_MASK_INIT_SIZE 16
211 #endif
212 
213 MY_ALIGN(SWAP_MASK_INIT_SIZE)
214 static const Byte k_ShufMask_Swap2[] =
215 {
216     SWAP2_SHUF_MASK_16_BYTES
217   #if SWAP_MASK_INIT_SIZE > 16
218   , SWAP2_SHUF_MASK_16_BYTES
219   #endif
220 };
221 
222 MY_ALIGN(SWAP_MASK_INIT_SIZE)
223 static const Byte k_ShufMask_Swap4[] =
224 {
225     SWAP4_SHUF_MASK_16_BYTES
226   #if SWAP_MASK_INIT_SIZE > 16
227   , SWAP4_SHUF_MASK_16_BYTES
228   #endif
229 };
230 
231 
232 #ifdef USE_SWAP_SSSE3
233 
234 #include <tmmintrin.h> // ssse3
235 
236 #define SHUF_128(i)   *(items + (i)) = \
237      _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3
238 
239 // Z7_NO_INLINE
240 static
241 #ifdef SWAP_ATTRIB_SSSE3
242 SWAP_ATTRIB_SSSE3
243 #endif
244 Z7_ATTRIB_NO_VECTORIZE
245 void
246 Z7_FASTCALL
247 ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr)
248 {
249   __m128i *items = (__m128i *)items8;
250   const __m128i *lim = (const __m128i *)lim8;
251   // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS);
252   // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS);
253   // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
254   // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
255   // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]);
256   const __m128i mask = *(const __m128i *)mask128_ptr;
257   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
258   do
259   {
260     SHUF_128(0)  SHUF_128(1)  items += 2;
261     SHUF_128(0)  SHUF_128(1)  items += 2;
262   }
263   while (items != lim);
264 }
265 
266 #endif // USE_SWAP_SSSE3
267 
268 
269 
270 #ifdef USE_SWAP_AVX2
271 
272 #include <immintrin.h> // avx, avx2
273 #if defined(__clang__)
274 #include <avxintrin.h>
275 #include <avx2intrin.h>
276 #endif
277 
278 #define SHUF_256(i)   *(items + (i)) = \
279   _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2
280 
281 // Z7_NO_INLINE
282 static
283 #ifdef SWAP_ATTRIB_AVX2
284 SWAP_ATTRIB_AVX2
285 #endif
286 Z7_ATTRIB_NO_VECTORIZE
287 void
288 Z7_FASTCALL
289 ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
290 {
291   __m256i *items = (__m256i *)items8;
292   const __m256i *lim = (const __m256i *)lim8;
293   /*
294   UNUSED_VAR(mask128_ptr)
295   __m256i mask =
296   for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES);
297   for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES);
298   */
299   const __m256i mask =
300  #if SWAP_MASK_INIT_SIZE > 16
301       *(const __m256i *)(const void *)mask128_ptr;
302  #else
303   /* msvc: the broadcastsi128() version reserves stack space for no reason
304      msvc 19.29 and older: the _mm256_insertf128_si256() / _mm256_set_m128i() versions use non-avx movdqu   xmm0,XMMWORD PTR [r8]
305      msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) with vbroadcastf128(m), as we want
306   */
307   // _mm256_broadcastsi128_si256(*mask128_ptr);
308   /*
309   #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
310   MY_mm256_set_m128i
311   */
312       _mm256_set_m128i(
313         *(const __m128i *)mask128_ptr,
314         *(const __m128i *)mask128_ptr);
315  #endif
316 
317   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
318   do
319   {
320     SHUF_256(0)  SHUF_256(1)  items += 2;
321     SHUF_256(0)  SHUF_256(1)  items += 2;
322   }
323   while (items != lim);
324 }
325 
326 #endif // USE_SWAP_AVX2
327 #endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2
328 #endif // USE_SWAP_128
329 
330 
331 
332 // compile message "NEON intrinsics not available with the soft-float ABI"
333 #elif defined(MY_CPU_ARM_OR_ARM64) || \
334     (defined(__ARM_ARCH) && (__ARM_ARCH >= 7))
335 // #elif defined(MY_CPU_ARM64)
336 
337   #if defined(__clang__) && (__clang_major__ >= 8) \
338     || defined(__GNUC__) && (__GNUC__ >= 8)
339     #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \
340         || defined(MY_CPU_ARM64)
341       #define USE_SWAP_128
342     #endif
343     #ifdef MY_CPU_ARM64
344       // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
345     #else
346       // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
347     #endif
348   #elif defined(_MSC_VER)
349     #if (_MSC_VER >= 1910)
350       #define USE_SWAP_128
351     #endif
352   #endif
353 
354   #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
355     #include <arm64_neon.h>
356   #else
357     #include <arm_neon.h>
358   #endif
359 
360 #ifndef USE_SWAP_128
361   #define FORCE_SWAP_MODE
362 #else
363 
364 #ifdef MY_CPU_ARM64
365   // for debug : comment out the next line
366   #define FORCE_SWAP_MODE
367 #else
368   #define k_SwapBytes_Mode_NEON 1
369 #endif
370 // typedef uint8x16_t v128;
371 #define SWAP2_128(i)   *(uint8x16_t *)      (void *)(items + (i) * 8) = \
372       vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8));
373 #define SWAP4_128(i)   *(uint8x16_t *)      (void *)(items + (i) * 4) = \
374       vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4));
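// vrev16q_u8 / vrev32q_u8 reverse the byte order inside every 16-bit / 32-bit element
// of a 128-bit NEON register, so each macro swaps one 16-byte vector per instruction.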
375 
376 // Z7_NO_INLINE
377 static
378 #ifdef SWAP_ATTRIB_NEON
379 SWAP_ATTRIB_NEON
380 #endif
381 Z7_ATTRIB_NO_VECTORIZE
382 void
383 Z7_FASTCALL
384 SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
385 {
386   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
387   do
388   {
389     SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
390     SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
391   }
392   while (items != lim);
393 }
394 
395 // Z7_NO_INLINE
396 static
397 #ifdef SWAP_ATTRIB_NEON
398 SWAP_ATTRIB_NEON
399 #endif
400 Z7_ATTRIB_NO_VECTORIZE
401 void
402 Z7_FASTCALL
403 SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim)
404 {
405   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
406   do
407   {
408     SWAP4_128(0)  SWAP4_128(1)  items += 2 * 4;
409     SWAP4_128(0)  SWAP4_128(1)  items += 2 * 4;
410   }
411   while (items != lim);
412 }
413 
414 #endif // USE_SWAP_128
415 
416 #else // MY_CPU_ARM_OR_ARM64
417 #define FORCE_SWAP_MODE
418 #endif // MY_CPU_ARM_OR_ARM64
419 
420 
421 
422 
423 
424 
425 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86)
426   /* _byteswap_ushort() in 32-bit MSVC x86 compiles to a slow { mov dh, al; mov dl, ah } sequence,
427      so we use our own byte-swap versions here. */
428   #if (_MSC_VER < 1400 )  // old MSVC-X86 without _rotr16() support
429     #define SWAP2_16(i)  { UInt32 v = items[i];  v += (v << 16);  v >>= 8;  items[i] = (CSwapUInt16)v; }
430   #else  // is new MSVC-X86 with fast _rotr16()
431     #include <intrin.h>
432     #define SWAP2_16(i)  { items[i] = _rotr16(items[i], 8); }
433   #endif
434 #else  // is not MSVC-X86
435   #define SWAP2_16(i)  { CSwapUInt16 v = items[i];  items[i] = Z7_BSWAP16(v); }
436 #endif  // MSVC-X86
437 
438 #if defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
439   #define SWAP4_32(i)  { CSwapUInt32 v = items[i];  items[i] = Z7_BSWAP32(v); }
440 #else
441   #define SWAP4_32(i)  \
442     { UInt32 v = items[i]; \
443       v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \
444       v = rotlFixed(v, 16); \
445       items[i] = v; }
446 #endif
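// SWAP2_16(i) / SWAP4_32(i) are scalar single-item byte-swap macros; z7_SwapBytes2/4
// use them for the unaligned head and tail of each buffer.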
447 
448 
449 
450 
451 #if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128)
452   #define DEFAULT_Swap2  SwapBytes2_128
453   #if !defined(MY_CPU_X86_OR_AMD64)
454     #define DEFAULT_Swap4  SwapBytes4_128
455   #endif
456 #endif
457 
458 #if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
459 
460 #define SWAP_BASE_FUNCS_PREFIXES \
461 Z7_FORCE_INLINE  \
462 static \
463 Z7_ATTRIB_NO_VECTOR  \
464 void Z7_FASTCALL
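// Common prefix for the scalar fallback loops below: force-inlined, static,
// auto-vectorization disabled, fastcall calling convention.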
465 
466 
467 #ifdef MY_CPU_64BIT
468 
469 #if defined(MY_CPU_ARM64) \
470     && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \
471     && (  (defined(__GNUC__) && (__GNUC__ >= 4)) \
472        || (defined(__clang__) && (__clang_major__ >= 4)))
473 
474   #define SWAP2_64_VAR(v)  asm ("rev16 %x0,%x0" : "+r" (v));
475   #define SWAP4_64_VAR(v)  asm ("rev32 %x0,%x0" : "+r" (v));
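  // rev16 / rev32 reverse the byte order within every 16-bit / 32-bit field of a
  // 64-bit register, so one instruction swaps 4 or 2 items respectively.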
476 
477 #else  // is not ARM64-GNU
478 
479 #if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128)
480   #define SWAP2_64_VAR(v) \
481     v = ( 0x00ff00ff00ff00ff & (v >> 8))  \
482       + ((0x00ff00ff00ff00ff & v) << 8);
483       /* using + (instead of |) gives faster code in MSVC */
484 #endif
485 
486 #ifdef Z7_CPU_FAST_BSWAP_SUPPORTED
487   #define SWAP4_64_VAR(v) \
488     v = Z7_BSWAP64(v); \
489     v = Z7_ROTL64(v, 32);
490 #else
491   #define SWAP4_64_VAR(v) \
492     v = ( 0x000000ff000000ff & (v >> 24))  \
493       + ((0x000000ff000000ff & v) << 24 )  \
494       + ( 0x0000ff000000ff00 & (v >>  8))  \
495       + ((0x0000ff000000ff00 & v) <<  8 )  \
496       ;
497 #endif
498 
499 #endif  // ARM64-GNU
500 
501 
502 #ifdef SWAP2_64_VAR
503 
504 #define SWAP2_64(i) { \
505     UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \
506     SWAP2_64_VAR(v) \
507     *(UInt64 *)(void *)(items + (i) * 4) = v; }
508 
509 SWAP_BASE_FUNCS_PREFIXES
510 SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim)
511 {
512   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
513   do
514   {
515     SWAP2_64(0)  SWAP2_64(1)  items += 2 * 4;
516     SWAP2_64(0)  SWAP2_64(1)  items += 2 * 4;
517   }
518   while (items != lim);
519 }
520 
521   #define DEFAULT_Swap2  SwapBytes2_64
522   #if !defined(FORCE_SWAP_MODE)
523     #define SWAP2_DEFAULT_MODE 0
524   #endif
525 #else // !defined(SWAP2_64_VAR)
526   #define DEFAULT_Swap2  SwapBytes2_128
527   #if !defined(FORCE_SWAP_MODE)
528     #define SWAP2_DEFAULT_MODE 1
529   #endif
530 #endif // SWAP2_64_VAR
531 
532 
533 #define SWAP4_64(i) { \
534     UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \
535     SWAP4_64_VAR(v) \
536     *(UInt64 *)(void *)(items + (i) * 2) = v; }
537 
538 SWAP_BASE_FUNCS_PREFIXES
539 SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim)
540 {
541   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
542   do
543   {
544     SWAP4_64(0)  SWAP4_64(1)  items += 2 * 2;
545     SWAP4_64(0)  SWAP4_64(1)  items += 2 * 2;
546   }
547   while (items != lim);
548 }
549 
550 #define DEFAULT_Swap4  SwapBytes4_64
551 
552 #else  // is not 64BIT
553 
554 
555 #if defined(MY_CPU_ARM_OR_ARM64) \
556     && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \
557     && (  (defined(__GNUC__) && (__GNUC__ >= 4)) \
558        || (defined(__clang__) && (__clang_major__ >= 4)))
559 
560 #ifdef MY_CPU_64BIT
561   #define SWAP2_32_VAR(v)  asm ("rev16 %w0,%w0" : "+r" (v));
562 #else
563   #define SWAP2_32_VAR(v)  asm ("rev16 %0,%0" : "+r" (v)); // for clang/gcc
564     // asm ("rev16 %r0,%r0" : "+r" (a));  // for gcc
565 #endif
566 
567 #elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \
568     || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \
569     || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED)
570   // old msvc doesn't support _byteswap_ulong()
571   #define SWAP2_32_VAR(v) \
572     v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff);
573 
574 #else  // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported
575   #define SWAP2_32_VAR(v) \
576     v = Z7_BSWAP32(v); \
577     v = rotlFixed(v, 16);
578 
579 #endif  // GNU-ARM*
580 
581 #define SWAP2_32(i) { \
582     UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \
583     SWAP2_32_VAR(v); \
584     *(UInt32 *)(void *)(items + (i) * 2) = v; }
585 
586 
587 SWAP_BASE_FUNCS_PREFIXES
588 SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim)
589 {
590   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
591   do
592   {
593     SWAP2_32(0)  SWAP2_32(1)  items += 2 * 2;
594     SWAP2_32(0)  SWAP2_32(1)  items += 2 * 2;
595   }
596   while (items != lim);
597 }
598 
599 
600 SWAP_BASE_FUNCS_PREFIXES
601 SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim)
602 {
603   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
604   do
605   {
606     SWAP4_32(0)  SWAP4_32(1)  items += 2;
607     SWAP4_32(0)  SWAP4_32(1)  items += 2;
608   }
609   while (items != lim);
610 }
611 
612 #define DEFAULT_Swap2  SwapBytes2_32
613 #define DEFAULT_Swap4  SwapBytes4_32
614 #if !defined(FORCE_SWAP_MODE)
615   #define SWAP2_DEFAULT_MODE 0
616 #endif
617 
618 #endif // MY_CPU_64BIT
619 #endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
620 
621 
622 
623 #if !defined(FORCE_SWAP_MODE)
624 static unsigned g_SwapBytes_Mode;
625 #endif
626 
627 /* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */
628 #define SWAP_ITERATION_BLOCK_SIZE_MAX  (1 << 7)
629 
630 // 32 bytes for AVX, or 2 * 16 bytes for NEON.
631 #define SWAP_VECTOR_ALIGN_SIZE  (1 << 5)
632 
633 Z7_NO_INLINE
634 void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems)
635 {
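  // Phase 1: scalar loop until the item pointer is SWAP_VECTOR_ALIGN_SIZE-aligned;
  // phase 2: vector/unrolled kernel over whole SWAP_ITERATION_BLOCK_SIZE_MAX blocks;
  // phase 3: scalar loop for the remaining items.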
636   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
637   for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
638   {
639     SWAP2_16(0)
640     items++;
641   }
642   {
643     const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1;
644     size_t numItems2 = numItems;
645     CSwapUInt16 *lim;
646     numItems &= k_Align_Mask;
647     numItems2 &= ~(size_t)k_Align_Mask;
648     lim = items + numItems2;
649     if (numItems2 != 0)
650     {
651      #if !defined(FORCE_SWAP_MODE)
652       #ifdef MY_CPU_X86_OR_AMD64
653         #ifdef USE_SWAP_AVX2
654           if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
655             ShufBytes_256((__m256i *)(void *)items,
656                 (const __m256i *)(const void *)lim,
657                 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
658           else
659         #endif
660         #ifdef USE_SWAP_SSSE3
661           if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
662             ShufBytes_128((__m128i *)(void *)items,
663                 (const __m128i *)(const void *)lim,
664                 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
665           else
666         #endif
667       #endif  // MY_CPU_X86_OR_AMD64
668       #if SWAP2_DEFAULT_MODE == 0
669           if (g_SwapBytes_Mode != 0)
670             SwapBytes2_128(items, lim);
671           else
672       #endif
673      #endif // FORCE_SWAP_MODE
674             DEFAULT_Swap2(items, lim);
675     }
676     items = lim;
677   }
678   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
679   for (; numItems != 0; numItems--)
680   {
681     SWAP2_16(0)
682     items++;
683   }
684 }
685 
686 
687 Z7_NO_INLINE
688 void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems)
689 {
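  // Same three-phase layout as z7_SwapBytes2, but for 32-bit items.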
690   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
691   for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
692   {
693     SWAP4_32(0)
694     items++;
695   }
696   {
697     const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1;
698     size_t numItems2 = numItems;
699     CSwapUInt32 *lim;
700     numItems &= k_Align_Mask;
701     numItems2 &= ~(size_t)k_Align_Mask;
702     lim = items + numItems2;
703     if (numItems2 != 0)
704     {
705      #if !defined(FORCE_SWAP_MODE)
706       #ifdef MY_CPU_X86_OR_AMD64
707         #ifdef USE_SWAP_AVX2
708           if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
709             ShufBytes_256((__m256i *)(void *)items,
710                 (const __m256i *)(const void *)lim,
711                 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
712           else
713         #endif
714         #ifdef USE_SWAP_SSSE3
715           if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
716             ShufBytes_128((__m128i *)(void *)items,
717                 (const __m128i *)(const void *)lim,
718                 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
719           else
720         #endif
721       #else  // MY_CPU_X86_OR_AMD64
722 
723           if (g_SwapBytes_Mode != 0)
724             SwapBytes4_128(items, lim);
725           else
726       #endif  // MY_CPU_X86_OR_AMD64
727      #endif // FORCE_SWAP_MODE
728             DEFAULT_Swap4(items, lim);
729     }
730     items = lim;
731   }
732   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
733   for (; numItems != 0; numItems--)
734   {
735     SWAP4_32(0)
736     items++;
737   }
738 }
739 
740 
741 // #define SHOW_HW_STATUS
742 
743 #ifdef SHOW_HW_STATUS
744 #include <stdio.h>
745 #define PRF(x) x
746 #else
747 #define PRF(x)
748 #endif
749 
750 void z7_SwapBytesPrepare(void)
751 {
752 #ifndef FORCE_SWAP_MODE
753   unsigned mode = 0; // k_SwapBytes_Mode_BASE;
754 
755 #ifdef MY_CPU_ARM_OR_ARM64
756   {
757     if (CPU_IsSupported_NEON())
758     {
759       // #pragma message ("=== SwapBytes NEON")
760       PRF(printf("\n=== SwapBytes NEON\n");)
761       mode = k_SwapBytes_Mode_NEON;
762     }
763   }
764 #else // MY_CPU_ARM_OR_ARM64
765   {
766     #ifdef USE_SWAP_AVX2
767       if (CPU_IsSupported_AVX2())
768       {
769         // #pragma message ("=== SwapBytes AVX2")
770         PRF(printf("\n=== SwapBytes AVX2\n");)
771         mode = k_SwapBytes_Mode_AVX2;
772       }
773       else
774     #endif
775     #ifdef USE_SWAP_SSSE3
776       if (CPU_IsSupported_SSSE3())
777       {
778         // #pragma message ("=== SwapBytes SSSE3")
779         PRF(printf("\n=== SwapBytes SSSE3\n");)
780         mode = k_SwapBytes_Mode_SSSE3;
781       }
782       else
783     #endif
784     #if !defined(MY_CPU_AMD64)
785       if (CPU_IsSupported_SSE2())
786     #endif
787       {
788         // #pragma message ("=== SwapBytes SSE2")
789         PRF(printf("\n=== SwapBytes SSE2\n");)
790         mode = k_SwapBytes_Mode_SSE2;
791       }
792   }
793 #endif // MY_CPU_ARM_OR_ARM64
794   g_SwapBytes_Mode = mode;
795   // g_SwapBytes_Mode = 0; // for debug
796 #endif // FORCE_SWAP_MODE
797   PRF(printf("\n=== SwapBytesPrepare\n");)
798 }
799 
800 #undef PRF
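
/*
Illustrative usage sketch (assumes the z7_SwapBytes* declarations are exposed via
SwapBytes.h; the buffer names and sizes here are only examples):

  #include "SwapBytes.h"

  UInt16 a16[256];
  UInt32 a32[256];
  // ... fill the arrays ...
  z7_SwapBytesPrepare();     // detect CPU features once so the SIMD paths can be used;
                             // without it, a safe default path is taken
  z7_SwapBytes2(a16, 256);   // byte-swap 256 16-bit items in place
  z7_SwapBytes4(a32, 256);   // byte-swap 256 32-bit items in place
*/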
801