// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)
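// Note: the argument order matches _mm256_set_m128i(hi, lo), so m forms the
// upper 128 bits of the result and n forms the lower 128 bits.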

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is aligned to the vector
	 * length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
	{
		m = _mm256_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
	{
		m = _mm256_set1_ps(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(
		float a, float b, float c, float d,
		float e, float f, float g, float h)
	{
		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256_f32[l];
	#else
		union { __m256 m; float f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 zero()
	{
		return vfloat8(_mm256_setzero_ps());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
	{
		return vfloat8(_mm256_broadcast_ss(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
	{
		return vfloat8(_mm256_load_ps(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
	{
		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector data.
	 */
	__m256 m;
};
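
// Example usage (illustrative sketch only; the operators and free functions
// used here are defined later in this file):
//
//     alignas(32) float data[8] { 0, 1, 2, 3, 4, 5, 6, 7 };
//     vfloat8 v = vfloat8::loada(data);
//     v = (v * 2.0f) + vfloat8(1.0f);
//     storea(v, data);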

// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is aligned to the vector
	 * length.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
	{
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
	}

	/**
	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
	{
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using vint8::zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(int a)
	{
		m = _mm256_set1_epi32(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(
		int a, int b, int c, int d,
		int e, int f, int g, int h)
	{
		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256i_i32[l];
	#else
		union { __m256i m; int f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint8 zero()
	{
		return vint8(_mm256_setzero_si256());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
	{
		__m128i a = _mm_set1_epi32(*p);
		return vint8(_mm256_broadcastd_epi32(a));
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
	{
		return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
	{
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint8 lane_id()
	{
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector data.
	 */
	__m256i m;
};

// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
	{
		m = _mm256_castsi256_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
	{
		vint8 mask(a == false ? 0 : -1);
		m = _mm256_castsi256_ps(mask.m);
	}

	/**
	 * @brief The vector data.
	 */
	__m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
	return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}
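
// For example, mask(vint8::lane_id() < vint8(3)) == 0x7, with lanes 0-2 set.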

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
	return mask(a) == 0xFF;
}

// ============================================================================
// vint8 operators and functions
// ============================================================================
/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
	return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
	return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
	return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
	return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
	return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
	return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
	return vint8(_mm256_slli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
	return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
	return vint8(_mm256_srli_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
	return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
	return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmin(r);
	return vmin;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmax(r);
	return vmax;
}
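
// Note: hmin() and hmax() broadcast the reduced value to every lane of the
// returned vector rather than returning a scalar.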

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
{
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
	                               0, 0, 0, 0, 28, 24, 20, 16,
	                               0, 0, 0, 0, 0, 0, 0, 0,
	                               0, 0, 0, 0, 12, 8, 4, 0);
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
	__m128i a0 = _mm256_extracti128_si256(a, 0);
	__m128i a1 = _mm256_extracti128_si256(a, 1);
	__m128i b = _mm_unpacklo_epi32(a0, a1);

	__m256i r = astcenc_mm256_set_m128i(b, b);
	return vint8(r);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
	__m256i condi = _mm256_castps_si256(cond.m);
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}
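
// Note: _mm256_blendv_epi8 selects on the MSB of each byte, which is safe here
// because vmask8 lanes produced by the comparison operators are either
// all-zeros or all-ones across the full 32 bits.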

// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
	return min(a, vfloat8(b));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
	return max(a, vfloat8(b));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - the second operand is returned if either input is NaN
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}
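
// Note: _MM_FROUND_TO_NEAREST_INT rounds ties to even, so for example
// round(vfloat8(2.5f)) returns 2.0f rather than 3.0f.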

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow = _mm_min_ps(vlow, vhigh);

	// Then do a horizontal 4-wide reduction
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf = _mm_movehl_ps(shuf, mins);
	mins = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(mins, mins);
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// Then do a horizontal 4-wide reduction
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf = _mm_movehl_ps(shuf, maxs);
	maxs = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(maxs, maxs);
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return hmax(a).lane<0>();
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}
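
// Note: _mm256_blendv_ps already selects on each lane's sign bit, so on AVX2
// select() and select_msb() share an implementation; the distinction exists
// for other SIMD backends where the two behave differently.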

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	haccumulate(accum, lo);

	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
	accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
	return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
	a = a + vfloat8(0.5f);
	return vint8(_mm256_cvttps_epi32(a.m));
}
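
// Note: the +0.5f bias followed by truncation assumes non-negative inputs;
// it is not a correct round-to-nearest for negative values.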

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
	return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
	return vfloat8(_mm256_castsi256_ps(a.m));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));

	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));

	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
}
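
// Note: the multi-table variants store XOR deltas (t0, t0^t1, t1^t2, t2^t3).
// During lookup each shuffle returns zero for out-of-range byte indices, and
// for in-range indices the deltas XOR-cancel the earlier partial results, so
// the partial lookups below can be combined with a simple XOR.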

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t2.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t3.m, idxx);
	result = _mm256_xor_si256(result, result2);

	return vint8(result);
}

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
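
// Note: each lane packs R in the low byte, so when stored to memory each texel
// reads as the byte sequence R, G, B, A.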

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{
	_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
}

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of ints, in hexadecimal.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
	alignas(32) float v[8];
	storea(a, v);
	printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
	print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED