// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>
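
// Illustrative sketch only (not part of the API): the point of the identical
// function signatures is that VLA code can be written against width-agnostic
// aliases and compiled against this 8-wide backend. The alias names used below
// (vfloat, ASTCENC_SIMD_WIDTH) are assumed to come from the wrapper header,
// astcenc_vecmathlib.h; they are not defined by this file.
//
//     // Sum an array whose length is a multiple of the vector width
//     float sum_vla(const float* data, unsigned int count)
//     {
//         vfloat accum = vfloat::zero();
//         for (unsigned int i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
//         {
//             accum = accum + vfloat(data + i);
//         }
//         return hadd_s(accum);
//     }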

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
	{
		m = _mm256_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
	{
		m = _mm256_set1_ps(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(
		float a, float b, float c, float d,
		float e, float f, float g, float h)
	{
		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256_f32[l];
	#else
		union { __m256 m; float f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 zero()
	{
		return vfloat8(_mm256_setzero_ps());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
	{
		return vfloat8(_mm256_broadcast_ss(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
	{
		return vfloat8(_mm256_load_ps(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
	{
		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector ...
	 */
	__m256 m;
};
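
// Usage sketch (illustrative only, not part of the API): constructing and
// inspecting vfloat8 values. Note that loada() requires 32B alignment, and
// lane<N>() takes the lane index as a template parameter.
//
//     alignas(32) float buffer[8] { 0, 1, 2, 3, 4, 5, 6, 7 };
//     vfloat8 va = vfloat8::loada(buffer);     // aligned load
//     vfloat8 vb(buffer);                      // unaligned load
//     vfloat8 vc = vfloat8::lane_id();         // 0, 1, 2, ... 7
//     float f0 = va.lane<0>();                 // scalar read of lane 0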

// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
	{
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
	}

	/**
	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
	{
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using vint8::zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(int a)
	{
		m = _mm256_set1_epi32(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(
		int a, int b, int c, int d,
		int e, int f, int g, int h)
	{
		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256i_i32[l];
	#else
		union { __m256i m; int f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint8 zero()
	{
		return vint8(_mm256_setzero_si256());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
	{
		__m128i a = _mm_set1_epi32(*p);
		return vint8(_mm256_broadcastd_epi32(a));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
	{
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint8 lane_id()
	{
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector ...
	 */
	__m256i m;
};

// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
	{
		m = _mm256_castsi256_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
	{
		vint8 mask(a == false ? 0 : -1);
		m = _mm256_castsi256_ps(mask.m);
	}

	/**
	 * @brief The vector ...
	 */
	__m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned mask(vmask8 a)
{
	return _mm256_movemask_ps(a.m);
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
	return mask(a) == 0xFF;
}
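
// Usage sketch (illustrative only): comparisons produce a vmask8, which can be
// reduced with mask(), any(), and all(). Bit N of the mask() result mirrors
// lane N of the predicate.
//
//     vint8 va = vint8::lane_id();             // 0, 1, 2, ... 7
//     vmask8 m = va > vint8(3);                // lanes 4..7 set
//     unsigned int bits = mask(m);             // 0xF0
//     bool some = any(m);                      // true
//     bool every = all(m);                     // false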

// ============================================================================
// vint8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
	return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
	return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
	return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
	return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
	return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
	return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
	return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
	return vint8(_mm256_srli_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
	return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
	return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128i(m, m)
	__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
	vint8 vmin(r);
	return vmin;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128i(m, m)
	__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
	vint8 vmax(r);
	return vmax;
}
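
// Note (illustrative only): hmin() and hmax() return the reduction broadcast
// across every lane rather than a scalar, so the result can be fed straight
// back into further vector arithmetic.
//
//     vint8 v(3, 1, 4, 1, 5, 9, 2, 6);
//     vint8 lo = hmin(v);                      // all lanes contain 1
//     vint8 hi = hmax(v);                      // all lanes contain 9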

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}
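
// Usage sketch (illustrative only): gatheri() treats each lane of @c indices
// as an element index into @c base (the intrinsic's scale of 4 converts the
// index into a byte offset), so lane i of the result is base[indices[i]].
//
//     static const int table[16] { 0, 10, 20, 30, 40, 50, 60, 70,
//                                  80, 90, 100, 110, 120, 130, 140, 150 };
//     vint8 idx(1, 3, 5, 7, 9, 11, 13, 15);
//     vint8 odd = gatheri(table, idx);         // 10, 30, 50, ... 150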

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
{
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 28, 24, 20, 16,
	                               0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 12,  8,  4,  0);
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
	__m128i a0 = _mm256_extracti128_si256(a, 0);
	__m128i a1 = _mm256_extracti128_si256(a, 1);
	__m128i b = _mm_unpacklo_epi32(a0, a1);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128i(b, b)
	__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(b), b, 1);
	return vint8(r);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
	__m256i condi = _mm256_castps_si256(cond.m);
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}
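
// Usage sketch (illustrative only): select() is typically paired with a
// comparison to implement branch-free per-lane conditionals.
//
//     vint8 va = vint8::lane_id();             // 0, 1, 2, ... 7
//     vint8 vb(100);
//     vmask8 m = va < vint8(4);                // lanes 0..3 set
//     vint8 r = select(vb, va, m);             // 0, 1, 2, 3, 100, 100, 100, 100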

// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - second operand will return if either is NaN
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}
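
// Note (illustrative only): the operand order above is load-bearing. Because
// _mm256_max_ps and _mm256_min_ps return the second operand when either input
// is NaN, a NaN lane in @c a is first replaced by @c min and then clamped
// against @c max, giving the documented "NaN maps to min" behavior.
//
//     vfloat8 v(std::numeric_limits<float>::quiet_NaN());
//     vfloat8 r = clamp(0.0f, 1.0f, v);        // all lanes contain 0.0f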

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow = _mm_min_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf = _mm_movehl_ps(shuf, mins);
	mins = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256 r = _mm256_set_m128(mins, mins)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf = _mm_movehl_ps(shuf, maxs);
	maxs = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256 r = _mm256_set_m128(maxs, maxs)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return hmax(a).lane<0>();
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}
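
// Note (illustrative only): the sum is formed as hadd_s(lo) + hadd_s(hi), so
// each 128-bit half is reduced exactly as the 4-wide backend would reduce it
// and the two partial sums are then added. This is what keeps the 8-wide
// result bit-identical to processing the same data with 4-wide vectors.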

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	haccumulate(accum, lo);

	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
	accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
	return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
	return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
	return vfloat8(_mm256_castsi256_ps(a.m));
}
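
// Usage sketch (illustrative only): a typical use of the bit-pattern casts is
// IEEE 754 bit manipulation, e.g. clearing the sign bit to get an absolute
// value without a floating-point operation.
//
//     vfloat8 v(-1.5f);
//     vint8 bits = float_as_int(v);            // reinterpret, no conversion
//     bits = bits & vint8(0x7FFFFFFF);         // clear the sign bit
//     vfloat8 absv = int_as_float(bits);       // all lanes contain 1.5f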

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(ASTCENC_VECALIGN) int v[8];
	storea(a, v);
	printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
	alignas(ASTCENC_VECALIGN) float v[8];
	storea(a, v);
	printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
	print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED