// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

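// A minimal usage sketch, as a hedged illustration only (not part of the
// library API): because the 8-wide types below mirror the 4-wide signatures,
// VLA code can be written once and compiled against either width. A
// hypothetical scale-and-clamp kernel, assuming count is a multiple of 8 and
// data is 32B aligned, might look like:
//
//     void scale_clamp(float* data, size_t count, float scale)
//     {
//         for (size_t i = 0; i < count; i += 8)
//         {
//             vfloat8 v = vfloat8::loada(data + i);
//             storea(clampzo(v * scale), data + i);
//         }
//     }
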
// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
	{
		m = _mm256_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
	{
		m = _mm256_set1_ps(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(
		float a, float b, float c, float d,
		float e, float f, float g, float h)
	{
		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256_f32[l];
	#else
		union { __m256 m; float f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 zero()
	{
		return vfloat8(_mm256_setzero_ps());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
	{
		return vfloat8(_mm256_broadcast_ss(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
	{
		return vfloat8(_mm256_load_ps(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
	{
		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector data.
	 */
	__m256 m;
};

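// A small illustrative sketch (not part of the library): the 8-way
// constructor and the lane<>() accessor use the same ordering, with the
// first argument landing in lane 0 (the LSB end of the register):
//
//     vfloat8 v(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
//     float first = v.lane<0>();   // 0.0f
//     float last  = v.lane<7>();   // 7.0f
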
// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
	{
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
	}

	/**
	 * @brief Construct from 8 uint8_t values loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
	{
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(int a)
	{
		m = _mm256_set1_epi32(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(
		int a, int b, int c, int d,
		int e, int f, int g, int h)
	{
		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256i_i32[l];
	#else
		union { __m256i m; int f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint8 zero()
	{
		return vint8(_mm256_setzero_si256());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
	{
		__m128i a = _mm_set1_epi32(*p);
		return vint8(_mm256_broadcastd_epi32(a));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
	{
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint8 lane_id()
	{
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector data.
	 */
	__m256i m;
};

// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
	{
		m = _mm256_castsi256_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
	{
		vint8 mask(a == false ? 0 : -1);
		m = _mm256_castsi256_ps(mask.m);
	}

	/**
	 * @brief The vector data.
	 */
	__m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned mask(vmask8 a)
{
	return _mm256_movemask_ps(a.m);
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
	return mask(a) == 0xFF;
}

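// A minimal illustrative sketch (not part of the library): mask() packs one
// bit per lane with lane 0 in bit 0, so individual lanes can be inspected on
// the scalar side when any()/all() are not specific enough:
//
//     bool lane_is_set(vmask8 m, unsigned lane)   // lane assumed in [0, 7]
//     {
//         return ((mask(m) >> lane) & 1u) != 0;
//     }
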
// ============================================================================
// vint8 operators and functions
// ============================================================================
/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
	return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
	return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
	return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
	return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
	return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
	return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
	return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
	return vint8(_mm256_srli_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
	return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
	return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128i(m, m)
	__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
	vint8 vmin(r);
	return vmin;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128i(m, m)
	__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
	vint8 vmax(r);
	return vmax;
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
{
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 28, 24, 20, 16,
	                               0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 12,  8,  4,  0);
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
	__m128i a0 = _mm256_extracti128_si256(a, 0);
	__m128i a1 = _mm256_extracti128_si256(a, 1);
	__m128i b = _mm_unpacklo_epi32(a0, a1);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128i(b, b)
	__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(b), b, 1);
	return vint8(r);
}

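// A small illustrative sketch (not part of the library): pack_low_bytes()
// pairs naturally with store_nbytes() to emit the low byte of each of the
// 8 lanes as 8 contiguous bytes:
//
//     void store_lanes_u8(vint8 v, uint8_t* p)   // lanes assumed in [0, 255]
//     {
//         store_nbytes(pack_low_bytes(v), p);
//     }
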
/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
	__m256i condi = _mm256_castps_si256(cond.m);
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}

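// A minimal illustrative sketch (not part of the library): the vint8
// comparison operators produce a vmask8 that select() then consumes for a
// lane-wise choice, e.g. clamping lanes to an upper limit:
//
//     vint8 clamp_above(vint8 v, vint8 limit)
//     {
//         vmask8 over = v > limit;
//         return select(v, limit, over);   // lanes over the limit become limit
//     }
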
// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - second operand will return if either is NaN
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

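// An illustrative note on the clamp family above (assuming the documented
// behavior of _mm256_max_ps/_mm256_min_ps, which return their second operand
// when either input is NaN): applying max() against the lower bound first
// means a NaN lane collapses to that bound rather than escaping the clamp.
// For example:
//
//     vfloat8 v(std::numeric_limits<float>::quiet_NaN());
//     vfloat8 c = clampzo(v);   // every lane of c is 0.0f
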
/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow  = _mm_min_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf        = _mm_movehl_ps(shuf, mins);
	mins        = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256 r = _mm256_set_m128(mins, mins)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf        = _mm_movehl_ps(shuf, maxs);
	maxs        = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256 r = _mm256_set_m128(maxs, maxs)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return hmax(a).lane<0>();
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	haccumulate(accum, lo);

	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
	accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

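// A minimal illustrative sketch (not part of the library), assuming the
// 4-wide vfloat4::zero() and hadd_s(vfloat4) from the companion header and
// a count that is a multiple of 8: folding into a vfloat4 accumulator keeps
// the result bit-identical to a 4-wide build, which is what the "invariant"
// overloads above are for.
//
//     float sum_invariant(const float* data, size_t count)
//     {
//         vfloat4 accum = vfloat4::zero();
//         for (size_t i = 0; i < count; i += 8)
//         {
//             haccumulate(accum, vfloat8(data + i));
//         }
//         return hadd_s(accum);
//     }
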
/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
	return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
	return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
	return vfloat8(_mm256_castsi256_ps(a.m));
}

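// A small illustrative sketch of the bit-pattern round trip described above
// (not part of the library, and assuming IEEE 754 binary32 lane layout):
// clearing the sign bit of every lane reproduces abs() defined earlier.
//
//     vfloat8 abs_via_bits(vfloat8 a)
//     {
//         vint8 bits = float_as_int(a) & vint8(0x7fffffff);
//         return int_as_float(bits);
//     }
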
/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(ASTCENC_VECALIGN) int v[8];
	storea(a, v);
	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
	alignas(ASTCENC_VECALIGN) float v[8];
	storea(a, v);
	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
	print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED