// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)
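//
// Usage note (illustrative, not part of the library API): the first argument
// lands in the high 128 bits and the second in the low 128 bits, mirroring
// _mm256_set_m128i(hi, lo). A minimal sketch, assuming two existing __m128i
// values with the hypothetical names hi128 and lo128:
//
//     __m256i combined = astcenc_mm256_set_m128i(hi128, lo128);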

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
	{
		m = _mm256_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
	{
		m = _mm256_set1_ps(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(
		float a, float b, float c, float d,
		float e, float f, float g, float h)
	{
		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256_f32[l];
	#else
		union { __m256 m; float f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 zero()
	{
		return vfloat8(_mm256_setzero_ps());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
	{
		return vfloat8(_mm256_broadcast_ss(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
	{
		return vfloat8(_mm256_load_ps(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
	{
		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector data.
	 */
	__m256 m;
};
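// Usage sketch (illustrative only, not part of the library interface), showing
// construction and per-lane access; the variable names are hypothetical:
//
//     vfloat8 v(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
//     float lo = v.lane<0>();             // 1.0f, lane 0 is the register LSB
//     vfloat8 ids = vfloat8::lane_id();   // 0.0f, 1.0f, ... 7.0f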

// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
	{
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
	}

	/**
	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
	{
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using vint8::zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(int a)
	{
		m = _mm256_set1_epi32(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(
		int a, int b, int c, int d,
		int e, int f, int g, int h)
	{
		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256i_i32[l];
	#else
		union { __m256i m; int f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint8 zero()
	{
		return vint8(_mm256_setzero_si256());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
	{
		__m128i a = _mm_set1_epi32(*p);
		return vint8(_mm256_broadcastd_epi32(a));
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
	{
		return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
	{
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint8 lane_id()
	{
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector data.
	 */
	__m256i m;
};
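// Usage sketch (illustrative only, not part of the library interface); the
// byte array below is a hypothetical example input:
//
//     const uint8_t bytes[8] { 0, 1, 2, 3, 4, 5, 6, 7 };
//     vint8 widened(bytes);          // each byte zero-extended to a 32-bit lane
//     int last = widened.lane<7>();  // 7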

// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
	{
		m = _mm256_castsi256_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
	{
		vint8 mask(a == false ? 0 : -1);
		m = _mm256_castsi256_ps(mask.m);
	}

	/**
	 * @brief The vector data.
	 */
	__m256 m;
};
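// Implementation note (an observation, not normative documentation): masks
// produced by the bool constructor and by the comparison operators hold either
// all-zero or all-one bits in each lane, so vmask8(true) is eight lanes of
// 0xFFFFFFFF and vmask8(false) is eight lanes of zero. For example:
//
//     vmask8 m8(true);
//     bool covered = all(m8);   // true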

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
	return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
	return mask(a) == 0xFF;
}
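// Usage sketch (illustrative only): mask() packs one bit per lane, with lane 0
// in bit 0, so comparing the lane IDs against a hypothetical threshold of 3
// enables the low three lanes:
//
//     vmask8 lanes_lt_3 = vint8::lane_id() < vint8(3);
//     unsigned int bits = mask(lanes_lt_3);   // 0b00000111 == 0x7
//     bool some = any(lanes_lt_3);            // true
//     bool every = all(lanes_lt_3);           // false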

// ============================================================================
// vint8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
	return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
	return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
	return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
	return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
	return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
	return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
	return vint8(_mm256_slli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
	return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
	return vint8(_mm256_srli_epi32(a.m, s));
}
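// Usage sketch (illustrative only): asr() preserves the sign bit while lsr()
// shifts in zero bits, so for a hypothetical all-negative input:
//
//     vint8 neg(-2);
//     vint8 arith = asr<1>(neg);    // every lane is -1
//     vint8 logical = lsr<1>(neg);  // every lane is 0x7FFFFFFF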

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
	return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
	return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmin(r);
	return vmin;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmax(r);
	return vmax;
}
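// Implementation note (an observation on the functions above): the horizontal
// reductions return the scalar result broadcast across every lane rather than
// a single int, which keeps the result usable in further vector arithmetic.
// For example (illustrative only):
//
//     vint8 v = vint8::lane_id();   // 0, 1, 2, ... 7
//     vint8 lo = hmin(v);           // every lane is 0
//     vint8 hi = hmax(v);           // every lane is 7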

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// _mm_storeu_si64(p, _mm256_extracti128_si256(a.m, 0))
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
{
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 28, 24, 20, 16,
	                               0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 12,  8,  4,  0);
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
	__m128i a0 = _mm256_extracti128_si256(a, 0);
	__m128i a1 = _mm256_extracti128_si256(a, 1);
	__m128i b = _mm_unpacklo_epi32(a0, a1);

	__m256i r = astcenc_mm256_set_m128i(b, b);
	return vint8(r);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
	__m256i condi = _mm256_castps_si256(cond.m);
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}
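// Implementation note (an observation, not normative documentation):
// _mm256_blendv_epi8 selects per byte based on each byte's MSB, which matches
// the per-lane behaviour expected here because vmask8 lanes produced by the
// comparison operators are all-zero or all-one bit patterns.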

// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
	return min(a, vfloat8(b));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
	return max(a, vfloat8(b));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - second operand will return if either is NaN
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}
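// Implementation note (an observation, not normative documentation):
// _MM_FROUND_TO_NEAREST_INT uses round-half-to-even, so for example
// round(vfloat8(0.5f)) yields 0.0f and round(vfloat8(1.5f)) yields 2.0f in
// every lane.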

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow = _mm_min_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf = _mm_movehl_ps(shuf, mins);
	mins = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(mins, mins)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf = _mm_movehl_ps(shuf, maxs);
	maxs = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(maxs, maxs)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return hmax(a).lane<0>();
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}
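// Implementation note (an observation, not normative documentation): on AVX2
// both select() and select_msb() lower to _mm256_blendv_ps, which only tests
// the sign bit of each mask lane; the two entry points are kept separate,
// presumably so that other SIMD backends can specialise the MSB-only variant.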

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	haccumulate(accum, lo);

	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
	accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
	return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
	a = a + vfloat8(0.5f);
	return vint8(_mm256_cvttps_epi32(a.m));
}
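// Implementation note (an observation, not normative documentation): the +0.5f
// bias followed by truncation implements round-to-nearest for non-negative
// inputs only; for example float_to_int_rtn(vfloat8(1.7f)) gives 2 per lane,
// while a negative input such as -1.7f would truncate to -1 rather than -2.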

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
	return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
	return vfloat8(_mm256_castsi256_ps(a.m));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));

	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));

	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
}
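// Implementation note (an observation on the vtable scheme above and the
// lookups below): tables after the first are stored pre-XORed with their
// predecessor (t0^t1, t1^t2, ...). _mm256_shuffle_epi8 only uses the low four
// index bits when the index byte MSB is clear, so every table at or before the
// one holding a given index returns the same byte position; the pre-XOR makes
// those contributions telescope so only the target table's byte survives, and
// once the repeatedly decremented index goes negative the remaining shuffles
// return zero and leave the result untouched.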

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t2.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t3.m, idxx);
	result = _mm256_xor_si256(result, result2);

	return vint8(result);
}

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
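// Usage sketch (illustrative only): packing per-channel bytes into one
// 0xAABBGGRR word per lane, with red in the least significant byte:
//
//     vint8 texel = interleave_rgba8(vint8(0x11), vint8(0x22), vint8(0x33), vint8(0x44));
//     // every lane is 0x44332211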

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of the vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{
	_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
}

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of ints, in hexadecimal.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
	alignas(32) float v[8];
	storea(a, v);
	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
	print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED