1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2020-2021 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Generic 4x32-bit vector functions.
20 *
21 * This module implements generic 4-wide vector functions that are valid for
22 * all instruction sets, typically implemented using lower level 4-wide
23 * operations that are ISA-specific.
24 */
25
26 #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
27 #define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
28
29 #ifndef ASTCENC_SIMD_INLINE
30 #error "Include astcenc_vecmathlib.h, do not include directly"
31 #endif
32
33 #include <cstdio>
34
35 // ============================================================================
36 // vmask4 operators and functions
37 // ============================================================================
38
39 /**
40 * @brief True if any lanes are enabled, false otherwise.
41 */
any(vmask4 a)42 ASTCENC_SIMD_INLINE bool any(vmask4 a)
43 {
44 return mask(a) != 0;
45 }
46
47 /**
48 * @brief True if all lanes are enabled, false otherwise.
49 */
all(vmask4 a)50 ASTCENC_SIMD_INLINE bool all(vmask4 a)
51 {
52 return mask(a) == 0xF;
53 }
54
55 // ============================================================================
56 // vint4 operators and functions
57 // ============================================================================
58
59 /**
60 * @brief Overload: vector by scalar addition.
61 */
62 ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
63 {
64 return a + vint4(b);
65 }
66
67 /**
68 * @brief Overload: vector by vector incremental addition.
69 */
70 ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
71 {
72 a = a + b;
73 return a;
74 }
75
76 /**
77 * @brief Overload: vector by scalar subtraction.
78 */
79 ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
80 {
81 return a - vint4(b);
82 }
83
84 /**
85 * @brief Overload: vector by scalar multiplication.
86 */
87 ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
88 {
89 return a * vint4(b);
90 }
91
92 /**
93 * @brief Overload: vector by scalar bitwise or.
94 */
95 ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
96 {
97 return a | vint4(b);
98 }
99
100 /**
101 * @brief Overload: vector by scalar bitwise and.
102 */
103 ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
104 {
105 return a & vint4(b);
106 }
107
108 /**
109 * @brief Overload: vector by scalar bitwise xor.
110 */
111 ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
112 {
113 return a ^ vint4(b);
114 }
115
116 /**
117 * @brief Return the clamped value between min and max.
118 */
clamp(int minv,int maxv,vint4 a)119 ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
120 {
121 return min(max(a, vint4(minv)), vint4(maxv));
122 }
123
124 /**
125 * @brief Return the horizontal sum of RGB vector lanes as a scalar.
126 */
hadd_rgb_s(vint4 a)127 ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
128 {
129 return a.lane<0>() + a.lane<1>() + a.lane<2>();
130 }
131
132 // ============================================================================
133 // vfloat4 operators and functions
134 // ============================================================================
135
136 /**
137 * @brief Overload: vector by vector incremental addition.
138 */
139 ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
140 {
141 a = a + b;
142 return a;
143 }
144
145 /**
146 * @brief Overload: vector by scalar addition.
147 */
148 ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
149 {
150 return a + vfloat4(b);
151 }
152
153 /**
154 * @brief Overload: vector by scalar subtraction.
155 */
156 ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
157 {
158 return a - vfloat4(b);
159 }
160
161 /**
162 * @brief Overload: vector by scalar multiplication.
163 */
164 ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
165 {
166 return a * vfloat4(b);
167 }
168
169 /**
170 * @brief Overload: scalar by vector multiplication.
171 */
172 ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
173 {
174 return vfloat4(a) * b;
175 }
176
177 /**
178 * @brief Overload: vector by scalar division.
179 */
180 ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
181 {
182 return a / vfloat4(b);
183 }
184
185 /**
186 * @brief Overload: scalar by vector division.
187 */
188 ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
189 {
190 return vfloat4(a) / b;
191 }
192
193 /**
194 * @brief Return the min vector of a vector and a scalar.
195 *
196 * If either lane value is NaN, @c b will be returned for that lane.
197 */
min(vfloat4 a,float b)198 ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
199 {
200 return min(a, vfloat4(b));
201 }
202
203 /**
204 * @brief Return the max vector of a vector and a scalar.
205 *
206 * If either lane value is NaN, @c b will be returned for that lane.
207 */
max(vfloat4 a,float b)208 ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
209 {
210 return max(a, vfloat4(b));
211 }
212
213 /**
214 * @brief Return the clamped value between min and max.
215 *
216 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
217 * then @c min will be returned for that lane.
218 */
clamp(float minv,float maxv,vfloat4 a)219 ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
220 {
221 // Do not reorder - second operand will return if either is NaN
222 return min(max(a, minv), maxv);
223 }
224
225 /**
226 * @brief Return the clamped value between 0.0f and max.
227 *
228 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
229 * be returned for that lane.
230 */
clampz(float maxv,vfloat4 a)231 ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
232 {
233 // Do not reorder - second operand will return if either is NaN
234 return min(max(a, vfloat4::zero()), maxv);
235 }
236
237 /**
238 * @brief Return the clamped value between 0.0f and 1.0f.
239 *
240 * If @c a is NaN then zero will be returned for that lane.
241 */
clampzo(vfloat4 a)242 ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
243 {
244 // Do not reorder - second operand will return if either is NaN
245 return min(max(a, vfloat4::zero()), 1.0f);
246 }
247
248 /**
249 * @brief Return the horizontal minimum of a vector.
250 */
hmin_s(vfloat4 a)251 ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
252 {
253 return hmin(a).lane<0>();
254 }
255
256 /**
257 * @brief Return the horizontal min of RGB vector lanes as a scalar.
258 */
hmin_rgb_s(vfloat4 a)259 ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
260 {
261 a.set_lane<3>(a.lane<0>());
262 return hmin_s(a);
263 }
264
265 /**
266 * @brief Return the horizontal maximum of a vector.
267 */
hmax_s(vfloat4 a)268 ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
269 {
270 return hmax(a).lane<0>();
271 }
272
273 /**
274 * @brief Accumulate lane-wise sums for a vector.
275 */
haccumulate(vfloat4 & accum,vfloat4 a)276 ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
277 {
278 accum = accum + a;
279 }
280
281 /**
282 * @brief Accumulate lane-wise sums for a masked vector.
283 */
haccumulate(vfloat4 & accum,vfloat4 a,vmask4 m)284 ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
285 {
286 a = select(vfloat4::zero(), a, m);
287 haccumulate(accum, a);
288 }
289
290 /**
291 * @brief Return the horizontal sum of RGB vector lanes as a scalar.
292 */
hadd_rgb_s(vfloat4 a)293 ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
294 {
295 return a.lane<0>() + a.lane<1>() + a.lane<2>();
296 }
297
298 #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
299
300 /**
301 * @brief Return the dot product for the full 4 lanes, returning scalar.
302 */
dot_s(vfloat4 a,vfloat4 b)303 ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
304 {
305 vfloat4 m = a * b;
306 return hadd_s(m);
307 }
308
309 /**
310 * @brief Return the dot product for the full 4 lanes, returning vector.
311 */
dot(vfloat4 a,vfloat4 b)312 ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
313 {
314 vfloat4 m = a * b;
315 return vfloat4(hadd_s(m));
316 }
317
318 /**
319 * @brief Return the dot product for the bottom 3 lanes, returning scalar.
320 */
dot3_s(vfloat4 a,vfloat4 b)321 ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
322 {
323 vfloat4 m = a * b;
324 return hadd_rgb_s(m);
325 }
326
327 /**
328 * @brief Return the dot product for the bottom 3 lanes, returning vector.
329 */
dot3(vfloat4 a,vfloat4 b)330 ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
331 {
332 vfloat4 m = a * b;
333 float d3 = hadd_rgb_s(m);
334 return vfloat4(d3, d3, d3, 0.0f);
335 }
336
337 #endif
338
339 #if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
340
/**
 * @brief Population bit count.
 *
 * Portable fallback that counts bits with parallel partial sums: 2-bit
 * fields, then 4-bit fields, then bytes, which are finally summed into the
 * top byte by a multiply.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
static inline int popcount(uint64_t v)
{
	const uint64_t pair_mask   = 0x5555555555555555ULL;
	const uint64_t nibble_mask = 0x3333333333333333ULL;
	const uint64_t byte_mask   = 0x0F0F0F0F0F0F0F0FULL;
	const uint64_t byte_ones   = 0x0101010101010101ULL;

	// Each 2-bit field now holds the count of its own two bits
	v = v - ((v >> 1) & pair_mask);

	// Each 4-bit field now holds the count of its own four bits
	v = (v & nibble_mask) + ((v >> 2) & nibble_mask);

	// Each byte now holds the count of its own eight bits
	v = (v + (v >> 4)) & byte_mask;

	// The multiply sums every byte into the most significant byte
	const uint64_t total = (v * byte_ones) >> 56;
	return static_cast<int>(total);
}
361
362 #endif
363
364 /**
365 * @brief Apply signed bit transfer.
366 *
367 * @param input0 The first encoded endpoint.
368 * @param input1 The second encoded endpoint.
369 */
bit_transfer_signed(vint4 & input0,vint4 & input1)370 static ASTCENC_SIMD_INLINE void bit_transfer_signed(
371 vint4& input0,
372 vint4& input1
373 ) {
374 input1 = lsr<1>(input1) | (input0 & 0x80);
375 input0 = lsr<1>(input0) & 0x3F;
376
377 vmask4 mask = (input0 & 0x20) != vint4::zero();
378 input0 = select(input0, input0 - 0x40, mask);
379 }
380
381 /**
382 * @brief Debug function to print a vector of ints.
383 */
print(vint4 a)384 ASTCENC_SIMD_INLINE void print(vint4 a)
385 {
386 alignas(16) int v[4];
387 storea(a, v);
388 printf("v4_i32:\n %8d %8d %8d %8d\n",
389 v[0], v[1], v[2], v[3]);
390 }
391
392 /**
393 * @brief Debug function to print a vector of ints.
394 */
printx(vint4 a)395 ASTCENC_SIMD_INLINE void printx(vint4 a)
396 {
397 alignas(16) int v[4];
398 storea(a, v);
399 printf("v4_i32:\n %08x %08x %08x %08x\n",
400 v[0], v[1], v[2], v[3]);
401 }
402
403 /**
404 * @brief Debug function to print a vector of floats.
405 */
print(vfloat4 a)406 ASTCENC_SIMD_INLINE void print(vfloat4 a)
407 {
408 alignas(16) float v[4];
409 storea(a, v);
410 printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
411 static_cast<double>(v[0]), static_cast<double>(v[1]),
412 static_cast<double>(v[2]), static_cast<double>(v[3]));
413 }
414
415 /**
416 * @brief Debug function to print a vector of masks.
417 */
print(vmask4 a)418 ASTCENC_SIMD_INLINE void print(vmask4 a)
419 {
420 print(select(vint4(0), vint4(1), a));
421 }
422
423 #endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
424