1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 
24 #ifndef __SWR_SIMDINTRIN_H__
25 #define __SWR_SIMDINTRIN_H__
26 
27 #include "os.h"
28 
29 #include <cassert>
30 
31 #include <emmintrin.h>
32 #include <immintrin.h>
33 #include <xmmintrin.h>
34 
35 #if KNOB_SIMD_WIDTH == 8
36 typedef __m256 simdscalar;
37 typedef __m256i simdscalari;
38 typedef uint8_t simdmask;
39 #else
40 #error Unsupported vector width
41 #endif
42 
43 // simd vector
44 OSALIGNSIMD(union) simdvector
45 {
46     simdscalar  v[4];
47     struct
48     {
49         simdscalar x, y, z, w;
50     };
51 
52     simdscalar& operator[] (const int i) { return v[i]; }
53     const simdscalar& operator[] (const int i) const { return v[i]; }
54 };
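// Usage sketch (illustrative): a simdvector carries one float per SIMD lane for
// each of x/y/z/w, so eight 4-component vectors are processed at once when
// KNOB_SIMD_WIDTH == 8. The named members alias the indexed storage:
//
//     simdvector pos;
//     pos.x  = _simd_set1_ps(1.0f);    // same register as pos[0]
//     pos[3] = _simd_setzero_ps();     // same register as pos.w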
55 
56 #if KNOB_SIMD_WIDTH == 8
57 #define _simd128_maskstore_ps _mm_maskstore_ps
58 #define _simd_load_ps _mm256_load_ps
59 #define _simd_load1_ps _mm256_broadcast_ss
60 #define _simd_loadu_ps _mm256_loadu_ps
61 #define _simd_setzero_ps _mm256_setzero_ps
62 #define _simd_set1_ps   _mm256_set1_ps
63 #define _simd_blend_ps  _mm256_blend_ps
64 #define _simd_blendv_ps _mm256_blendv_ps
65 #define _simd_store_ps _mm256_store_ps
66 #define _simd_mul_ps _mm256_mul_ps
67 #define _simd_add_ps _mm256_add_ps
68 #define _simd_sub_ps _mm256_sub_ps
69 #define _simd_rsqrt_ps _mm256_rsqrt_ps
70 #define _simd_min_ps _mm256_min_ps
71 #define _simd_max_ps _mm256_max_ps
72 #define _simd_movemask_ps _mm256_movemask_ps
73 #define _simd_cvtps_epi32 _mm256_cvtps_epi32
74 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
75 #define _simd_cvtepi32_ps _mm256_cvtepi32_ps
76 #define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
77 #define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
78 #define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
79 #define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
80 #define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
81 #define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
82 #define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
83 #define _simd_and_ps _mm256_and_ps
84 #define _simd_or_ps _mm256_or_ps
85 
86 #define _simd_rcp_ps _mm256_rcp_ps
87 #define _simd_div_ps _mm256_div_ps
88 #define _simd_castsi_ps _mm256_castsi256_ps
89 #define _simd_andnot_ps _mm256_andnot_ps
90 #define _simd_round_ps _mm256_round_ps
91 #define _simd_castpd_ps _mm256_castpd_ps
92 #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
93 #define _simd_stream_ps _mm256_stream_ps
94 
95 #define _simd_load_sd _mm256_load_sd
96 #define _simd_movemask_pd _mm256_movemask_pd
97 #define _simd_castsi_pd _mm256_castsi256_pd
98 
99 // emulated integer simd
100 #define SIMD_EMU_EPI(func, intrin) \
101 INLINE \
102 __m256i func(__m256i a, __m256i b)\
103 {\
104     __m128i aHi = _mm256_extractf128_si256(a, 1);\
105     __m128i bHi = _mm256_extractf128_si256(b, 1);\
106     __m128i aLo = _mm256_castsi256_si128(a);\
107     __m128i bLo = _mm256_castsi256_si128(b);\
108 \
109     __m128i subLo = intrin(aLo, bLo);\
110     __m128i subHi = intrin(aHi, bHi);\
111 \
112     __m256i result = _mm256_castsi128_si256(subLo);\
113             result = _mm256_insertf128_si256(result, subHi, 1);\
114 \
115     return result;\
116 }
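// SIMD_EMU_EPI synthesizes a 256-bit integer operation from its 128-bit SSE
// counterpart for AVX targets that lack 256-bit integer ALU instructions: split
// both operands into 128-bit halves, apply the SSE intrinsic per half, and
// reassemble. Illustrative expansion:
//
//     SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
//     // ...defines _simdemu_add_epi32(a, b), so that
//     __m256i sum = _simdemu_add_epi32(vA, vB);   // lane-wise 32-bit adds,
//                                                 // matching _mm256_add_epi32 on AVX2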
117 
118 #if (KNOB_ARCH == KNOB_ARCH_AVX)
119 INLINE
120 __m256 _simdemu_permute_ps(__m256 a, __m256i b)
121 {
122     __m128 aHi = _mm256_extractf128_ps(a, 1);
123     __m128i bHi = _mm256_extractf128_si256(b, 1);
124     __m128 aLo = _mm256_castps256_ps128(a);
125     __m128i bLo = _mm256_castsi256_si128(b);
126 
127     __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
128     __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
129     __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
130     __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
131 
132     indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
133     resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
134     resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
135     __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
136 
137     __m256 result = _mm256_castps128_ps256(blendLowRes);
138     result = _mm256_insertf128_ps(result, blendHiRes, 1);
139 
140     return result;
141 }
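// Emulation note: AVX lacks a cross-lane _mm256_permutevar8x32_ps, so the
// routine above permutes within each 128-bit half using the low two index bits
// and then blends between the halves wherever the lane index selects elements
// 4-7. Illustrative use, assuming indices in the range [0, 7]:
//
//     __m256i rev = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
//     __m256  r   = _simdemu_permute_ps(a, rev);   // reverses the eight lanes of a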
142 
143 INLINE
144 __m256i _simdemu_permute_epi32(__m256i a, __m256i b)
145 {
146     return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
147 }
148 
149 INLINE
150 __m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
151 {
152     int32_t aHi, aLow, countHi, countLow;
153     __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
154     __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
155     __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
156     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
157 
158     aHi = _mm_extract_epi32(vAHi, 0);
159     countHi = _mm_extract_epi32(vCountHi, 0);
160     aHi >>= countHi;
161     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
162 
163     aLow = _mm_extract_epi32(vALow, 0);
164     countLow = _mm_extract_epi32(vCountLow, 0);
165     aLow >>= countLow;
166     vALow = _mm_insert_epi32(vALow, aLow, 0);
167 
168     aHi = _mm_extract_epi32(vAHi, 1);
169     countHi = _mm_extract_epi32(vCountHi, 1);
170     aHi >>= countHi;
171     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
172 
173     aLow = _mm_extract_epi32(vALow, 1);
174     countLow = _mm_extract_epi32(vCountLow, 1);
175     aLow >>= countLow;
176     vALow = _mm_insert_epi32(vALow, aLow, 1);
177 
178     aHi = _mm_extract_epi32(vAHi, 2);
179     countHi = _mm_extract_epi32(vCountHi, 2);
180     aHi >>= countHi;
181     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
182 
183     aLow = _mm_extract_epi32(vALow, 2);
184     countLow = _mm_extract_epi32(vCountLow, 2);
185     aLow >>= countLow;
186     vALow = _mm_insert_epi32(vALow, aLow, 2);
187 
188     aHi = _mm_extract_epi32(vAHi, 3);
189     countHi = _mm_extract_epi32(vCountHi, 3);
190     aHi >>= countHi;
191     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
192 
193     aLow = _mm_extract_epi32(vALow, 3);
194     countLow = _mm_extract_epi32(vCountLow, 3);
195     aLow >>= countLow;
196     vALow = _mm_insert_epi32(vALow, aLow, 3);
197 
198     __m256i ret = _mm256_set1_epi32(0);
199     ret = _mm256_insertf128_si256(ret, vAHi, 1);
200     ret = _mm256_insertf128_si256(ret, vALow, 0);
201     return ret;
202 }
203 
204 
205 INLINE
206 __m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
207 {
208     int32_t aHi, aLow, countHi, countLow;
209     __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
210     __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
211     __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
212     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
213 
214     aHi = _mm_extract_epi32(vAHi, 0);
215     countHi = _mm_extract_epi32(vCountHi, 0);
216     aHi <<= countHi;
217     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
218 
219     aLow = _mm_extract_epi32(vALow, 0);
220     countLow = _mm_extract_epi32(vCountLow, 0);
221     aLow <<= countLow;
222     vALow = _mm_insert_epi32(vALow, aLow, 0);
223 
224     aHi = _mm_extract_epi32(vAHi, 1);
225     countHi = _mm_extract_epi32(vCountHi, 1);
226     aHi <<= countHi;
227     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
228 
229     aLow = _mm_extract_epi32(vALow, 1);
230     countLow = _mm_extract_epi32(vCountLow, 1);
231     aLow <<= countLow;
232     vALow = _mm_insert_epi32(vALow, aLow, 1);
233 
234     aHi = _mm_extract_epi32(vAHi, 2);
235     countHi = _mm_extract_epi32(vCountHi, 2);
236     aHi <<= countHi;
237     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
238 
239     aLow = _mm_extract_epi32(vALow, 2);
240     countLow = _mm_extract_epi32(vCountLow, 2);
241     aLow <<= countLow;
242     vALow = _mm_insert_epi32(vALow, aLow, 2);
243 
244     aHi = _mm_extract_epi32(vAHi, 3);
245     countHi = _mm_extract_epi32(vCountHi, 3);
246     aHi <<= countHi;
247     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
248 
249     aLow = _mm_extract_epi32(vALow, 3);
250     countLow = _mm_extract_epi32(vCountLow, 3);
251     aLow <<= countLow;
252     vALow = _mm_insert_epi32(vALow, aLow, 3);
253 
254     __m256i ret = _mm256_set1_epi32(0);
255     ret = _mm256_insertf128_si256(ret, vAHi, 1);
256     ret = _mm256_insertf128_si256(ret, vALow, 0);
257     return ret;
258 }
259 
260 #define _simd_mul_epi32 _simdemu_mul_epi32
261 #define _simd_mullo_epi32 _simdemu_mullo_epi32
262 #define _simd_sub_epi32 _simdemu_sub_epi32
263 #define _simd_sub_epi64 _simdemu_sub_epi64
264 #define _simd_min_epi32 _simdemu_min_epi32
265 #define _simd_min_epu32 _simdemu_min_epu32
266 #define _simd_max_epi32 _simdemu_max_epi32
267 #define _simd_max_epu32 _simdemu_max_epu32
268 #define _simd_add_epi32 _simdemu_add_epi32
269 #define _simd_and_si _simdemu_and_si
270 #define _simd_andnot_si _simdemu_andnot_si
271 #define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
272 #define _simd_cmplt_epi32 _simdemu_cmplt_epi32
273 #define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
274 #define _simd_or_si _simdemu_or_si
275 #define _simd_xor_si _simdemu_xor_si
276 #define _simd_castps_si _mm256_castps_si256
277 #define _simd_adds_epu8 _simdemu_adds_epu8
278 #define _simd_subs_epu8 _simdemu_subs_epu8
279 #define _simd_add_epi8 _simdemu_add_epi8
280 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
281 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
282 #define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
283 #define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
284 #define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
285 #define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
286 #define _simd_movemask_epi8 _simdemu_movemask_epi8
287 #define _simd_permute_ps _simdemu_permute_ps
288 #define _simd_permute_epi32 _simdemu_permute_epi32
289 #define _simd_srlv_epi32 _simdemu_srlv_epi32
290 #define _simd_sllv_epi32 _simdemu_sllv_epi32
291 
292 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
293 SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
294 SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
295 SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
296 SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
297 SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
298 SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
299 SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
300 SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
301 SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
302 SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
303 SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
304 SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
305 SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
306 SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
307 SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
308 SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
309 SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
310 SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
311 SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
312 SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
313 SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
314 SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
315 SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
316 SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
317 SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
318 SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
319 SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
320 SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
321 
322 #define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
323 #define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
324 #define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
325 #define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
326 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
327 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
328 #define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
329 #define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
330 
331 #define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
332 #define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
333 #define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
334 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
335 
336 #define _simd128_fmadd_ps _mm_fmaddemu_ps
337 #define _simd_fmadd_ps _mm_fmaddemu256_ps
338 #define _simd_fmsub_ps _mm_fmsubemu256_ps
339 #define _simd_shuffle_epi8 _simdemu_shuffle_epi8
340 SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
341 
342 INLINE
343 __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
344 {
345     __m128 res = _mm_mul_ps(a, b);
346     res = _mm_add_ps(res, c);
347     return res;
348 }
349 
350 INLINE
351 __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
352 {
353     __m256 res = _mm256_mul_ps(a, b);
354     res = _mm256_add_ps(res, c);
355     return res;
356 }
357 
358 INLINE
359 __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
360 {
361     __m256 res = _mm256_mul_ps(a, b);
362     res = _mm256_sub_ps(res, c);
363     return res;
364 }
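// Note: these emulated fused-multiply helpers round the intermediate product
// before the add/subtract, so results may differ from true FMA hardware by an
// ulp. Illustrative use:
//
//     __m256 r = _simd_fmadd_ps(a, b, c);   // r = a * b + c, as two rounded ops here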
365 
366 INLINE
367 __m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
368 {
369     uint32_t *pOffsets = (uint32_t*)&vOffsets;
370     simdscalar vResult;
371     float* pResult = (float*)&vResult;
372     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
373     {
374         uint32_t offset = pOffsets[i];
375         offset = offset * scale;
376         pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
377     }
378 
379     return vResult;
380 }
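// Usage sketch for the gather emulation above (offsets are scaled to byte
// offsets from pBase, matching the _mm256_i32gather_ps convention). Assuming
// pBase points at an array of at least 15 floats:
//
//     __m256i idx = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
//     __m256  v   = _simd_i32gather_ps(pBase, idx, 4);   // lane i reads pBase[2 * i]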
381 
382 INLINE
383 __m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
384 {
385     uint32_t *pOffsets = (uint32_t*)&vOffsets;
386     simdscalar vResult = vSrc;
387     float* pResult = (float*)&vResult;
388     DWORD index;
389     uint32_t mask = _simd_movemask_ps(vMask);
390     while (_BitScanForward(&index, mask))
391     {
392         mask &= ~(1 << index);
393         uint32_t offset = pOffsets[index];
394         offset = offset * scale;
395         pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
396     }
397 
398     return vResult;
399 }
400 
401 INLINE
402 __m256i _simd_abs_epi32(__m256i a)
403 {
404         __m128i aHi = _mm256_extractf128_si256(a, 1);
405         __m128i aLo = _mm256_castsi256_si128(a);
406         __m128i absLo = _mm_abs_epi32(aLo);
407         __m128i absHi = _mm_abs_epi32(aHi);
408         __m256i result = _mm256_castsi128_si256(absLo);
409         result = _mm256_insertf128_si256(result, absHi, 1);
410         return result;
411 }
412 
413 INLINE
414 int _simdemu_movemask_epi8(__m256i a)
415 {
416     __m128i aHi = _mm256_extractf128_si256(a, 1);
417     __m128i aLo = _mm256_castsi256_si128(a);
418 
419     int resHi = _mm_movemask_epi8(aHi);
420     int resLo = _mm_movemask_epi8(aLo);
421 
422     return (resHi << 16) | resLo;
423 }
424 
425 INLINE
426 __m256i _simd_cvtepu8_epi16(__m128i a)
427 {
428     __m128i resultlo = _mm_cvtepu8_epi16(a);
429     __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
430 
431     __m256i result = _mm256_castsi128_si256(resultlo);
432 
433     return _mm256_insertf128_si256(result, resulthi, 1);
434 }
435 
436 INLINE
437 __m256i _simd_cvtepu8_epi32(__m128i a)
438 {
439     __m128i resultlo = _mm_cvtepu8_epi32(a);
440     __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
441 
442     __m256i result = _mm256_castsi128_si256(resultlo);
443 
444     return _mm256_insertf128_si256(result, resulthi, 1);
445 }
446 
447 INLINE
448 __m256i _simd_cvtepu16_epi32(__m128i a)
449 {
450     __m128i resultlo = _mm_cvtepu16_epi32(a);
451     __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
452 
453     __m256i result = _mm256_castsi128_si256(resultlo);
454 
455     return _mm256_insertf128_si256(result, resulthi, 1);
456 }
457 
458 INLINE
459 __m256i _simd_packus_epi16(__m256i a, __m256i b)
460 {
461     __m128i alo = _mm256_extractf128_si256(a, 0);
462     __m128i ahi = _mm256_extractf128_si256(a, 1);
463 
464     __m128i blo = _mm256_extractf128_si256(b, 0);
465     __m128i bhi = _mm256_extractf128_si256(b, 1);
466 
467     __m128i resultlo = _mm_packus_epi16(alo, blo);
468     __m128i resulthi = _mm_packus_epi16(ahi, bhi);
469 
470     __m256i result = _mm256_castsi128_si256(resultlo);
471 
472     return _mm256_insertf128_si256(result, resulthi, 1);
473 }
474 
475 INLINE
476 __m256i _simd_packs_epi16(__m256i a, __m256i b)
477 {
478     __m128i alo = _mm256_extractf128_si256(a, 0);
479     __m128i ahi = _mm256_extractf128_si256(a, 1);
480 
481     __m128i blo = _mm256_extractf128_si256(b, 0);
482     __m128i bhi = _mm256_extractf128_si256(b, 1);
483 
484     __m128i resultlo = _mm_packs_epi16(alo, blo);
485     __m128i resulthi = _mm_packs_epi16(ahi, bhi);
486 
487     __m256i result = _mm256_castsi128_si256(resultlo);
488 
489     return _mm256_insertf128_si256(result, resulthi, 1);
490 }
491 
492 INLINE
493 __m256i _simd_packus_epi32(__m256i a, __m256i b)
494 {
495     __m128i alo = _mm256_extractf128_si256(a, 0);
496     __m128i ahi = _mm256_extractf128_si256(a, 1);
497 
498     __m128i blo = _mm256_extractf128_si256(b, 0);
499     __m128i bhi = _mm256_extractf128_si256(b, 1);
500 
501     __m128i resultlo = _mm_packus_epi32(alo, blo);
502     __m128i resulthi = _mm_packus_epi32(ahi, bhi);
503 
504     __m256i result = _mm256_castsi128_si256(resultlo);
505 
506     return _mm256_insertf128_si256(result, resulthi, 1);
507 }
508 
509 INLINE
510 __m256i _simd_packs_epi32(__m256i a, __m256i b)
511 {
512     __m128i alo = _mm256_extractf128_si256(a, 0);
513     __m128i ahi = _mm256_extractf128_si256(a, 1);
514 
515     __m128i blo = _mm256_extractf128_si256(b, 0);
516     __m128i bhi = _mm256_extractf128_si256(b, 1);
517 
518     __m128i resultlo = _mm_packs_epi32(alo, blo);
519     __m128i resulthi = _mm_packs_epi32(ahi, bhi);
520 
521     __m256i result = _mm256_castsi128_si256(resultlo);
522 
523     return _mm256_insertf128_si256(result, resulthi, 1);
524 }
525 
526 #else
527 
528 #define _simd_mul_epi32 _mm256_mul_epi32
529 #define _simd_mullo_epi32 _mm256_mullo_epi32
530 #define _simd_sub_epi32 _mm256_sub_epi32
531 #define _simd_sub_epi64 _mm256_sub_epi64
532 #define _simd_min_epi32 _mm256_min_epi32
533 #define _simd_max_epi32 _mm256_max_epi32
534 #define _simd_min_epu32 _mm256_min_epu32
535 #define _simd_max_epu32 _mm256_max_epu32
536 #define _simd_add_epi32 _mm256_add_epi32
537 #define _simd_and_si _mm256_and_si256
538 #define _simd_andnot_si _mm256_andnot_si256
539 #define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
540 #define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
541 #define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
542 #define _simd_or_si _mm256_or_si256
543 #define _simd_xor_si _mm256_xor_si256
544 #define _simd_castps_si _mm256_castps_si256
545 
546 #define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
547 #define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
548 #define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
549 #define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
550 #define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
551 #define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
552 #define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
553 #define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
554 
555 #define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
556 #define _simd_slli_epi32 _mm256_slli_epi32
557 #define _simd_srai_epi32 _mm256_srai_epi32
558 #define _simd_srli_epi32 _mm256_srli_epi32
559 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
560 #define _simd128_fmadd_ps _mm_fmadd_ps
561 #define _simd_fmadd_ps _mm256_fmadd_ps
562 #define _simd_fmsub_ps _mm256_fmsub_ps
563 #define _simd_shuffle_epi8 _mm256_shuffle_epi8
564 #define _simd_adds_epu8 _mm256_adds_epu8
565 #define _simd_subs_epu8 _mm256_subs_epu8
566 #define _simd_add_epi8 _mm256_add_epi8
567 #define _simd_i32gather_ps _mm256_i32gather_ps
568 #define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
569 #define _simd_abs_epi32 _mm256_abs_epi32
570 
571 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
572 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
573 #define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
574 #define _simd_cmpeq_epi8  _mm256_cmpeq_epi8
575 #define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
576 #define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
577 #define _simd_movemask_epi8 _mm256_movemask_epi8
578 #define _simd_permute_ps _mm256_permutevar8x32_ps
579 #define _simd_permute_epi32 _mm256_permutevar8x32_epi32
580 #define _simd_srlv_epi32 _mm256_srlv_epi32
581 #define _simd_sllv_epi32 _mm256_sllv_epi32
582 #define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
583 #define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
584 #define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
585 #define _simd_packus_epi16 _mm256_packus_epi16
586 #define _simd_packs_epi16 _mm256_packs_epi16
587 #define _simd_packus_epi32 _mm256_packus_epi32
588 #define _simd_packs_epi32 _mm256_packs_epi32
589 
590 #endif
591 
592 #define _simd_unpacklo_ps _mm256_unpacklo_ps
593 #define _simd_unpackhi_ps _mm256_unpackhi_ps
594 #define _simd_unpacklo_pd _mm256_unpacklo_pd
595 #define _simd_unpackhi_pd _mm256_unpackhi_pd
596 #define _simd_insertf128_ps _mm256_insertf128_ps
597 #define _simd_insertf128_pd _mm256_insertf128_pd
598 #define _simd_insertf128_si _mm256_insertf128_si256
599 #define _simd_extractf128_ps _mm256_extractf128_ps
600 #define _simd_extractf128_pd _mm256_extractf128_pd
601 #define _simd_extractf128_si _mm256_extractf128_si256
602 #define _simd_permute2f128_ps _mm256_permute2f128_ps
603 #define _simd_permute2f128_pd _mm256_permute2f128_pd
604 #define _simd_permute2f128_si _mm256_permute2f128_si256
605 #define _simd_shuffle_ps _mm256_shuffle_ps
606 #define _simd_shuffle_pd _mm256_shuffle_pd
607 #define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
608 #define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
609 #define _simd_set1_epi32 _mm256_set1_epi32
610 #define _simd_set_epi32 _mm256_set_epi32
611 #define _simd_set1_epi8 _mm256_set1_epi8
612 #define _simd_setzero_si _mm256_setzero_si256
613 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
614 #define _simd_store_si _mm256_store_si256
615 #define _simd_broadcast_ss _mm256_broadcast_ss
616 #define _simd_maskstore_ps _mm256_maskstore_ps
617 #define _simd_load_si _mm256_load_si256
618 #define _simd_loadu_si _mm256_loadu_si256
619 #define _simd_sub_ps _mm256_sub_ps
620 #define _simd_testz_ps _mm256_testz_ps
621 #define _simd_xor_ps _mm256_xor_ps
622 
623 INLINE
624 simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
625 {
626     __m128i lo = _mm_loadu_si128(loaddr);
627     __m128i hi = _mm_loadu_si128(hiaddr);
628 
629     return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
630 }
631 
632 INLINE
633 void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
634 {
635     _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
636     _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
637 }
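// Illustrative round trip for the unaligned pair load/store above, assuming
// pLo and pHi each point at 16 writable bytes:
//
//     simdscalari v = _simd_loadu2_si((const __m128i*)pHi, (const __m128i*)pLo);
//     _simd_storeu2_si((__m128i*)pHi, (__m128i*)pLo, v);   // writes both halves back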
638 
639 INLINE
640 simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
641 {
642     return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
643 }
644 
645 INLINE
646 simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
647 {
648     return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
649 }
650 
651 // convert bitmask to vector mask
652 INLINE
653 simdscalar vMask(int32_t mask)
654 {
655     __m256i vec = _mm256_set1_epi32(mask);
656     const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
657     vec = _simd_and_si(vec, bit);
658     vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
659     return _simd_castsi_ps(vec);
660 }
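// Example: vMask(0x05) yields all-ones in lanes 0 and 2 and zero elsewhere,
// suitable as a blend or masked-store predicate:
//
//     simdscalar m = vMask(0x05);
//     simdscalar r = _simd_blendv_ps(a, b, m);   // lanes 0 and 2 taken from b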
661 
662 INLINE
663 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
664 {
665     OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
666     _mm256_store_ps(rArray, r);
667     _mm256_store_ps(sArray, s);
668     rArray[rlane] = sArray[slane];
669     r = _mm256_load_ps(rArray);
670 }
671 
672 INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
673 {
674     __m128i aHi = _mm256_extractf128_si256(a, 1);
675     __m128i aLo = _mm256_castsi256_si128(a);
676 
677     __m128i resHi = _mm_slli_epi32(aHi, i);
678     __m128i resLo = _mm_slli_epi32(aLo, i);
679 
680     __m256i result = _mm256_castsi128_si256(resLo);
681             result = _mm256_insertf128_si256(result, resHi, 1);
682 
683     return result;
684 }
685 
686 INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
687 {
688     __m128i aHi = _mm256_extractf128_si256(a, 1);
689     __m128i aLo = _mm256_castsi256_si128(a);
690 
691     __m128i resHi = _mm_srai_epi32(aHi, i);
692     __m128i resLo = _mm_srai_epi32(aLo, i);
693 
694     __m256i result = _mm256_castsi128_si256(resLo);
695             result = _mm256_insertf128_si256(result, resHi, 1);
696 
697     return result;
698 }
699 
700 INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
701 {
702     __m128i aHi = _mm256_extractf128_si256(a, 1);
703     __m128i aLo = _mm256_castsi256_si128(a);
704 
705     __m128i resHi = _mm_srli_epi32(aHi, i);
706     __m128i resLo = _mm_srli_epi32(aLo, i);
707 
708     __m256i result = _mm256_castsi128_si256(resLo);
709     result = _mm256_insertf128_si256(result, resHi, 1);
710 
711     return result;
712 }
713 
714 INLINE
715 void _simdvec_transpose(simdvector &v)
716 {
717     SWR_ASSERT(false, "Need to implement 8 wide version");
718 }
719 
720 #else
721 #error Unsupported vector width
722 #endif
723 
724 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
725 INLINE
726 void _simdvec_load_ps(simdvector& r, const float *p)
727 {
728     r[0] = _simd_set1_ps(p[0]);
729     r[1] = _simd_set1_ps(p[1]);
730     r[2] = _simd_set1_ps(p[2]);
731     r[3] = _simd_set1_ps(p[3]);
732 }
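// Usage sketch (illustrative): broadcast a single xyzw point across all lanes.
//
//     float pt[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//     simdvector v;
//     _simdvec_load_ps(v, pt);   // v.x = 1,1,1,..., v.y = 2,2,2,..., etc.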
733 
734 INLINE
735 void _simdvec_mov(simdvector& r, const simdscalar& s)
736 {
737     r[0] = s;
738     r[1] = s;
739     r[2] = s;
740     r[3] = s;
741 }
742 
743 INLINE
744 void _simdvec_mov(simdvector& r, const simdvector& v)
745 {
746     r[0] = v[0];
747     r[1] = v[1];
748     r[2] = v[2];
749     r[3] = v[3];
750 }
751 
752 #if 0
753 // just move a lane from the source simdvector to dest simdvector
754 INLINE
755 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
756 {
757     _simd_mov(r[0], rlane, s[0], slane);
758     _simd_mov(r[1], rlane, s[1], slane);
759     _simd_mov(r[2], rlane, s[2], slane);
760     _simd_mov(r[3], rlane, s[3], slane);
761 }
762 
763 #endif
764 INLINE
765 void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
766 {
767     simdscalar tmp;
768     r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
769 
770     tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
771     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
772 
773     tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
774     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
775 }
776 
777 INLINE
778 void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
779 {
780     simdscalar tmp;
781     r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
782 
783     tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
784     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
785 
786     tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
787     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
788 
789     tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
790     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
791 }
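// Example: a lane-parallel 4-component dot product, e.g. a clip-distance test.
// 'position' and 'plane' are illustrative simdvector inputs:
//
//     simdscalar d;
//     _simdvec_dp4_ps(d, position, plane);   // d[lane] = dot(position[lane], plane[lane])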
792 
793 INLINE
794 simdscalar _simdvec_rcp_length_ps(const simdvector& v)
795 {
796     simdscalar length;
797     _simdvec_dp4_ps(length, v, v);
798     return _simd_rsqrt_ps(length);
799 }
800 
801 INLINE
802 void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
803 {
804     simdscalar vecLength;
805     vecLength = _simdvec_rcp_length_ps(v);
806 
807     r[0] = _simd_mul_ps(v[0], vecLength);
808     r[1] = _simd_mul_ps(v[1], vecLength);
809     r[2] = _simd_mul_ps(v[2], vecLength);
810     r[3] = _simd_mul_ps(v[3], vecLength);
811 }
812 
813 INLINE
814 void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
815 {
816     r[0] = _simd_mul_ps(v[0], s);
817     r[1] = _simd_mul_ps(v[1], s);
818     r[2] = _simd_mul_ps(v[2], s);
819     r[3] = _simd_mul_ps(v[3], s);
820 }
821 
822 INLINE
823 void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
824 {
825     r[0] = _simd_mul_ps(v0[0], v1[0]);
826     r[1] = _simd_mul_ps(v0[1], v1[1]);
827     r[2] = _simd_mul_ps(v0[2], v1[2]);
828     r[3] = _simd_mul_ps(v0[3], v1[3]);
829 }
830 
831 INLINE
832 void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
833 {
834     r[0] = _simd_add_ps(v0[0], v1[0]);
835     r[1] = _simd_add_ps(v0[1], v1[1]);
836     r[2] = _simd_add_ps(v0[2], v1[2]);
837     r[3] = _simd_add_ps(v0[3], v1[3]);
838 }
839 
840 INLINE
841 void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
842 {
843     r[0] = _simd_min_ps(v0[0], s);
844     r[1] = _simd_min_ps(v0[1], s);
845     r[2] = _simd_min_ps(v0[2], s);
846     r[3] = _simd_min_ps(v0[3], s);
847 }
848 
849 INLINE
850 void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
851 {
852     r[0] = _simd_max_ps(v0[0], s);
853     r[1] = _simd_max_ps(v0[1], s);
854     r[2] = _simd_max_ps(v0[2], s);
855     r[3] = _simd_max_ps(v0[3], s);
856 }
857 
858 // Matrix4x4 * Vector4
859 //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
860 //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
861 //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
862 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
863 INLINE
864 void _simd_mat4x4_vec4_multiply(
865     simdvector& result,
866     const float *pMatrix,
867     const simdvector& v)
868 {
869     simdscalar m;
870     simdscalar r0;
871     simdscalar r1;
872 
873     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
874     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
875     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
876     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
877     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
878     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
879     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
880     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
881     m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
882     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
883     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
884     result[0] = r0;
885 
886     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
887     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
888     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
889     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
890     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
891     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
892     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
893     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
894     m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
895     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
896     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
897     result[1] = r0;
898 
899     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
900     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
901     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
902     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
903     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
904     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
905     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
906     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
907     m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
908     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
909     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
910     result[2] = r0;
911 
912     m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
913     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
914     m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
915     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
916     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
917     m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
918     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
919     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
920     m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
921     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
922     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
923     result[3] = r0;
924 }
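// Usage sketch (illustrative): transform KNOB_SIMD_WIDTH positions through one
// row-major 4x4 matrix; the matrix is scalar, the vertices are per-lane.
// 'viewProj' (a float[4][4]) and 'worldPos' (a per-lane simdvector) are assumed inputs:
//
//     simdvector clipPos;
//     _simd_mat4x4_vec4_multiply(clipPos, &viewProj[0][0], worldPos);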
925 
926 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
927 //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
928 //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
929 //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
930 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
931 INLINE
932 void _simd_mat3x3_vec3_w0_multiply(
933     simdvector& result,
934     const float *pMatrix,
935     const simdvector& v)
936 {
937     simdscalar m;
938     simdscalar r0;
939     simdscalar r1;
940 
941     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
942     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
943     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
944     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
945     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
946     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
947     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
948     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
949     result[0] = r0;
950 
951     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
952     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
953     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
954     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
955     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
956     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
957     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
958     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
959     result[1] = r0;
960 
961     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
962     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
963     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
964     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
965     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
966     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
967     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
968     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
969     result[2] = r0;
970 
971     result[3] = _simd_setzero_ps();
972 }
973 
974 // Matrix4x4 * Vector3 - Position vector where w = 1.
975 //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
976 //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
977 //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
978 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
979 INLINE
980 void _simd_mat4x4_vec3_w1_multiply(
981     simdvector& result,
982     const float *pMatrix,
983     const simdvector& v)
984 {
985     simdscalar m;
986     simdscalar r0;
987     simdscalar r1;
988 
989     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
990     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
991     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
992     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
993     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
994     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
995     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
996     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
997     m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
998     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
999     result[0] = r0;
1000 
1001     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
1002     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
1003     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
1004     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
1005     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
1006     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
1007     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
1008     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1009     m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
1010     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1011     result[1] = r0;
1012 
1013     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
1014     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
1015     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
1016     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
1017     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
1018     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
1019     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
1020     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1021     m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
1022     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1023     result[2] = r0;
1024 
1025     m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
1026     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
1027     m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
1028     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
1029     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
1030     m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
1031     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
1032     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1033     m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
1034     result[3]   = _simd_add_ps(r0, m);          // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1035 }
1036 
1037 INLINE
1038 void _simd_mat4x3_vec3_w1_multiply(
1039     simdvector& result,
1040     const float *pMatrix,
1041     const simdvector& v)
1042 {
1043     simdscalar m;
1044     simdscalar r0;
1045     simdscalar r1;
1046 
1047     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
1048     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
1049     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
1050     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
1051     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
1052     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
1053     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
1054     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1055     m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
1056     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1057     result[0] = r0;
1058 
1059     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
1060     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
1061     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
1062     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
1063     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
1064     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
1065     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
1066     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1067     m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
1068     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1069     result[1] = r0;
1070 
1071     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
1072     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
1073     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
1074     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
1075     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
1076     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
1077     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
1078     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1079     m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
1080     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1081     result[2] = r0;
1082     result[3] = _simd_set1_ps(1.0f);
1083 }
1084 
1085 //////////////////////////////////////////////////////////////////////////
1086 /// @brief Compute plane equation vA * vX + vB * vY + vC
1087 INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
1088 {
1089     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
1090     vOut = _simd_fmadd_ps(vB, vY, vOut);
1091     return vOut;
1092 }
1093 
1094 //////////////////////////////////////////////////////////////////////////
1095 /// @brief Compute plane equation vA * vX + vB * vY + vC
1096 INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
1097 {
1098     __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
1099     vOut = _simd128_fmadd_ps(vB, vY, vOut);
1100     return vOut;
1101 }
1102 
1103 //////////////////////////////////////////////////////////////////////////
1104 /// @brief Interpolates a single component.
1105 /// @param vI - barycentric I
1106 /// @param vJ - barycentric J
1107 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1108 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
1109 static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
1110 {
1111     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
1112     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
1113     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
1114 
1115     simdscalar vA = _simd_broadcast_ss(pInterpA);
1116     simdscalar vB = _simd_broadcast_ss(pInterpB);
1117     simdscalar vC = _simd_broadcast_ss(pInterpC);
1118 
1119     simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
1120     vC = _simd_mul_ps(vk, vC);
1121 
1122     return vplaneps(vA, vB, vC, vI, vJ);
1123 }
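// The routine above evaluates A*i + B*j + C*(1 - i - j) per lane, the usual
// barycentric blend of the three per-vertex coefficients. Illustrative
// instantiation (attribute 0, component 2, default 4 components per vertex):
//
//     simdscalar z = InterpolateComponent<0, 2>(vI, vJ, pInterpBuffer);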
1124 
1125 //////////////////////////////////////////////////////////////////////////
1126 /// @brief Interpolates a single component.
1127 /// @param vI - barycentric I
1128 /// @param vJ - barycentric J
1129 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1130 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
1131 static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
1132 {
1133     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
1134     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
1135     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
1136 
1137     __m128 vA = _mm_broadcast_ss(pInterpA);
1138     __m128 vB = _mm_broadcast_ss(pInterpB);
1139     __m128 vC = _mm_broadcast_ss(pInterpC);
1140 
1141     __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
1142     vC = _mm_mul_ps(vk, vC);
1143 
1144     return vplaneps128(vA, vB, vC, vI, vJ);
1145 }
1146 
1147 static INLINE __m128 _simd128_abs_ps(__m128 a)
1148 {
1149     __m128i ai = _mm_castps_si128(a);
1150     return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
1151 }
1152 
1153 static INLINE simdscalar _simd_abs_ps(simdscalar a)
1154 {
1155     simdscalari ai = _simd_castps_si(a);
1156     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
1157 }
1158 
1159 INLINE
1160 UINT pdep_u32(UINT a, UINT mask)
1161 {
1162 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1163     return _pdep_u32(a, mask);
1164 #else
1165     UINT result = 0;
1166 
1167     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
1168     // using bsf instead of funky loop
1169     DWORD maskIndex;
1170     while (_BitScanForward(&maskIndex, mask))
1171     {
1172         // 1. isolate lowest set bit of mask
1173         const UINT lowest = 1 << maskIndex;
1174 
1175         // 2. populate LSB from src
1176         const UINT LSB = (UINT)((int)(a << 31) >> 31);
1177 
1178         // 3. copy bit from mask
1179         result |= LSB & lowest;
1180 
1181         // 4. clear lowest bit
1182         mask &= ~lowest;
1183 
1184         // 5. prepare for next iteration
1185         a >>= 1;
1186     }
1187 
1188     return result;
1189 #endif
1190 }
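// Worked example: pdep_u32 deposits the low bits of 'a' into the set-bit
// positions of 'mask'. With mask 0b11010 (bits 1, 3, 4 set) and a = 0b101:
//
//     UINT r = pdep_u32(0x5u, 0x1Au);   // r == 0x12 (0b10010)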
1191 
1192 INLINE
1193 UINT pext_u32(UINT a, UINT mask)
1194 {
1195 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1196     return _pext_u32(a, mask);
1197 #else
1198     UINT result = 0;
1199     DWORD maskIndex;
1200     uint32_t currentBit = 0;
1201     while (_BitScanForward(&maskIndex, mask))
1202     {
1203         // 1. isolate lowest set bit of mask
1204         const UINT lowest = 1 << maskIndex;
1205 
1206         // 2. copy bit from mask
1207         result |= ((a & lowest) > 0) << currentBit++;
1208 
1209         // 3. clear lowest bit
1210         mask &= ~lowest;
1211     }
1212     return result;
1213 #endif
1214 }
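// Worked example: pext_u32 extracts the bits of 'a' at the set-bit positions of
// 'mask' and packs them toward bit 0. With mask 0b11010 and a = 0b10010:
//
//     UINT r = pext_u32(0x12u, 0x1Au);   // r == 0x5 (0b101)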
1215 
1216 #if ENABLE_AVX512_SIMD16
1217 #include "simd16intrin.h"
1218 #endif//ENABLE_AVX512_SIMD16
1219 
1220 #endif//__SWR_SIMDINTRIN_H__
1221