1 /*
2 Copyright (c) 2011 Apple Inc.
3 http://continuousphysics.com/Bullet/
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14
15 This source version has been altered.
16 */
17
18 #if defined (_WIN32) || defined (__i386__)
19 #define BT_USE_SSE_IN_API
20 #endif
21
22
23 #include "btVector3.h"
24
25
26
27 #if defined BT_USE_SIMD_VECTOR3
28
29 #if DEBUG
30 #include <string.h>//for memset
31 #endif
32
33
34 #ifdef __APPLE__
35 #include <stdint.h>
36 typedef float float4 __attribute__ ((vector_size(16)));
37 #else
38 #define float4 __m128
39 #endif
40 //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
41
42
43 #if defined BT_USE_SSE || defined _WIN32
44
45 #define LOG2_ARRAY_SIZE 6
46 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
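// Dot products are computed in tiles of STACK_ARRAY_COUNT*4 points. Each tile's results are
// staged in a stack array so that the (rare) search for the index of a new maximum/minimum can
// be done by rescanning the tile, instead of tracking indices inside the hot loop.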
47
48 #include <emmintrin.h>
49
50 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
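// Reference semantics (a scalar sketch, not compiled here): each input point is stored as
// 4 floats (x, y, z and an unused w); the routine returns the index of the first point with
// the largest dot product against vec and writes that dot product to *dotResult.
//
//   long maxdot_ref( const float *vv, const float *vec, unsigned long count, float *dotResult )
//   {
//       long best = -1;
//       float bestDot = -BT_INFINITY;
//       for( unsigned long i = 0; i < count; i++ )
//       {
//           float d = vv[4*i]*vec[0] + vv[4*i+1]*vec[1] + vv[4*i+2]*vec[2];   // w is ignored
//           if( d > bestDot ) { bestDot = d; best = (long) i; }
//       }
//       *dotResult = bestDot;
//       return best;
//   }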
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
52 {
53 const float4 *vertices = (const float4*) vv;
54 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
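// indexTable maps the 4-bit result of _mm_movemask_ps( _mm_cmpeq_ps( ... ) ) to its lowest set
// lane, i.e. the first lane within a group of 4 that equals the maximum. Entry 0 is never used,
// because the table is only consulted once at least one lane has matched.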
55 float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
56 float4 vvec = _mm_loadu_ps( vec );
57 float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
58 float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
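// vvec = (x, y, z, w). vLo = (x, y, x, y) and vHi = (z, z, z, z), so two xy pairs and four z
// values can be multiplied at once without ever touching the w lanes.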
59
60 long maxIndex = -1L;
61
62 size_t segment = 0;
63 float4 stack_array[ STACK_ARRAY_COUNT ];
64
65 #if DEBUG
66 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
67 #endif
68
69 size_t index;
70 float4 max;
71 // Faster loop without cleanup code for full tiles
72 for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
73 {
74 max = dotMax;
75
76 for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
77 { // do four dot products at a time. Carefully avoid touching the w element.
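// movelh/movehl split two points each into their xy and zw halves; the 0x88 shuffle then picks
// the even lanes (x0x1x2x3 scaled by vLo.x, and z0z1z2z3) and 0xdd picks the odd lanes
// (y0y1y2y3 scaled by vLo.y) -- a 4-wide transpose that never feeds the w values into the sums.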
78 float4 v0 = vertices[0];
79 float4 v1 = vertices[1];
80 float4 v2 = vertices[2];
81 float4 v3 = vertices[3]; vertices += 4;
82
83 float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
84 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
85 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
86 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
87
88 lo0 = lo0*vLo;
89 lo1 = lo1*vLo;
90 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
91 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
92 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
93 z = z*vHi;
94 x = x+y;
95 x = x+z;
96 stack_array[index] = x;
97 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
98
99 v0 = vertices[0];
100 v1 = vertices[1];
101 v2 = vertices[2];
102 v3 = vertices[3]; vertices += 4;
103
104 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
105 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
106 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
107 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
108
109 lo0 = lo0*vLo;
110 lo1 = lo1*vLo;
111 z = _mm_shuffle_ps(hi0, hi1, 0x88);
112 x = _mm_shuffle_ps(lo0, lo1, 0x88);
113 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
114 z = z*vHi;
115 x = x+y;
116 x = x+z;
117 stack_array[index+1] = x;
118 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
119
120 v0 = vertices[0];
121 v1 = vertices[1];
122 v2 = vertices[2];
123 v3 = vertices[3]; vertices += 4;
124
125 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
126 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
127 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
128 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
129
130 lo0 = lo0*vLo;
131 lo1 = lo1*vLo;
132 z = _mm_shuffle_ps(hi0, hi1, 0x88);
133 x = _mm_shuffle_ps(lo0, lo1, 0x88);
134 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
135 z = z*vHi;
136 x = x+y;
137 x = x+z;
138 stack_array[index+2] = x;
139 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
140
141 v0 = vertices[0];
142 v1 = vertices[1];
143 v2 = vertices[2];
144 v3 = vertices[3]; vertices += 4;
145
146 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
147 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
148 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
149 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
150
151 lo0 = lo0*vLo;
152 lo1 = lo1*vLo;
153 z = _mm_shuffle_ps(hi0, hi1, 0x88);
154 x = _mm_shuffle_ps(lo0, lo1, 0x88);
155 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
156 z = z*vHi;
157 x = x+y;
158 x = x+z;
159 stack_array[index+3] = x;
160 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
161
162 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
163 }
164
165 // If we found a new max
166 if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
167 {
168 // copy the new max across all lanes of our max accumulator
169 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
170 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
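// 0x4e swaps the two 64-bit halves and 0xb1 swaps within each pair, so two max operations
// broadcast the largest of the four lanes into every lane of max.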
171
172 dotMax = max;
173
174 // find first occurrence of that max
175 size_t test;
176 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
177 {}
178 // record where it is.
179 maxIndex = 4*index + segment + indexTable[test];
180 }
181 }
182
183 // account for work we've already done
184 count -= segment;
185
186 // Deal with the last < STACK_ARRAY_COUNT vectors
187 max = dotMax;
188 index = 0;
189
190
191 if( btUnlikely( count > 16) )
192 {
193 for( ; index + 4 <= count / 4; index+=4 )
194 { // do four dot products at a time. Carefully avoid touching the w element.
195 float4 v0 = vertices[0];
196 float4 v1 = vertices[1];
197 float4 v2 = vertices[2];
198 float4 v3 = vertices[3]; vertices += 4;
199
200 float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
201 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
202 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
203 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
204
205 lo0 = lo0*vLo;
206 lo1 = lo1*vLo;
207 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210 z = z*vHi;
211 x = x+y;
212 x = x+z;
213 stack_array[index] = x;
214 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
215
216 v0 = vertices[0];
217 v1 = vertices[1];
218 v2 = vertices[2];
219 v3 = vertices[3]; vertices += 4;
220
221 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
222 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
223 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
224 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
225
226 lo0 = lo0*vLo;
227 lo1 = lo1*vLo;
228 z = _mm_shuffle_ps(hi0, hi1, 0x88);
229 x = _mm_shuffle_ps(lo0, lo1, 0x88);
230 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
231 z = z*vHi;
232 x = x+y;
233 x = x+z;
234 stack_array[index+1] = x;
235 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
236
237 v0 = vertices[0];
238 v1 = vertices[1];
239 v2 = vertices[2];
240 v3 = vertices[3]; vertices += 4;
241
242 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
243 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
244 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
245 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
246
247 lo0 = lo0*vLo;
248 lo1 = lo1*vLo;
249 z = _mm_shuffle_ps(hi0, hi1, 0x88);
250 x = _mm_shuffle_ps(lo0, lo1, 0x88);
251 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
252 z = z*vHi;
253 x = x+y;
254 x = x+z;
255 stack_array[index+2] = x;
256 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
257
258 v0 = vertices[0];
259 v1 = vertices[1];
260 v2 = vertices[2];
261 v3 = vertices[3]; vertices += 4;
262
263 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
264 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
265 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
266 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
267
268 lo0 = lo0*vLo;
269 lo1 = lo1*vLo;
270 z = _mm_shuffle_ps(hi0, hi1, 0x88);
271 x = _mm_shuffle_ps(lo0, lo1, 0x88);
272 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
273 z = z*vHi;
274 x = x+y;
275 x = x+z;
276 stack_array[index+3] = x;
277 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
278
279 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
280 }
281 }
282
283 size_t localCount = (count & -4L) - 4*index;
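// count was already reduced by the full tiles above; (count & -4L) rounds the remainder down to
// a multiple of 4 points, and 4*index removes the groups already handled by the unrolled loop,
// leaving the number of points still to be processed in groups of 4.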
284 if( localCount )
285 {
286 #ifdef __APPLE__
287 float4 t0, t1, t2, t3, t4;
288 float4 * sap = &stack_array[index + localCount / 4];
289 vertices += localCount; // counter the offset
290 size_t byteIndex = -(localCount) * sizeof(float);
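// byteIndex starts negative and counts up to zero, so the loop condition is simply the flags from
// the add: the loads address vertices + 4*byteIndex (vertices was advanced by localCount above to
// compensate), and the staged results land at sap + byteIndex, one float4 per group of 4 points.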
291 //AT&T Code style assembly
292 asm volatile
293 ( ".align 4 \n\
294 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
295 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
296 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
297 movaps %[t0], %[max] // vertices[0] \n\
298 movlhps %[t1], %[max] // x0y0x1y1 \n\
299 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
300 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
301 mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
302 movhlps %[t0], %[t1] // z0w0z1w1 \n\
303 movaps %[t3], %[t0] // vertices[2] \n\
304 movlhps %[t4], %[t0] // x2y2x3y3 \n\
305 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
306 movhlps %[t3], %[t4] // z2w2z3w3 \n\
307 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
308 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
309 movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
310 shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
311 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
312 addps %[t3], %[max] // x + y \n\
313 addps %[t1], %[max] // x + y + z \n\
314 movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
315 maxps %[t2], %[max] // record max, restore max \n\
316 add $16, %[byteIndex] // advance loop counter\n\
317 jnz 0b \n\
318 "
319 : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
320 : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
321 : "memory", "cc"
322 );
323 index += localCount/4;
324 #else
325 {
326 for( unsigned int i=0; i<localCount/4; i++,index++)
327 { // do four dot products at a time. Carefully avoid touching the w element.
328 float4 v0 = vertices[0];
329 float4 v1 = vertices[1];
330 float4 v2 = vertices[2];
331 float4 v3 = vertices[3];
332 vertices += 4;
333
334 float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
335 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
336 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
337 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
338
339 lo0 = lo0*vLo;
340 lo1 = lo1*vLo;
341 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
342 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
343 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
344 z = z*vHi;
345 x = x+y;
346 x = x+z;
347 stack_array[index] = x;
348 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
349 }
350 }
351 #endif //__APPLE__
352 }
353
354 // process the last few points
355 if( count & 3 )
356 {
357 float4 v0, v1, v2, x, y, z;
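// The tail cases below pad the last 1-3 dot products by replicating the final point into the
// unused lanes. The first-occurrence scan that follows always reports the lowest matching lane,
// so a padded duplicate can never be reported instead of the real point.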
358 switch( count & 3 )
359 {
360 case 3:
361 {
362 v0 = vertices[0];
363 v1 = vertices[1];
364 v2 = vertices[2];
365
366 // Calculate 3 dot products, transpose, duplicate v2
367 float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
368 float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
369 lo0 = lo0*vLo;
370 z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
371 z = z*vHi;
372 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
373 lo1 = lo1*vLo;
374 x = _mm_shuffle_ps(lo0, lo1, 0x88);
375 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
376 }
377 break;
378 case 2:
379 {
380 v0 = vertices[0];
381 v1 = vertices[1];
382 float4 xy = _mm_movelh_ps(v0, v1);
383 z = _mm_movehl_ps(v1, v0);
384 xy = xy*vLo;
385 z = _mm_shuffle_ps( z, z, 0xa8);
386 x = _mm_shuffle_ps( xy, xy, 0xa8);
387 y = _mm_shuffle_ps( xy, xy, 0xfd);
388 z = z*vHi;
389 }
390 break;
391 case 1:
392 {
393 float4 xy = vertices[0];
394 z = _mm_shuffle_ps( xy, xy, 0xaa);
395 xy = xy*vLo;
396 z = z*vHi;
397 x = _mm_shuffle_ps(xy, xy, 0);
398 y = _mm_shuffle_ps(xy, xy, 0x55);
399 }
400 break;
401 }
402 x = x+y;
403 x = x+z;
404 stack_array[index] = x;
405 max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
406 index++;
407 }
408
409 // if we found a new max.
410 if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
411 { // we found a new max. Search for it
412 // find max across the max vector, place in all elements of max -- big latency hit here
413 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
414 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
415
416 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
417 // this where it actually makes a difference is handled in the early out at the top of the function,
418 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
419 // complexity, and removed it.
420
421 dotMax = max;
422
// scan for the first occurrence of max in the array
424 size_t test;
425 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
426 {}
427 maxIndex = 4*index + segment + indexTable[test];
428 }
429
430 _mm_store_ss( dotResult, dotMax);
431 return maxIndex;
432 }
433
434 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
435
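// Mirror image of _maxdot_large: identical data layout and tiling, with _mm_min_ps and
// less-than comparisons tracking the smallest dot product instead of the largest.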
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
437 {
438 const float4 *vertices = (const float4*) vv;
439 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
440 float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
441 float4 vvec = _mm_loadu_ps( vec );
442 float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
443 float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
444
445 long minIndex = -1L;
446
447 size_t segment = 0;
448 float4 stack_array[ STACK_ARRAY_COUNT ];
449
450 #if DEBUG
451 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
452 #endif
453
454 size_t index;
455 float4 min;
456 // Faster loop without cleanup code for full tiles
457 for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
458 {
459 min = dotmin;
460
461 for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
462 { // do four dot products at a time. Carefully avoid touching the w element.
463 float4 v0 = vertices[0];
464 float4 v1 = vertices[1];
465 float4 v2 = vertices[2];
466 float4 v3 = vertices[3]; vertices += 4;
467
468 float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
469 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
470 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
471 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
472
473 lo0 = lo0*vLo;
474 lo1 = lo1*vLo;
475 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
476 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
477 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
478 z = z*vHi;
479 x = x+y;
480 x = x+z;
481 stack_array[index] = x;
482 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
483
484 v0 = vertices[0];
485 v1 = vertices[1];
486 v2 = vertices[2];
487 v3 = vertices[3]; vertices += 4;
488
489 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
490 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
491 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
492 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
493
494 lo0 = lo0*vLo;
495 lo1 = lo1*vLo;
496 z = _mm_shuffle_ps(hi0, hi1, 0x88);
497 x = _mm_shuffle_ps(lo0, lo1, 0x88);
498 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
499 z = z*vHi;
500 x = x+y;
501 x = x+z;
502 stack_array[index+1] = x;
503 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
504
505 v0 = vertices[0];
506 v1 = vertices[1];
507 v2 = vertices[2];
508 v3 = vertices[3]; vertices += 4;
509
510 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
511 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
512 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
513 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
514
515 lo0 = lo0*vLo;
516 lo1 = lo1*vLo;
517 z = _mm_shuffle_ps(hi0, hi1, 0x88);
518 x = _mm_shuffle_ps(lo0, lo1, 0x88);
519 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
520 z = z*vHi;
521 x = x+y;
522 x = x+z;
523 stack_array[index+2] = x;
524 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
525
526 v0 = vertices[0];
527 v1 = vertices[1];
528 v2 = vertices[2];
529 v3 = vertices[3]; vertices += 4;
530
531 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
532 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
533 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
534 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
535
536 lo0 = lo0*vLo;
537 lo1 = lo1*vLo;
538 z = _mm_shuffle_ps(hi0, hi1, 0x88);
539 x = _mm_shuffle_ps(lo0, lo1, 0x88);
540 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
541 z = z*vHi;
542 x = x+y;
543 x = x+z;
544 stack_array[index+3] = x;
545 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
546
547 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
548 }
549
550 // If we found a new min
551 if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
552 {
553 // copy the new min across all lanes of our min accumulator
554 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
555 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
556
557 dotmin = min;
558
559 // find first occurrence of that min
560 size_t test;
561 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
562 {}
563 // record where it is.
564 minIndex = 4*index + segment + indexTable[test];
565 }
566 }
567
568 // account for work we've already done
569 count -= segment;
570
571 // Deal with the last < STACK_ARRAY_COUNT vectors
572 min = dotmin;
573 index = 0;
574
575
576 if(btUnlikely( count > 16) )
577 {
578 for( ; index + 4 <= count / 4; index+=4 )
579 { // do four dot products at a time. Carefully avoid touching the w element.
580 float4 v0 = vertices[0];
581 float4 v1 = vertices[1];
582 float4 v2 = vertices[2];
583 float4 v3 = vertices[3]; vertices += 4;
584
585 float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
586 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
587 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
588 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
589
590 lo0 = lo0*vLo;
591 lo1 = lo1*vLo;
592 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
593 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
594 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
595 z = z*vHi;
596 x = x+y;
597 x = x+z;
598 stack_array[index] = x;
599 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
600
601 v0 = vertices[0];
602 v1 = vertices[1];
603 v2 = vertices[2];
604 v3 = vertices[3]; vertices += 4;
605
606 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
607 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
608 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
609 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
610
611 lo0 = lo0*vLo;
612 lo1 = lo1*vLo;
613 z = _mm_shuffle_ps(hi0, hi1, 0x88);
614 x = _mm_shuffle_ps(lo0, lo1, 0x88);
615 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
616 z = z*vHi;
617 x = x+y;
618 x = x+z;
619 stack_array[index+1] = x;
620 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
621
622 v0 = vertices[0];
623 v1 = vertices[1];
624 v2 = vertices[2];
625 v3 = vertices[3]; vertices += 4;
626
627 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
628 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
629 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
630 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
631
632 lo0 = lo0*vLo;
633 lo1 = lo1*vLo;
634 z = _mm_shuffle_ps(hi0, hi1, 0x88);
635 x = _mm_shuffle_ps(lo0, lo1, 0x88);
636 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
637 z = z*vHi;
638 x = x+y;
639 x = x+z;
640 stack_array[index+2] = x;
641 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
642
643 v0 = vertices[0];
644 v1 = vertices[1];
645 v2 = vertices[2];
646 v3 = vertices[3]; vertices += 4;
647
648 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
649 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
650 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
651 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
652
653 lo0 = lo0*vLo;
654 lo1 = lo1*vLo;
655 z = _mm_shuffle_ps(hi0, hi1, 0x88);
656 x = _mm_shuffle_ps(lo0, lo1, 0x88);
657 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
658 z = z*vHi;
659 x = x+y;
660 x = x+z;
661 stack_array[index+3] = x;
662 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
663
664 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
665 }
666 }
667
668 size_t localCount = (count & -4L) - 4*index;
669 if( localCount )
670 {
671
672
673 #ifdef __APPLE__
674 vertices += localCount; // counter the offset
675 float4 t0, t1, t2, t3, t4;
676 size_t byteIndex = -(localCount) * sizeof(float);
677 float4 * sap = &stack_array[index + localCount / 4];
678
679 asm volatile
680 ( ".align 4 \n\
681 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
682 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
683 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
684 movaps %[t0], %[min] // vertices[0] \n\
685 movlhps %[t1], %[min] // x0y0x1y1 \n\
686 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
687 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
688 mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
689 movhlps %[t0], %[t1] // z0w0z1w1 \n\
690 movaps %[t3], %[t0] // vertices[2] \n\
691 movlhps %[t4], %[t0] // x2y2x3y3 \n\
692 movhlps %[t3], %[t4] // z2w2z3w3 \n\
693 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
694 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
695 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
696 movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
697 shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
698 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
699 addps %[t3], %[min] // x + y \n\
700 addps %[t1], %[min] // x + y + z \n\
701 movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
702 minps %[t2], %[min] // record min, restore min \n\
703 add $16, %[byteIndex] // advance loop counter\n\
704 jnz 0b \n\
705 "
706 : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
707 : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
708 : "memory", "cc"
709 );
710 index += localCount/4;
711 #else
712 {
713 for( unsigned int i=0; i<localCount/4; i++,index++)
714 { // do four dot products at a time. Carefully avoid touching the w element.
715 float4 v0 = vertices[0];
716 float4 v1 = vertices[1];
717 float4 v2 = vertices[2];
718 float4 v3 = vertices[3];
719 vertices += 4;
720
721 float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
722 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
723 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
724 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
725
726 lo0 = lo0*vLo;
727 lo1 = lo1*vLo;
728 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
729 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
730 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
731 z = z*vHi;
732 x = x+y;
733 x = x+z;
734 stack_array[index] = x;
min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
736 }
737 }
738
739 #endif
740 }
741
742 // process the last few points
743 if( count & 3 )
744 {
745 float4 v0, v1, v2, x, y, z;
746 switch( count & 3 )
747 {
748 case 3:
749 {
750 v0 = vertices[0];
751 v1 = vertices[1];
752 v2 = vertices[2];
753
754 // Calculate 3 dot products, transpose, duplicate v2
755 float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
756 float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
757 lo0 = lo0*vLo;
758 z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
759 z = z*vHi;
760 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
761 lo1 = lo1*vLo;
762 x = _mm_shuffle_ps(lo0, lo1, 0x88);
763 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
764 }
765 break;
766 case 2:
767 {
768 v0 = vertices[0];
769 v1 = vertices[1];
770 float4 xy = _mm_movelh_ps(v0, v1);
771 z = _mm_movehl_ps(v1, v0);
772 xy = xy*vLo;
773 z = _mm_shuffle_ps( z, z, 0xa8);
774 x = _mm_shuffle_ps( xy, xy, 0xa8);
775 y = _mm_shuffle_ps( xy, xy, 0xfd);
776 z = z*vHi;
777 }
778 break;
779 case 1:
780 {
781 float4 xy = vertices[0];
782 z = _mm_shuffle_ps( xy, xy, 0xaa);
783 xy = xy*vLo;
784 z = z*vHi;
785 x = _mm_shuffle_ps(xy, xy, 0);
786 y = _mm_shuffle_ps(xy, xy, 0x55);
787 }
788 break;
789 }
790 x = x+y;
791 x = x+z;
792 stack_array[index] = x;
793 min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
794 index++;
795 }
796
797 // if we found a new min.
798 if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
799 { // we found a new min. Search for it
800 // find min across the min vector, place in all elements of min -- big latency hit here
801 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
802 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
803
804 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
805 // this where it actually makes a difference is handled in the early out at the top of the function,
806 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
807 // complexity, and removed it.
808
809 dotmin = min;
810
// scan for the first occurrence of min in the array
812 size_t test;
813 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
814 {}
815 minIndex = 4*index + segment + indexTable[test];
816 }
817
818 _mm_store_ss( dotResult, dotmin);
819 return minIndex;
820 }
821
822
823 #elif defined BT_USE_NEON
824
825 #define ARM_NEON_GCC_COMPATIBILITY 1
826 #include <arm_neon.h>
827 #include <sys/types.h>
828 #include <sys/sysctl.h> //for sysctlbyname
829
830 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
831 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
832 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
833 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
834 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
835 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
836
837 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
838 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
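// _maxdot_large / _mindot_large start out pointing at the *_sel stubs. On first use the stub
// probes the CPU once, rebinds the function pointer to the 64-bit-register (v0) or quad-register
// (v1) variant, and forwards the call, so subsequent calls dispatch directly.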
839
840
static inline uint32_t btGetCpuCapabilities( void )
842 {
843 static uint32_t capabilities = 0;
844 static bool testedCapabilities = false;
845
846 if( 0 == testedCapabilities)
847 {
848 uint32_t hasFeature = 0;
849 size_t featureSize = sizeof( hasFeature );
850 int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
851
852 if( 0 == err && hasFeature)
853 capabilities |= 0x2000;
854
855 testedCapabilities = true;
856 }
857
858 return capabilities;
859 }
860
861
862
863
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
865 {
866
867 if( btGetCpuCapabilities() & 0x2000 )
868 _maxdot_large = _maxdot_large_v1;
869 else
870 _maxdot_large = _maxdot_large_v0;
871
872 return _maxdot_large(vv, vec, count, dotResult);
873 }
874
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
876 {
877
878 if( btGetCpuCapabilities() & 0x2000 )
879 _mindot_large = _mindot_large_v1;
880 else
881 _mindot_large = _mindot_large_v0;
882
883 return _mindot_large(vv, vec, count, dotResult);
884 }
885
886
887
888 #if defined __arm__
889 # define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890 #else
// support 64-bit ARM: plain 16-byte vector load followed by a pointer advance
892 # define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893 #endif
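// vld1q_f32_aligned_postincrement loads one 16-byte-aligned float32x4_t and advances the pointer
// past it. On 32-bit ARM this maps to a single vld1.f32 with the :128 alignment hint and
// post-increment writeback; on 64-bit ARM it falls back to a plain load plus a 16-byte bump.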
894
895
long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
897 {
898 unsigned long i = 0;
899 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
900 float32x2_t vLo = vget_low_f32(vvec);
901 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
902 float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
903 float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
904 uint32x2_t indexLo = (uint32x2_t) {0, 1};
905 uint32x2_t indexHi = (uint32x2_t) {2, 3};
906 uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907 uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
908 const uint32x2_t four = (uint32x2_t) {4,4};
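// Strategy: keep two 2-lane running maxima, dotMaxLo for points 0,1 of each group of 4 and
// dotMaxHi for points 2,3, together with their indices in iLo/iHi. vcgt/vbsl update them
// branch-free, indexLo/indexHi advance by 4 per group, and the two halves (and then the
// even/odd lanes) are merged after the loops.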
909
910 for( ; i+8 <= count; i+= 8 )
911 {
912 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
913 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
914 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
915 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
916
917 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
918 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
919 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
920 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
921
922 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
923 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
924 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
925 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
926
927 float32x2_t rLo = vpadd_f32( xy0, xy1);
928 float32x2_t rHi = vpadd_f32( xy2, xy3);
929 rLo = vadd_f32(rLo, zLo);
930 rHi = vadd_f32(rHi, zHi);
931
932 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
933 uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
934 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
935 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
936 iLo = vbsl_u32(maskLo, indexLo, iLo);
937 iHi = vbsl_u32(maskHi, indexHi, iHi);
938 indexLo = vadd_u32(indexLo, four);
939 indexHi = vadd_u32(indexHi, four);
940
941 v0 = vld1q_f32_aligned_postincrement( vv );
942 v1 = vld1q_f32_aligned_postincrement( vv );
943 v2 = vld1q_f32_aligned_postincrement( vv );
944 v3 = vld1q_f32_aligned_postincrement( vv );
945
946 xy0 = vmul_f32( vget_low_f32(v0), vLo);
947 xy1 = vmul_f32( vget_low_f32(v1), vLo);
948 xy2 = vmul_f32( vget_low_f32(v2), vLo);
949 xy3 = vmul_f32( vget_low_f32(v3), vLo);
950
951 z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
952 z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
953 zLo = vmul_f32( z0.val[0], vHi);
954 zHi = vmul_f32( z1.val[0], vHi);
955
956 rLo = vpadd_f32( xy0, xy1);
957 rHi = vpadd_f32( xy2, xy3);
958 rLo = vadd_f32(rLo, zLo);
959 rHi = vadd_f32(rHi, zHi);
960
961 maskLo = vcgt_f32( rLo, dotMaxLo );
962 maskHi = vcgt_f32( rHi, dotMaxHi );
963 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
964 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
965 iLo = vbsl_u32(maskLo, indexLo, iLo);
966 iHi = vbsl_u32(maskHi, indexHi, iHi);
967 indexLo = vadd_u32(indexLo, four);
968 indexHi = vadd_u32(indexHi, four);
969 }
970
971 for( ; i+4 <= count; i+= 4 )
972 {
973 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
974 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
975 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
976 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
977
978 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
979 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
980 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
981 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
982
983 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
984 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
985 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
986 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
987
988 float32x2_t rLo = vpadd_f32( xy0, xy1);
989 float32x2_t rHi = vpadd_f32( xy2, xy3);
990 rLo = vadd_f32(rLo, zLo);
991 rHi = vadd_f32(rHi, zHi);
992
993 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
994 uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
995 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
996 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
997 iLo = vbsl_u32(maskLo, indexLo, iLo);
998 iHi = vbsl_u32(maskHi, indexHi, iHi);
999 indexLo = vadd_u32(indexLo, four);
1000 indexHi = vadd_u32(indexHi, four);
1001 }
1002
1003 switch( count & 3 )
1004 {
1005 case 3:
1006 {
1007 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1008 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1009 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1010
1011 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1012 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1013 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1014
1015 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1016 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1017 float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1018
1019 float32x2_t rLo = vpadd_f32( xy0, xy1);
1020 float32x2_t rHi = vpadd_f32( xy2, xy2);
1021 rLo = vadd_f32(rLo, zLo);
1022 rHi = vadd_f32(rHi, zHi);
1023
1024 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1025 uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
1026 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1027 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
1028 iLo = vbsl_u32(maskLo, indexLo, iLo);
1029 iHi = vbsl_u32(maskHi, indexHi, iHi);
1030 }
1031 break;
1032 case 2:
1033 {
1034 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1035 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1036
1037 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1038 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1039
1040 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1041 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1042
1043 float32x2_t rLo = vpadd_f32( xy0, xy1);
1044 rLo = vadd_f32(rLo, zLo);
1045
1046 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1047 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1048 iLo = vbsl_u32(maskLo, indexLo, iLo);
1049 }
1050 break;
1051 case 1:
1052 {
1053 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1054 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1055 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1056 float32x2_t zLo = vmul_f32( z0, vHi);
1057 float32x2_t rLo = vpadd_f32( xy0, xy0);
1058 rLo = vadd_f32(rLo, zLo);
1059 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1060 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1061 iLo = vbsl_u32(maskLo, indexLo, iLo);
1062 }
1063 break;
1064
1065 default:
1066 break;
1067 }
1068
1069 // select best answer between hi and lo results
1070 uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
1071 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1072 iLo = vbsl_u32(mask, iHi, iLo);
1073
1074 // select best answer between even and odd results
1075 dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1076 iHi = vdup_lane_u32(iLo, 1);
1077 mask = vcgt_f32( dotMaxHi, dotMaxLo );
1078 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1079 iLo = vbsl_u32(mask, iHi, iLo);
1080
1081 *dotResult = vget_lane_f32( dotMaxLo, 0);
1082 return vget_lane_u32(iLo, 0);
1083 }
1084
1085
long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1087 {
1088 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1089 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1090 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1091 const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1092 uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1093 uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1094 float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
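// v1 works on full quad registers: vLo = (x, y, x, y), vHi = (z, z, z, z); vuzpq de-interleaves
// four xy products into separate x and y vectors, so four dot products, a 4-lane running maximum
// and a 4-lane index vector are maintained per iteration.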
1095
1096 unsigned long i = 0;
1097 for( ; i + 8 <= count; i += 8 )
1098 {
1099 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1100 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1101 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1102 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1103
1104 // the next two lines should resolve to a single vswp d, d
1105 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1106 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1107 // the next two lines should resolve to a single vswp d, d
1108 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1109 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1110
1111 xy0 = vmulq_f32(xy0, vLo);
1112 xy1 = vmulq_f32(xy1, vLo);
1113
1114 float32x4x2_t zb = vuzpq_f32( z0, z1);
1115 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1116 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1117 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1118 x = vaddq_f32(x, z);
1119
1120 uint32x4_t mask = vcgtq_f32(x, maxDot);
1121 maxDot = vbslq_f32( mask, x, maxDot);
1122 index = vbslq_u32(mask, local_index, index);
1123 local_index = vaddq_u32(local_index, four);
1124
1125 v0 = vld1q_f32_aligned_postincrement( vv );
1126 v1 = vld1q_f32_aligned_postincrement( vv );
1127 v2 = vld1q_f32_aligned_postincrement( vv );
1128 v3 = vld1q_f32_aligned_postincrement( vv );
1129
1130 // the next two lines should resolve to a single vswp d, d
1131 xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1132 xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1133 // the next two lines should resolve to a single vswp d, d
1134 z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1135 z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1136
1137 xy0 = vmulq_f32(xy0, vLo);
1138 xy1 = vmulq_f32(xy1, vLo);
1139
1140 zb = vuzpq_f32( z0, z1);
1141 z = vmulq_f32( zb.val[0], vHi);
1142 xy = vuzpq_f32( xy0, xy1);
1143 x = vaddq_f32(xy.val[0], xy.val[1]);
1144 x = vaddq_f32(x, z);
1145
1146 mask = vcgtq_f32(x, maxDot);
1147 maxDot = vbslq_f32( mask, x, maxDot);
1148 index = vbslq_u32(mask, local_index, index);
1149 local_index = vaddq_u32(local_index, four);
1150 }
1151
1152 for( ; i + 4 <= count; i += 4 )
1153 {
1154 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1155 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1156 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1157 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1158
1159 // the next two lines should resolve to a single vswp d, d
1160 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1161 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1162 // the next two lines should resolve to a single vswp d, d
1163 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1164 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1165
1166 xy0 = vmulq_f32(xy0, vLo);
1167 xy1 = vmulq_f32(xy1, vLo);
1168
1169 float32x4x2_t zb = vuzpq_f32( z0, z1);
1170 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1171 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1172 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1173 x = vaddq_f32(x, z);
1174
1175 uint32x4_t mask = vcgtq_f32(x, maxDot);
1176 maxDot = vbslq_f32( mask, x, maxDot);
1177 index = vbslq_u32(mask, local_index, index);
1178 local_index = vaddq_u32(local_index, four);
1179 }
1180
1181 switch (count & 3) {
1182 case 3:
1183 {
1184 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1185 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1186 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1187
1188 // the next two lines should resolve to a single vswp d, d
1189 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1190 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1191 // the next two lines should resolve to a single vswp d, d
1192 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1193 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1194
1195 xy0 = vmulq_f32(xy0, vLo);
1196 xy1 = vmulq_f32(xy1, vLo);
1197
1198 float32x4x2_t zb = vuzpq_f32( z0, z1);
1199 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1200 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1201 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1202 x = vaddq_f32(x, z);
1203
1204 uint32x4_t mask = vcgtq_f32(x, maxDot);
1205 maxDot = vbslq_f32( mask, x, maxDot);
1206 index = vbslq_u32(mask, local_index, index);
1207 local_index = vaddq_u32(local_index, four);
1208 }
1209 break;
1210
1211 case 2:
1212 {
1213 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1214 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1215
1216 // the next two lines should resolve to a single vswp d, d
1217 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1218 // the next two lines should resolve to a single vswp d, d
1219 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1220
1221 xy0 = vmulq_f32(xy0, vLo);
1222
1223 float32x4x2_t zb = vuzpq_f32( z0, z0);
1224 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1225 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1226 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1227 x = vaddq_f32(x, z);
1228
1229 uint32x4_t mask = vcgtq_f32(x, maxDot);
1230 maxDot = vbslq_f32( mask, x, maxDot);
1231 index = vbslq_u32(mask, local_index, index);
1232 local_index = vaddq_u32(local_index, four);
1233 }
1234 break;
1235
1236 case 1:
1237 {
1238 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1239
1240 // the next two lines should resolve to a single vswp d, d
1241 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1242 // the next two lines should resolve to a single vswp d, d
1243 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1244
1245 xy0 = vmulq_f32(xy0, vLo);
1246
1247 z = vmulq_f32( z, vHi);
1248 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1249 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1250 x = vaddq_f32(x, z);
1251
1252 uint32x4_t mask = vcgtq_f32(x, maxDot);
1253 maxDot = vbslq_f32( mask, x, maxDot);
1254 index = vbslq_u32(mask, local_index, index);
1255 local_index = vaddq_u32(local_index, four);
1256 }
1257 break;
1258
1259 default:
1260 break;
1261 }
1262
1263
1264 // select best answer between hi and lo results
1265 uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
1266 float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1267 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1268
1269 // select best answer between even and odd results
1270 float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1271 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1272 mask = vcgt_f32( maxDotO, maxDot2 );
1273 maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1274 index2 = vbsl_u32(mask, indexHi, index2);
1275
1276 *dotResult = vget_lane_f32( maxDot2, 0);
1277 return vget_lane_u32(index2, 0);
1278
1279 }
1280
long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
1282 {
1283 unsigned long i = 0;
1284 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1285 float32x2_t vLo = vget_low_f32(vvec);
1286 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1287 float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1288 float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1289 uint32x2_t indexLo = (uint32x2_t) {0, 1};
1290 uint32x2_t indexHi = (uint32x2_t) {2, 3};
1291 uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1292 uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1293 const uint32x2_t four = (uint32x2_t) {4,4};
1294
1295 for( ; i+8 <= count; i+= 8 )
1296 {
1297 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1298 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1299 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1300 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1301
1302 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1303 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1304 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1305 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1306
1307 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1308 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1309 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1310 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1311
1312 float32x2_t rLo = vpadd_f32( xy0, xy1);
1313 float32x2_t rHi = vpadd_f32( xy2, xy3);
1314 rLo = vadd_f32(rLo, zLo);
1315 rHi = vadd_f32(rHi, zHi);
1316
1317 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1318 uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1319 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1320 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1321 iLo = vbsl_u32(maskLo, indexLo, iLo);
1322 iHi = vbsl_u32(maskHi, indexHi, iHi);
1323 indexLo = vadd_u32(indexLo, four);
1324 indexHi = vadd_u32(indexHi, four);
1325
1326 v0 = vld1q_f32_aligned_postincrement( vv );
1327 v1 = vld1q_f32_aligned_postincrement( vv );
1328 v2 = vld1q_f32_aligned_postincrement( vv );
1329 v3 = vld1q_f32_aligned_postincrement( vv );
1330
1331 xy0 = vmul_f32( vget_low_f32(v0), vLo);
1332 xy1 = vmul_f32( vget_low_f32(v1), vLo);
1333 xy2 = vmul_f32( vget_low_f32(v2), vLo);
1334 xy3 = vmul_f32( vget_low_f32(v3), vLo);
1335
1336 z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1337 z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1338 zLo = vmul_f32( z0.val[0], vHi);
1339 zHi = vmul_f32( z1.val[0], vHi);
1340
1341 rLo = vpadd_f32( xy0, xy1);
1342 rHi = vpadd_f32( xy2, xy3);
1343 rLo = vadd_f32(rLo, zLo);
1344 rHi = vadd_f32(rHi, zHi);
1345
1346 maskLo = vclt_f32( rLo, dotMinLo );
1347 maskHi = vclt_f32( rHi, dotMinHi );
1348 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1349 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1350 iLo = vbsl_u32(maskLo, indexLo, iLo);
1351 iHi = vbsl_u32(maskHi, indexHi, iHi);
1352 indexLo = vadd_u32(indexLo, four);
1353 indexHi = vadd_u32(indexHi, four);
1354 }
1355
1356 for( ; i+4 <= count; i+= 4 )
1357 {
1358 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1359 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1360 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1361 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1362
1363 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1364 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1365 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1366 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1367
1368 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1369 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1370 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1371 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1372
1373 float32x2_t rLo = vpadd_f32( xy0, xy1);
1374 float32x2_t rHi = vpadd_f32( xy2, xy3);
1375 rLo = vadd_f32(rLo, zLo);
1376 rHi = vadd_f32(rHi, zHi);
1377
1378 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1379 uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1380 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1381 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1382 iLo = vbsl_u32(maskLo, indexLo, iLo);
1383 iHi = vbsl_u32(maskHi, indexHi, iHi);
1384 indexLo = vadd_u32(indexLo, four);
1385 indexHi = vadd_u32(indexHi, four);
1386 }
1387 switch( count & 3 )
1388 {
1389 case 3:
1390 {
1391 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1392 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1393 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1394
1395 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1396 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1397 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1398
1399 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1400 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1401 float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1402
1403 float32x2_t rLo = vpadd_f32( xy0, xy1);
1404 float32x2_t rHi = vpadd_f32( xy2, xy2);
1405 rLo = vadd_f32(rLo, zLo);
1406 rHi = vadd_f32(rHi, zHi);
1407
1408 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1409 uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1410 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1411 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1412 iLo = vbsl_u32(maskLo, indexLo, iLo);
1413 iHi = vbsl_u32(maskHi, indexHi, iHi);
1414 }
1415 break;
1416 case 2:
1417 {
1418 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1419 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1420
1421 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1422 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1423
1424 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1425 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1426
1427 float32x2_t rLo = vpadd_f32( xy0, xy1);
1428 rLo = vadd_f32(rLo, zLo);
1429
1430 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1431 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1432 iLo = vbsl_u32(maskLo, indexLo, iLo);
1433 }
1434 break;
1435 case 1:
1436 {
1437 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1438 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1439 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1440 float32x2_t zLo = vmul_f32( z0, vHi);
1441 float32x2_t rLo = vpadd_f32( xy0, xy0);
1442 rLo = vadd_f32(rLo, zLo);
1443 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1444 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1445 iLo = vbsl_u32(maskLo, indexLo, iLo);
1446 }
1447 break;
1448
1449 default:
1450 break;
1451 }
1452
1453 // select best answer between hi and lo results
1454 uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
1455 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1456 iLo = vbsl_u32(mask, iHi, iLo);
1457
1458 // select best answer between even and odd results
1459 dotMinHi = vdup_lane_f32(dotMinLo, 1);
1460 iHi = vdup_lane_u32(iLo, 1);
1461 mask = vclt_f32( dotMinHi, dotMinLo );
1462 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1463 iLo = vbsl_u32(mask, iHi, iLo);
1464
1465 *dotResult = vget_lane_f32( dotMinLo, 0);
1466 return vget_lane_u32(iLo, 0);
1467 }
1468
long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1470 {
1471 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1472 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1473 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1474 const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1475 uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1476 uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1477 float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
1478
1479 unsigned long i = 0;
1480 for( ; i + 8 <= count; i += 8 )
1481 {
1482 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1483 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1484 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1485 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1486
1487 // the next two lines should resolve to a single vswp d, d
1488 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1489 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1490 // the next two lines should resolve to a single vswp d, d
1491 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1492 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1493
1494 xy0 = vmulq_f32(xy0, vLo);
1495 xy1 = vmulq_f32(xy1, vLo);
1496
1497 float32x4x2_t zb = vuzpq_f32( z0, z1);
1498 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1499 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1500 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1501 x = vaddq_f32(x, z);
1502
1503 uint32x4_t mask = vcltq_f32(x, minDot);
1504 minDot = vbslq_f32( mask, x, minDot);
1505 index = vbslq_u32(mask, local_index, index);
1506 local_index = vaddq_u32(local_index, four);
1507
1508 v0 = vld1q_f32_aligned_postincrement( vv );
1509 v1 = vld1q_f32_aligned_postincrement( vv );
1510 v2 = vld1q_f32_aligned_postincrement( vv );
1511 v3 = vld1q_f32_aligned_postincrement( vv );
1512
1513 // the next two lines should resolve to a single vswp d, d
1514 xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1515 xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1516 // the next two lines should resolve to a single vswp d, d
1517 z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1518 z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1519
1520 xy0 = vmulq_f32(xy0, vLo);
1521 xy1 = vmulq_f32(xy1, vLo);
1522
1523 zb = vuzpq_f32( z0, z1);
1524 z = vmulq_f32( zb.val[0], vHi);
1525 xy = vuzpq_f32( xy0, xy1);
1526 x = vaddq_f32(xy.val[0], xy.val[1]);
1527 x = vaddq_f32(x, z);
1528
1529 mask = vcltq_f32(x, minDot);
1530 minDot = vbslq_f32( mask, x, minDot);
1531 index = vbslq_u32(mask, local_index, index);
1532 local_index = vaddq_u32(local_index, four);
1533 }
1534
1535 for( ; i + 4 <= count; i += 4 )
1536 {
1537 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1538 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1539 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1540 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1541
1542 // the next two lines should resolve to a single vswp d, d
1543 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1544 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1545 // the next two lines should resolve to a single vswp d, d
1546 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1547 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1548
1549 xy0 = vmulq_f32(xy0, vLo);
1550 xy1 = vmulq_f32(xy1, vLo);
1551
1552 float32x4x2_t zb = vuzpq_f32( z0, z1);
1553 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1554 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1555 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1556 x = vaddq_f32(x, z);
1557
1558 uint32x4_t mask = vcltq_f32(x, minDot);
1559 minDot = vbslq_f32( mask, x, minDot);
1560 index = vbslq_u32(mask, local_index, index);
1561 local_index = vaddq_u32(local_index, four);
1562 }
1563
1564 switch (count & 3) {
1565 case 3:
1566 {
1567 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1568 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1569 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1570
1571 // the next two lines should resolve to a single vswp d, d
1572 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1573 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1574 // the next two lines should resolve to a single vswp d, d
1575 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1576 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1577
1578 xy0 = vmulq_f32(xy0, vLo);
1579 xy1 = vmulq_f32(xy1, vLo);
1580
1581 float32x4x2_t zb = vuzpq_f32( z0, z1);
1582 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1583 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1584 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1585 x = vaddq_f32(x, z);
1586
1587 uint32x4_t mask = vcltq_f32(x, minDot);
1588 minDot = vbslq_f32( mask, x, minDot);
1589 index = vbslq_u32(mask, local_index, index);
1590 local_index = vaddq_u32(local_index, four);
1591 }
1592 break;
1593
1594 case 2:
1595 {
1596 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1597 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1598
1599 // the next two lines should resolve to a single vswp d, d
1600 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1601 // the next two lines should resolve to a single vswp d, d
1602 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1603
1604 xy0 = vmulq_f32(xy0, vLo);
1605
1606 float32x4x2_t zb = vuzpq_f32( z0, z0);
1607 float32x4_t z = vmulq_f32( zb.val[0], vHi);
1608 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1609 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1610 x = vaddq_f32(x, z);
1611
1612 uint32x4_t mask = vcltq_f32(x, minDot);
1613 minDot = vbslq_f32( mask, x, minDot);
1614 index = vbslq_u32(mask, local_index, index);
1615 local_index = vaddq_u32(local_index, four);
1616 }
1617 break;
1618
1619 case 1:
1620 {
1621 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1622
1623 // the next two lines should resolve to a single vswp d, d
1624 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1625 // the next two lines should resolve to a single vswp d, d
1626 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1627
1628 xy0 = vmulq_f32(xy0, vLo);
1629
1630 z = vmulq_f32( z, vHi);
1631 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1632 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1633 x = vaddq_f32(x, z);
1634
1635 uint32x4_t mask = vcltq_f32(x, minDot);
1636 minDot = vbslq_f32( mask, x, minDot);
1637 index = vbslq_u32(mask, local_index, index);
1638 local_index = vaddq_u32(local_index, four);
1639 }
1640 break;
1641
1642 default:
1643 break;
1644 }
1645
1646
1647 // select best answer between hi and lo results
1648 uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
1649 float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1650 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1651
1652 // select best answer between even and odd results
1653 float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1654 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1655 mask = vclt_f32( minDotO, minDot2 );
1656 minDot2 = vbsl_f32(mask, minDotO, minDot2);
1657 index2 = vbsl_u32(mask, indexHi, index2);
1658
1659 *dotResult = vget_lane_f32( minDot2, 0);
1660 return vget_lane_u32(index2, 0);
1661
1662 }
1663
1664 #else
1665 #error Unhandled __APPLE__ arch
1666 #endif
1667
#endif /* BT_USE_SIMD_VECTOR3 */
1669
1670
1671