1 /*
2  Copyright (c) 2011 Apple Inc.
3  http://continuousphysics.com/Bullet/
4 
5  This software is provided 'as-is', without any express or implied warranty.
6  In no event will the authors be held liable for any damages arising from the use of this software.
7  Permission is granted to anyone to use this software for any purpose,
8  including commercial applications, and to alter it and redistribute it freely,
9  subject to the following restrictions:
10 
11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13  3. This notice may not be removed or altered from any source distribution.
14 
15  This source version has been altered.
16  */
17 
18 #if defined (_WIN32) || defined (__i386__)
19 #define BT_USE_SSE_IN_API
20 #endif
21 
22 
23 #include "btVector3.h"
24 
25 
26 
27 #if defined BT_USE_SIMD_VECTOR3
28 
29 #if DEBUG
30 #include <string.h>//for memset
31 #endif
32 
33 
34 #ifdef __APPLE__
35 #include <stdint.h>
36 typedef  float float4 __attribute__ ((vector_size(16)));
37 #else
38 #define float4 __m128
39 #endif
40 //typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
41 
42 
43 #if defined BT_USE_SSE || defined _WIN32
44 
45 #define LOG2_ARRAY_SIZE     6
46 #define STACK_ARRAY_COUNT   (1UL << LOG2_ARRAY_SIZE)
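// Dot products are evaluated in tiles of STACK_ARRAY_COUNT float4 results
// (4 * STACK_ARRAY_COUNT = 256 vertices per tile). Each tile's results are kept in a
// small on-stack array so that, when the running max/min changes, the index of the
// extreme value can be recovered by rescanning only that tile instead of tracking an
// index per lane.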
47 
48 #include <emmintrin.h>
49 
50 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
51 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
52 {
53     const float4 *vertices = (const float4*) vv;
54     static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
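    // indexTable maps the 4-bit result of _mm_movemask_ps to the index of the lowest set
    // lane (the first lane that matched the extreme value). Entry 0 is never consulted,
    // because the table is only read after a compare that is known to have at least one hit.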
55     float4 dotMax = btAssign128( -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY );
56     float4 vvec = _mm_loadu_ps( vec );
57     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
58     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
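    // vHi broadcasts vec.z into all four lanes and vLo holds (x, y, x, y), so one multiply
    // covers the x and y terms of two vertices at once. Combined with the movelh/movehl and
    // shuffle sequence below, each step produces four dot products without ever reading or
    // writing the w component.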
59 
60     long maxIndex = -1L;
61 
62     size_t segment = 0;
63     float4 stack_array[ STACK_ARRAY_COUNT ];
64 
65 #if DEBUG
66     //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
67 #endif
68 
69     size_t index;
70     float4 max;
71     // Faster loop without cleanup code for full tiles
72     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
73     {
74         max = dotMax;
75 
76         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
77         { // do four dot products at a time. Carefully avoid touching the w element.
78             float4 v0 = vertices[0];
79             float4 v1 = vertices[1];
80             float4 v2 = vertices[2];
81             float4 v3 = vertices[3];            vertices += 4;
82 
83             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
84             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
85             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
86             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
87 
88             lo0 = lo0*vLo;
89             lo1 = lo1*vLo;
90             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
91             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
92             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
93             z = z*vHi;
94             x = x+y;
95             x = x+z;
96             stack_array[index] = x;
97             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
98 
99             v0 = vertices[0];
100             v1 = vertices[1];
101             v2 = vertices[2];
102             v3 = vertices[3];            vertices += 4;
103 
104             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
105             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
106             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
107             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
108 
109             lo0 = lo0*vLo;
110             lo1 = lo1*vLo;
111             z = _mm_shuffle_ps(hi0, hi1, 0x88);
112             x = _mm_shuffle_ps(lo0, lo1, 0x88);
113             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
114             z = z*vHi;
115             x = x+y;
116             x = x+z;
117             stack_array[index+1] = x;
118             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
119 
120             v0 = vertices[0];
121             v1 = vertices[1];
122             v2 = vertices[2];
123             v3 = vertices[3];            vertices += 4;
124 
125             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
126             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
127             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
128             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
129 
130             lo0 = lo0*vLo;
131             lo1 = lo1*vLo;
132             z = _mm_shuffle_ps(hi0, hi1, 0x88);
133             x = _mm_shuffle_ps(lo0, lo1, 0x88);
134             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
135             z = z*vHi;
136             x = x+y;
137             x = x+z;
138             stack_array[index+2] = x;
139             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
140 
141             v0 = vertices[0];
142             v1 = vertices[1];
143             v2 = vertices[2];
144             v3 = vertices[3];            vertices += 4;
145 
146             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
147             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
148             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
149             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
150 
151             lo0 = lo0*vLo;
152             lo1 = lo1*vLo;
153             z = _mm_shuffle_ps(hi0, hi1, 0x88);
154             x = _mm_shuffle_ps(lo0, lo1, 0x88);
155             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
156             z = z*vHi;
157             x = x+y;
158             x = x+z;
159             stack_array[index+3] = x;
160             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
161 
162             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
163         }
164 
165         // If we found a new max
166         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
167         {
168             // copy the new max across all lanes of our max accumulator
169             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
170             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
171 
172             dotMax = max;
173 
174             // find first occurrence of that max
175             size_t test;
176             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
177             {}
178             // record where it is.
179             maxIndex = 4*index + segment + indexTable[test];
180         }
181     }
182 
183     // account for work we've already done
184     count -= segment;
185 
186     // Deal with the last < STACK_ARRAY_COUNT vectors
187     max = dotMax;
188     index = 0;
189 
190 
191     if( btUnlikely( count > 16) )
192     {
193         for( ; index + 4 <= count / 4; index+=4 )
194         { // do four dot products at a time. Carefully avoid touching the w element.
195             float4 v0 = vertices[0];
196             float4 v1 = vertices[1];
197             float4 v2 = vertices[2];
198             float4 v3 = vertices[3];            vertices += 4;
199 
200             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
201             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
202             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
203             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
204 
205             lo0 = lo0*vLo;
206             lo1 = lo1*vLo;
207             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210             z = z*vHi;
211             x = x+y;
212             x = x+z;
213             stack_array[index] = x;
214             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
215 
216             v0 = vertices[0];
217             v1 = vertices[1];
218             v2 = vertices[2];
219             v3 = vertices[3];            vertices += 4;
220 
221             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
222             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
223             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
224             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
225 
226             lo0 = lo0*vLo;
227             lo1 = lo1*vLo;
228             z = _mm_shuffle_ps(hi0, hi1, 0x88);
229             x = _mm_shuffle_ps(lo0, lo1, 0x88);
230             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
231             z = z*vHi;
232             x = x+y;
233             x = x+z;
234             stack_array[index+1] = x;
235             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
236 
237             v0 = vertices[0];
238             v1 = vertices[1];
239             v2 = vertices[2];
240             v3 = vertices[3];            vertices += 4;
241 
242             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
243             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
244             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
245             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
246 
247             lo0 = lo0*vLo;
248             lo1 = lo1*vLo;
249             z = _mm_shuffle_ps(hi0, hi1, 0x88);
250             x = _mm_shuffle_ps(lo0, lo1, 0x88);
251             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
252             z = z*vHi;
253             x = x+y;
254             x = x+z;
255             stack_array[index+2] = x;
256             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
257 
258             v0 = vertices[0];
259             v1 = vertices[1];
260             v2 = vertices[2];
261             v3 = vertices[3];            vertices += 4;
262 
263             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
264             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
265             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
266             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
267 
268             lo0 = lo0*vLo;
269             lo1 = lo1*vLo;
270             z = _mm_shuffle_ps(hi0, hi1, 0x88);
271             x = _mm_shuffle_ps(lo0, lo1, 0x88);
272             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
273             z = z*vHi;
274             x = x+y;
275             x = x+z;
276             stack_array[index+3] = x;
277             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
278 
279             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
280         }
281     }
282 
283     size_t localCount = (count & -4L) - 4*index;
284     if( localCount )
285     {
286 #ifdef __APPLE__
287         float4 t0, t1, t2, t3, t4;
288         float4 * sap = &stack_array[index + localCount / 4];
289           vertices += localCount;      // counter the offset
290          size_t byteIndex = -(localCount) * sizeof(float);
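        // The vertex pointer is pre-advanced past the block and byteIndex starts negative,
        // counting up toward zero in 16-byte steps. The same register serves as the vertex
        // load offset (scaled by 4), the store offset into stack_array, and the loop counter,
        // so the assembly loop below terminates on a single add/jnz pair.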
291         //AT&T Code style assembly
292         asm volatile
293         (   ".align 4                                                                   \n\
294              0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
295           movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
296           movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
297           movaps  %[t0], %[max]                               // vertices[0]      \n\
298           movlhps %[t1], %[max]                               // x0y0x1y1         \n\
299          movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
300          movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
301           mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
302          movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
303          movaps  %[t3], %[t0]                                // vertices[2]      \n\
304          movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
305          mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
306           movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
307           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
308           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
309          movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
310          shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
311          shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
312          addps   %[t3], %[max]                               // x + y            \n\
313          addps   %[t1], %[max]                               // x + y + z        \n\
314          movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
315          maxps   %[t2], %[max]                               // record max, restore max   \n\
316          add     $16, %[byteIndex]                           // advance loop counter\n\
317          jnz     0b                                          \n\
318      "
319          : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
320          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
321          : "memory", "cc"
322          );
323         index += localCount/4;
324 #else
325         {
326             for( unsigned int i=0; i<localCount/4; i++,index++)
327             { // do four dot products at a time. Carefully avoid touching the w element.
328                 float4 v0 = vertices[0];
329                 float4 v1 = vertices[1];
330                 float4 v2 = vertices[2];
331                 float4 v3 = vertices[3];
332                 vertices += 4;
333 
334                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
335                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
336                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
337                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
338 
339                 lo0 = lo0*vLo;
340                 lo1 = lo1*vLo;
341                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
342                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
343                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
344                 z = z*vHi;
345                 x = x+y;
346                 x = x+z;
347                 stack_array[index] = x;
348                 max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
349             }
350         }
351 #endif //__APPLE__
352     }
353 
354     // process the last few points
355     if( count & 3 )
356     {
357         float4 v0, v1, v2, x, y, z;
358         switch( count & 3 )
359         {
360             case 3:
361             {
362                 v0 = vertices[0];
363                 v1 = vertices[1];
364                 v2 = vertices[2];
365 
366                 // Calculate 3 dot products, transpose, duplicate v2
367                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
368                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
369                 lo0 = lo0*vLo;
370                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
371                 z = z*vHi;
372                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
373                 lo1 = lo1*vLo;
374                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
375                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
376             }
377                 break;
378             case 2:
379             {
380                 v0 = vertices[0];
381                 v1 = vertices[1];
382                 float4 xy = _mm_movelh_ps(v0, v1);
383                 z = _mm_movehl_ps(v1, v0);
384                 xy = xy*vLo;
385                 z = _mm_shuffle_ps( z, z,  0xa8);
386                 x = _mm_shuffle_ps( xy, xy, 0xa8);
387                 y = _mm_shuffle_ps( xy, xy, 0xfd);
388                 z = z*vHi;
389             }
390                 break;
391             case 1:
392             {
393                 float4 xy = vertices[0];
394                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
395                 xy = xy*vLo;
396                 z = z*vHi;
397                 x = _mm_shuffle_ps(xy, xy, 0);
398                 y = _mm_shuffle_ps(xy, xy, 0x55);
399             }
400                 break;
401         }
402         x = x+y;
403         x = x+z;
404         stack_array[index] = x;
405         max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
406         index++;
407     }
408 
409     // if we found a new max.
410     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
411     { // we found a new max. Search for it
412       // find max across the max vector, place in all elements of max -- big latency hit here
413         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
414         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
415 
416         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
417         // this where it actually makes a difference is handled in the early out at the top of the function,
418         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
419         // complexity, and removed it.
420 
421         dotMax = max;
422 
423         // scan for the first occurrence of max in the array
424         size_t test;
425         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
426         {}
427         maxIndex = 4*index + segment + indexTable[test];
428     }
429 
430     _mm_store_ss( dotResult, dotMax);
431     return maxIndex;
432 }
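/*
    Illustrative usage sketch (hypothetical caller, not part of this file): find the
    vertex of a point cloud that lies furthest along a direction. 'verts' is assumed to
    point to 'n' four-float, 16-byte-aligned vertices and 'dir' to four readable floats;
    only the x, y and z components are used, and 'n' is assumed to be non-zero.

        float bestDot;
        long  best = _maxdot_large( (const float *)verts, (const float *)&dir, n, &bestDot );
        // 'best' is the index of the winning vertex, 'bestDot' its dot product with 'dir'.
*/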
433 
434 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
435 
436 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
437 {
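    // Mirror image of _maxdot_large above: identical data movement, with _mm_min_ps / minps
    // in place of _mm_max_ps / maxps and the accumulator seeded with +BT_INFINITY.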
438     const float4 *vertices = (const float4*) vv;
439     static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
440     float4 dotmin = btAssign128( BT_INFINITY,  BT_INFINITY,  BT_INFINITY,  BT_INFINITY );
441     float4 vvec = _mm_loadu_ps( vec );
442     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
443     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
444 
445     long minIndex = -1L;
446 
447     size_t segment = 0;
448     float4 stack_array[ STACK_ARRAY_COUNT ];
449 
450 #if DEBUG
451     //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
452 #endif
453 
454     size_t index;
455     float4 min;
456     // Faster loop without cleanup code for full tiles
457     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
458     {
459         min = dotmin;
460 
461         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
462         { // do four dot products at a time. Carefully avoid touching the w element.
463             float4 v0 = vertices[0];
464             float4 v1 = vertices[1];
465             float4 v2 = vertices[2];
466             float4 v3 = vertices[3];            vertices += 4;
467 
468             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
469             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
470             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
471             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
472 
473             lo0 = lo0*vLo;
474             lo1 = lo1*vLo;
475             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
476             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
477             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
478             z = z*vHi;
479             x = x+y;
480             x = x+z;
481             stack_array[index] = x;
482             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
483 
484             v0 = vertices[0];
485             v1 = vertices[1];
486             v2 = vertices[2];
487             v3 = vertices[3];            vertices += 4;
488 
489             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
490             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
491             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
492             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
493 
494             lo0 = lo0*vLo;
495             lo1 = lo1*vLo;
496             z = _mm_shuffle_ps(hi0, hi1, 0x88);
497             x = _mm_shuffle_ps(lo0, lo1, 0x88);
498             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
499             z = z*vHi;
500             x = x+y;
501             x = x+z;
502             stack_array[index+1] = x;
503             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
504 
505             v0 = vertices[0];
506             v1 = vertices[1];
507             v2 = vertices[2];
508             v3 = vertices[3];            vertices += 4;
509 
510             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
511             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
512             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
513             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
514 
515             lo0 = lo0*vLo;
516             lo1 = lo1*vLo;
517             z = _mm_shuffle_ps(hi0, hi1, 0x88);
518             x = _mm_shuffle_ps(lo0, lo1, 0x88);
519             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
520             z = z*vHi;
521             x = x+y;
522             x = x+z;
523             stack_array[index+2] = x;
524             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
525 
526             v0 = vertices[0];
527             v1 = vertices[1];
528             v2 = vertices[2];
529             v3 = vertices[3];            vertices += 4;
530 
531             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
532             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
533             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
534             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
535 
536             lo0 = lo0*vLo;
537             lo1 = lo1*vLo;
538             z = _mm_shuffle_ps(hi0, hi1, 0x88);
539             x = _mm_shuffle_ps(lo0, lo1, 0x88);
540             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
541             z = z*vHi;
542             x = x+y;
543             x = x+z;
544             stack_array[index+3] = x;
545             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
546 
547             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
548         }
549 
550         // If we found a new min
551         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
552         {
553             // copy the new min across all lanes of our min accumulator
554             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
555             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
556 
557             dotmin = min;
558 
559             // find first occurrence of that min
560             size_t test;
561             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
562             {}
563             // record where it is.
564             minIndex = 4*index + segment + indexTable[test];
565         }
566     }
567 
568     // account for work we've already done
569     count -= segment;
570 
571     // Deal with the last < STACK_ARRAY_COUNT vectors
572     min = dotmin;
573     index = 0;
574 
575 
576     if(btUnlikely( count > 16) )
577     {
578         for( ; index + 4 <= count / 4; index+=4 )
579         { // do four dot products at a time. Carefully avoid touching the w element.
580             float4 v0 = vertices[0];
581             float4 v1 = vertices[1];
582             float4 v2 = vertices[2];
583             float4 v3 = vertices[3];            vertices += 4;
584 
585             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
586             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
587             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
588             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
589 
590             lo0 = lo0*vLo;
591             lo1 = lo1*vLo;
592             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
593             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
594             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
595             z = z*vHi;
596             x = x+y;
597             x = x+z;
598             stack_array[index] = x;
599             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
600 
601             v0 = vertices[0];
602             v1 = vertices[1];
603             v2 = vertices[2];
604             v3 = vertices[3];            vertices += 4;
605 
606             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
607             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
608             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
609             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
610 
611             lo0 = lo0*vLo;
612             lo1 = lo1*vLo;
613             z = _mm_shuffle_ps(hi0, hi1, 0x88);
614             x = _mm_shuffle_ps(lo0, lo1, 0x88);
615             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
616             z = z*vHi;
617             x = x+y;
618             x = x+z;
619             stack_array[index+1] = x;
620             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
621 
622             v0 = vertices[0];
623             v1 = vertices[1];
624             v2 = vertices[2];
625             v3 = vertices[3];            vertices += 4;
626 
627             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
628             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
629             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
630             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
631 
632             lo0 = lo0*vLo;
633             lo1 = lo1*vLo;
634             z = _mm_shuffle_ps(hi0, hi1, 0x88);
635             x = _mm_shuffle_ps(lo0, lo1, 0x88);
636             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
637             z = z*vHi;
638             x = x+y;
639             x = x+z;
640             stack_array[index+2] = x;
641             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
642 
643             v0 = vertices[0];
644             v1 = vertices[1];
645             v2 = vertices[2];
646             v3 = vertices[3];            vertices += 4;
647 
648             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
649             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
650             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
651             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
652 
653             lo0 = lo0*vLo;
654             lo1 = lo1*vLo;
655             z = _mm_shuffle_ps(hi0, hi1, 0x88);
656             x = _mm_shuffle_ps(lo0, lo1, 0x88);
657             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
658             z = z*vHi;
659             x = x+y;
660             x = x+z;
661             stack_array[index+3] = x;
662             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
663 
664             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
665         }
666     }
667 
668     size_t localCount = (count & -4L) - 4*index;
669     if( localCount )
670     {
671 
672 
673 #ifdef __APPLE__
674         vertices += localCount;      // counter the offset
675         float4 t0, t1, t2, t3, t4;
676         size_t byteIndex = -(localCount) * sizeof(float);
677         float4 * sap = &stack_array[index + localCount / 4];
678 
679         asm volatile
680         (   ".align 4                                                                   \n\
681              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
682              movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
683              movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
684              movaps  %[t0], %[min]                               // vertices[0]      \n\
685              movlhps %[t1], %[min]                               // x0y0x1y1         \n\
686              movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
687              movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
688              mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
689              movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
690              movaps  %[t3], %[t0]                                // vertices[2]      \n\
691              movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
692              movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
693              mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
694              shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
695              mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
696              movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
697              shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
698              shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
699              addps   %[t3], %[min]                               // x + y            \n\
700              addps   %[t1], %[min]                               // x + y + z        \n\
701              movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
702              minps   %[t2], %[min]                               // record min, restore min   \n\
703              add     $16, %[byteIndex]                           // advance loop counter\n\
704              jnz     0b                                          \n\
705              "
706          : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
707          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
708          : "memory", "cc"
709          );
710         index += localCount/4;
711 #else
712         {
713             for( unsigned int i=0; i<localCount/4; i++,index++)
714             { // do four dot products at a time. Carefully avoid touching the w element.
715                 float4 v0 = vertices[0];
716                 float4 v1 = vertices[1];
717                 float4 v2 = vertices[2];
718                 float4 v3 = vertices[3];
719                 vertices += 4;
720 
721                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
722                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
723                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
724                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
725 
726                 lo0 = lo0*vLo;
727                 lo1 = lo1*vLo;
728                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
729                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
730                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
731                 z = z*vHi;
732                 x = x+y;
733                 x = x+z;
734                 stack_array[index] = x;
735                 min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
736             }
737         }
738 
739 #endif
740     }
741 
742     // process the last few points
743     if( count & 3 )
744     {
745         float4 v0, v1, v2, x, y, z;
746         switch( count & 3 )
747         {
748             case 3:
749             {
750                 v0 = vertices[0];
751                 v1 = vertices[1];
752                 v2 = vertices[2];
753 
754                 // Calculate 3 dot products, transpose, duplicate v2
755                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
756                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
757                 lo0 = lo0*vLo;
758                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
759                 z = z*vHi;
760                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
761                 lo1 = lo1*vLo;
762                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
763                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
764             }
765                 break;
766             case 2:
767             {
768                 v0 = vertices[0];
769                 v1 = vertices[1];
770                 float4 xy = _mm_movelh_ps(v0, v1);
771                 z = _mm_movehl_ps(v1, v0);
772                 xy = xy*vLo;
773                 z = _mm_shuffle_ps( z, z,  0xa8);
774                 x = _mm_shuffle_ps( xy, xy, 0xa8);
775                 y = _mm_shuffle_ps( xy, xy, 0xfd);
776                 z = z*vHi;
777             }
778                 break;
779             case 1:
780             {
781                 float4 xy = vertices[0];
782                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
783                 xy = xy*vLo;
784                 z = z*vHi;
785                 x = _mm_shuffle_ps(xy, xy, 0);
786                 y = _mm_shuffle_ps(xy, xy, 0x55);
787             }
788                 break;
789         }
790         x = x+y;
791         x = x+z;
792         stack_array[index] = x;
793         min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
794         index++;
795     }
796 
797     // if we found a new min.
798     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
799     { // we found a new min. Search for it
800       // find min across the min vector, place in all elements of min -- big latency hit here
801         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
802         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
803 
804         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
805         // this where it actually makes a difference is handled in the early out at the top of the function,
806         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
807         // complexity, and removed it.
808 
809         dotmin = min;
810 
811         // scan for the first occurrence of min in the array
812         size_t test;
813         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
814         {}
815         minIndex = 4*index + segment + indexTable[test];
816     }
817 
818     _mm_store_ss( dotResult, dotmin);
819     return minIndex;
820 }
821 
822 
823 #elif defined BT_USE_NEON
824 
825 #define ARM_NEON_GCC_COMPATIBILITY  1
826 #include <arm_neon.h>
827 #include <sys/types.h>
828 #include <sys/sysctl.h> //for sysctlbyname
829 
830 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
831 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
832 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
833 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
834 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
835 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
836 
837 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
838 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
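// On NEON builds, _maxdot_large and _mindot_large are function pointers rather than plain
// functions. They start out pointing at the *_sel stubs below, which probe the CPU once
// (via sysctlbyname) and rebind the pointer to either the *_v0 or the *_v1 variant, so
// every later call dispatches directly to the chosen implementation.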
839 
840 
841 static inline uint32_t btGetCpuCapabilities( void )
842 {
843     static uint32_t capabilities = 0;
844     static bool testedCapabilities = false;
845 
846     if( 0 == testedCapabilities)
847     {
848         uint32_t hasFeature = 0;
849         size_t featureSize = sizeof( hasFeature );
850         int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
851 
852         if( 0 == err && hasFeature)
853             capabilities |= 0x2000;
854 
855         testedCapabilities = true;
856     }
857 
858     return capabilities;
859 }
860 
861 
862 
863 
864 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
865 {
866 
867     if( btGetCpuCapabilities() & 0x2000 )
868         _maxdot_large = _maxdot_large_v1;
869     else
870         _maxdot_large = _maxdot_large_v0;
871 
872     return _maxdot_large(vv, vec, count, dotResult);
873 }
874 
875 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
876 {
877 
878     if( btGetCpuCapabilities() & 0x2000 )
879         _mindot_large = _mindot_large_v1;
880     else
881         _mindot_large = _mindot_large_v0;
882 
883     return _mindot_large(vv, vec, count, dotResult);
884 }
885 
886 
887 
888 #if defined __arm__
889 # define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890 #else
891 //support 64bit arm
892 # define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893 #endif
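// vld1q_f32_aligned_postincrement loads one 16-byte-aligned float32x4_t and advances the
// pointer past it: on 32-bit ARM it maps to a single "vld1.f32 ... :128" with post-increment,
// while on 64-bit ARM it falls back to a plain aligned load followed by a 16-byte pointer bump.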
894 
895 
896 long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
897 {
898     unsigned long i = 0;
899     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
900     float32x2_t vLo = vget_low_f32(vvec);
901     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
902     float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
903     float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
904     uint32x2_t indexLo = (uint32x2_t) {0, 1};
905     uint32x2_t indexHi = (uint32x2_t) {2, 3};
906     uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907     uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
908     const uint32x2_t four = (uint32x2_t) {4,4};
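    // Strategy for this variant: the running maximum and its candidate indices are kept in
    // two 2-lane halves. dotMaxLo/iLo track vertices 0 and 1 of each group of four, while
    // dotMaxHi/iHi track vertices 2 and 3. Each pass compares with vcgt_f32 and selects with
    // vbsl; the two halves, and then the two lanes, are reduced to a single winner only at
    // the very end.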
909 
910     for( ; i+8 <= count; i+= 8 )
911     {
912         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
913         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
914         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
915         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
916 
917         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
918         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
919         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
920         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
921 
922         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
923         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
924         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
925         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
926 
927         float32x2_t rLo = vpadd_f32( xy0, xy1);
928         float32x2_t rHi = vpadd_f32( xy2, xy3);
929         rLo = vadd_f32(rLo, zLo);
930         rHi = vadd_f32(rHi, zHi);
931 
932         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
933         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
934         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
935         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
936         iLo = vbsl_u32(maskLo, indexLo, iLo);
937         iHi = vbsl_u32(maskHi, indexHi, iHi);
938         indexLo = vadd_u32(indexLo, four);
939         indexHi = vadd_u32(indexHi, four);
940 
941         v0 = vld1q_f32_aligned_postincrement( vv );
942         v1 = vld1q_f32_aligned_postincrement( vv );
943         v2 = vld1q_f32_aligned_postincrement( vv );
944         v3 = vld1q_f32_aligned_postincrement( vv );
945 
946         xy0 = vmul_f32( vget_low_f32(v0), vLo);
947         xy1 = vmul_f32( vget_low_f32(v1), vLo);
948         xy2 = vmul_f32( vget_low_f32(v2), vLo);
949         xy3 = vmul_f32( vget_low_f32(v3), vLo);
950 
951         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
952         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
953         zLo = vmul_f32( z0.val[0], vHi);
954         zHi = vmul_f32( z1.val[0], vHi);
955 
956         rLo = vpadd_f32( xy0, xy1);
957         rHi = vpadd_f32( xy2, xy3);
958         rLo = vadd_f32(rLo, zLo);
959         rHi = vadd_f32(rHi, zHi);
960 
961         maskLo = vcgt_f32( rLo, dotMaxLo );
962         maskHi = vcgt_f32( rHi, dotMaxHi );
963         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
964         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
965         iLo = vbsl_u32(maskLo, indexLo, iLo);
966         iHi = vbsl_u32(maskHi, indexHi, iHi);
967         indexLo = vadd_u32(indexLo, four);
968         indexHi = vadd_u32(indexHi, four);
969     }
970 
971     for( ; i+4 <= count; i+= 4 )
972     {
973         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
974         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
975         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
976         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
977 
978         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
979         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
980         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
981         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
982 
983         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
984         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
985         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
986         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
987 
988         float32x2_t rLo = vpadd_f32( xy0, xy1);
989         float32x2_t rHi = vpadd_f32( xy2, xy3);
990         rLo = vadd_f32(rLo, zLo);
991         rHi = vadd_f32(rHi, zHi);
992 
993         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
994         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
995         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
996         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
997         iLo = vbsl_u32(maskLo, indexLo, iLo);
998         iHi = vbsl_u32(maskHi, indexHi, iHi);
999         indexLo = vadd_u32(indexLo, four);
1000         indexHi = vadd_u32(indexHi, four);
1001     }
1002 
1003     switch( count & 3 )
1004     {
1005         case 3:
1006         {
1007             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1008             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1009             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1010 
1011             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1012             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1013             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1014 
1015             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1016             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1017             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1018 
1019             float32x2_t rLo = vpadd_f32( xy0, xy1);
1020             float32x2_t rHi = vpadd_f32( xy2, xy2);
1021             rLo = vadd_f32(rLo, zLo);
1022             rHi = vadd_f32(rHi, zHi);
1023 
1024             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1025             uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
1026             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1027             dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
1028             iLo = vbsl_u32(maskLo, indexLo, iLo);
1029             iHi = vbsl_u32(maskHi, indexHi, iHi);
1030         }
1031             break;
1032         case 2:
1033         {
1034             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1035             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1036 
1037             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1038             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1039 
1040             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1041             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1042 
1043             float32x2_t rLo = vpadd_f32( xy0, xy1);
1044             rLo = vadd_f32(rLo, zLo);
1045 
1046             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1047             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1048             iLo = vbsl_u32(maskLo, indexLo, iLo);
1049         }
1050             break;
1051         case 1:
1052         {
1053             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1054             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1055             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1056             float32x2_t zLo = vmul_f32( z0, vHi);
1057             float32x2_t rLo = vpadd_f32( xy0, xy0);
1058             rLo = vadd_f32(rLo, zLo);
1059             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1060             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1061             iLo = vbsl_u32(maskLo, indexLo, iLo);
1062         }
1063             break;
1064 
1065         default:
1066             break;
1067     }
1068 
1069     // select best answer between hi and lo results
1070     uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
1071     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1072     iLo = vbsl_u32(mask, iHi, iLo);
1073 
1074     // select best answer between even and odd results
1075     dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1076     iHi = vdup_lane_u32(iLo, 1);
1077     mask = vcgt_f32( dotMaxHi, dotMaxLo );
1078     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1079     iLo = vbsl_u32(mask, iHi, iLo);
1080 
1081     *dotResult = vget_lane_f32( dotMaxLo, 0);
1082     return vget_lane_u32(iLo, 0);
1083 }
1084 
1085 
1086 long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1087 {
1088     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1089     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1090     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1091     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1092     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1093     uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1094     float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
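    // Strategy for this variant: a full 4-lane running maximum (maxDot) and a matching
    // 4-lane index vector are maintained, so each quad-register iteration scores four
    // vertices and only two cross-lane reduction steps remain after the loops.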
1095 
1096     unsigned long i = 0;
1097     for( ; i + 8 <= count; i += 8 )
1098     {
1099         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1100         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1101         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1102         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1103 
1104         // the next two lines should resolve to a single vswp d, d
1105         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1106         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1107         // the next two lines should resolve to a single vswp d, d
1108         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1109         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1110 
1111         xy0 = vmulq_f32(xy0, vLo);
1112         xy1 = vmulq_f32(xy1, vLo);
1113 
1114         float32x4x2_t zb = vuzpq_f32( z0, z1);
1115         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1116         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1117         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1118         x = vaddq_f32(x, z);
1119 
1120         uint32x4_t mask = vcgtq_f32(x, maxDot);
1121         maxDot = vbslq_f32( mask, x, maxDot);
1122         index = vbslq_u32(mask, local_index, index);
1123         local_index = vaddq_u32(local_index, four);
1124 
1125         v0 = vld1q_f32_aligned_postincrement( vv );
1126         v1 = vld1q_f32_aligned_postincrement( vv );
1127         v2 = vld1q_f32_aligned_postincrement( vv );
1128         v3 = vld1q_f32_aligned_postincrement( vv );
1129 
1130         // the next two lines should resolve to a single vswp d, d
1131         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1132         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1133         // the next two lines should resolve to a single vswp d, d
1134         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1135         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1136 
1137         xy0 = vmulq_f32(xy0, vLo);
1138         xy1 = vmulq_f32(xy1, vLo);
1139 
1140         zb = vuzpq_f32( z0, z1);
1141         z = vmulq_f32( zb.val[0], vHi);
1142         xy = vuzpq_f32( xy0, xy1);
1143         x = vaddq_f32(xy.val[0], xy.val[1]);
1144         x = vaddq_f32(x, z);
1145 
1146         mask = vcgtq_f32(x, maxDot);
1147         maxDot = vbslq_f32( mask, x, maxDot);
1148         index = vbslq_u32(mask, local_index, index);
1149         local_index = vaddq_u32(local_index, four);
1150     }
1151 
1152     for( ; i + 4 <= count; i += 4 )
1153     {
1154         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1155         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1156         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1157         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1158 
1159         // the next two lines should resolve to a single vswp d, d
1160         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1161         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1162         // the next two lines should resolve to a single vswp d, d
1163         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1164         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1165 
1166         xy0 = vmulq_f32(xy0, vLo);
1167         xy1 = vmulq_f32(xy1, vLo);
1168 
1169         float32x4x2_t zb = vuzpq_f32( z0, z1);
1170         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1171         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1172         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1173         x = vaddq_f32(x, z);
1174 
1175         uint32x4_t mask = vcgtq_f32(x, maxDot);
1176         maxDot = vbslq_f32( mask, x, maxDot);
1177         index = vbslq_u32(mask, local_index, index);
1178         local_index = vaddq_u32(local_index, four);
1179     }
1180 
1181     switch (count & 3) {
1182         case 3:
1183         {
1184             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1185             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1186             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1187 
1188             // the next two lines should resolve to a single vswp d, d
1189             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1190             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1191             // the next two lines should resolve to a single vswp d, d
1192             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1193             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1194 
1195             xy0 = vmulq_f32(xy0, vLo);
1196             xy1 = vmulq_f32(xy1, vLo);
1197 
1198             float32x4x2_t zb = vuzpq_f32( z0, z1);
1199             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1200             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1201             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1202             x = vaddq_f32(x, z);
1203 
1204             uint32x4_t mask = vcgtq_f32(x, maxDot);
1205             maxDot = vbslq_f32( mask, x, maxDot);
1206             index = vbslq_u32(mask, local_index, index);
1207             local_index = vaddq_u32(local_index, four);
1208         }
1209             break;
1210 
1211         case 2:
1212         {
1213             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1214             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1215 
1216             // the next two lines should resolve to a single vswp d, d
1217             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1218             // the next two lines should resolve to a single vswp d, d
1219             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1220 
1221             xy0 = vmulq_f32(xy0, vLo);
1222 
1223             float32x4x2_t zb = vuzpq_f32( z0, z0);
1224             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1225             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1226             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1227             x = vaddq_f32(x, z);
1228 
1229             uint32x4_t mask = vcgtq_f32(x, maxDot);
1230             maxDot = vbslq_f32( mask, x, maxDot);
1231             index = vbslq_u32(mask, local_index, index);
1232             local_index = vaddq_u32(local_index, four);
1233         }
1234             break;
1235 
1236         case 1:
1237         {
1238             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1239 
1240             // the next two lines should resolve to a single vswp d, d
1241             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1242             // the next two lines should resolve to a single vswp d, d
1243             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1244 
1245             xy0 = vmulq_f32(xy0, vLo);
1246 
1247             z = vmulq_f32( z, vHi);
1248             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1249             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1250             x = vaddq_f32(x, z);
1251 
1252             uint32x4_t mask = vcgtq_f32(x, maxDot);
1253             maxDot = vbslq_f32( mask, x, maxDot);
1254             index = vbslq_u32(mask, local_index, index);
1255             local_index = vaddq_u32(local_index, four);
1256         }
1257             break;
1258 
1259         default:
1260             break;
1261     }
1262 
1263 
1264     // select best answer between hi and lo results
1265     uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
1266     float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1267     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1268 
1269     // select best answer between even and odd results
1270     float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1271     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1272     mask = vcgt_f32( maxDotO, maxDot2 );
1273     maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1274     index2 = vbsl_u32(mask, indexHi, index2);
1275 
1276     *dotResult = vget_lane_f32( maxDot2, 0);
1277     return vget_lane_u32(index2, 0);
1278 
1279 }
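
/*
 For reference, a minimal scalar sketch of what these batch dot-product kernels compute
 (illustrative only; the function below is not part of this file, and assumes count >= 1).
 Each input vertex occupies four floats; x, y and z are dotted with vec and w is ignored.
 The max kernel above returns the index of a vertex with the largest dot product and
 writes that value to *dotResult; the min kernels below do the same for the smallest.

     long maxdot_reference( const float *vv, const float *vec, unsigned long count, float *dotResult )
     {
         long bestIndex = 0;
         float bestDot = vv[0]*vec[0] + vv[1]*vec[1] + vv[2]*vec[2];
         for( unsigned long i = 1; i < count; i++ )
         {
             const float *v = vv + 4*i;
             float dot = v[0]*vec[0] + v[1]*vec[1] + v[2]*vec[2];
             if( dot > bestDot ) { bestDot = dot; bestIndex = (long) i; }
         }
         *dotResult = bestDot;
         return bestIndex;
     }
 */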
1280 
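// _mindot_large_v0: min-dot search over the same vertex layout using 64-bit NEON ops.
// It keeps two 2-lane running minima (low and high halves of each group of four) plus
// matching index vectors, and merges them into a single result at the end.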
1281 long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
1282 {
1283     unsigned long i = 0;
1284     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1285     float32x2_t vLo = vget_low_f32(vvec);
1286     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1287     float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1288     float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1289     uint32x2_t indexLo = (uint32x2_t) {0, 1};
1290     uint32x2_t indexHi = (uint32x2_t) {2, 3};
1291     uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1292     uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1293     const uint32x2_t four = (uint32x2_t) {4,4};
1294 
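    // Main loop: two unrolled passes of four vertices each (eight per iteration).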
1295     for( ; i+8 <= count; i+= 8 )
1296     {
1297         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1298         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1299         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1300         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1301 
1302         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1303         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1304         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1305         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1306 
1307         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1308         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1309         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1310         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1311 
1312         float32x2_t rLo = vpadd_f32( xy0, xy1);
1313         float32x2_t rHi = vpadd_f32( xy2, xy3);
1314         rLo = vadd_f32(rLo, zLo);
1315         rHi = vadd_f32(rHi, zHi);
1316 
1317         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1318         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1319         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1320         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1321         iLo = vbsl_u32(maskLo, indexLo, iLo);
1322         iHi = vbsl_u32(maskHi, indexHi, iHi);
1323         indexLo = vadd_u32(indexLo, four);
1324         indexHi = vadd_u32(indexHi, four);
1325 
1326         v0 = vld1q_f32_aligned_postincrement( vv );
1327         v1 = vld1q_f32_aligned_postincrement( vv );
1328         v2 = vld1q_f32_aligned_postincrement( vv );
1329         v3 = vld1q_f32_aligned_postincrement( vv );
1330 
1331         xy0 = vmul_f32( vget_low_f32(v0), vLo);
1332         xy1 = vmul_f32( vget_low_f32(v1), vLo);
1333         xy2 = vmul_f32( vget_low_f32(v2), vLo);
1334         xy3 = vmul_f32( vget_low_f32(v3), vLo);
1335 
1336         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1337         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1338         zLo = vmul_f32( z0.val[0], vHi);
1339         zHi = vmul_f32( z1.val[0], vHi);
1340 
1341         rLo = vpadd_f32( xy0, xy1);
1342         rHi = vpadd_f32( xy2, xy3);
1343         rLo = vadd_f32(rLo, zLo);
1344         rHi = vadd_f32(rHi, zHi);
1345 
1346         maskLo = vclt_f32( rLo, dotMinLo );
1347         maskHi = vclt_f32( rHi, dotMinHi );
1348         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1349         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1350         iLo = vbsl_u32(maskLo, indexLo, iLo);
1351         iHi = vbsl_u32(maskHi, indexHi, iHi);
1352         indexLo = vadd_u32(indexLo, four);
1353         indexHi = vadd_u32(indexHi, four);
1354     }
1355 
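    // Mop up one remaining full group of four vertices, if any.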
1356     for( ; i+4 <= count; i+= 4 )
1357     {
1358         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1359         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1360         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1361         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1362 
1363         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1364         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1365         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1366         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1367 
1368         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1369         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1370         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1371         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1372 
1373         float32x2_t rLo = vpadd_f32( xy0, xy1);
1374         float32x2_t rHi = vpadd_f32( xy2, xy3);
1375         rLo = vadd_f32(rLo, zLo);
1376         rHi = vadd_f32(rHi, zHi);
1377 
1378         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1379         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1380         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1381         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1382         iLo = vbsl_u32(maskLo, indexLo, iLo);
1383         iHi = vbsl_u32(maskHi, indexHi, iHi);
1384         indexLo = vadd_u32(indexLo, four);
1385         indexHi = vadd_u32(indexHi, four);
1386     }
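    // Handle the 1-3 leftover vertices, if any.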
1387     switch( count & 3 )
1388     {
1389         case 3:
1390         {
1391             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1392             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1393             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1394 
1395             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1396             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1397             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1398 
1399             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1400             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1401             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1402 
1403             float32x2_t rLo = vpadd_f32( xy0, xy1);
1404             float32x2_t rHi = vpadd_f32( xy2, xy2);
1405             rLo = vadd_f32(rLo, zLo);
1406             rHi = vadd_f32(rHi, zHi);
1407 
1408             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1409             uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1410             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1411             dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1412             iLo = vbsl_u32(maskLo, indexLo, iLo);
1413             iHi = vbsl_u32(maskHi, indexHi, iHi);
1414         }
1415             break;
1416         case 2:
1417         {
1418             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1419             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1420 
1421             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1422             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1423 
1424             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1425             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1426 
1427             float32x2_t rLo = vpadd_f32( xy0, xy1);
1428             rLo = vadd_f32(rLo, zLo);
1429 
1430             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1431             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1432             iLo = vbsl_u32(maskLo, indexLo, iLo);
1433         }
1434             break;
1435         case 1:
1436         {
1437             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1438             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1439             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1440             float32x2_t zLo = vmul_f32( z0, vHi);
1441             float32x2_t rLo = vpadd_f32( xy0, xy0);
1442             rLo = vadd_f32(rLo, zLo);
1443             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1444             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1445             iLo = vbsl_u32(maskLo, indexLo, iLo);
1446         }
1447             break;
1448 
1449         default:
1450             break;
1451     }
1452 
1453     // select best answer between hi and lo results
1454     uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
1455     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1456     iLo = vbsl_u32(mask, iHi, iLo);
1457 
1458     // select best answer between even and odd results
1459     dotMinHi = vdup_lane_f32(dotMinLo, 1);
1460     iHi = vdup_lane_u32(iLo, 1);
1461     mask = vclt_f32( dotMinHi, dotMinLo );
1462     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1463     iLo = vbsl_u32(mask, iHi, iLo);
1464 
1465     *dotResult = vget_lane_f32( dotMinLo, 0);
1466     return vget_lane_u32(iLo, 0);
1467 }
1468 
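// _mindot_large_v1: 128-bit NEON variant of the min-dot search with the same contract
// as _mindot_large_v0; it deinterleaves x/y/z with vuzpq_f32 and tracks four running
// minima and indices per step.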
1469 long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1470 {
1471     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1472     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1473     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1474     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1475     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1476     uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1477     float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
1478 
1479     unsigned long i = 0;
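    // Main loop: two unrolled passes of four dot products each (eight vertices per iteration).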
1480     for( ; i + 8 <= count; i += 8 )
1481     {
1482         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1483         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1484         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1485         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1486 
1487         // the next two lines should resolve to a single vswp d, d
1488         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1489         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1490         // the next two lines should resolve to a single vswp d, d
1491         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1492         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1493 
1494         xy0 = vmulq_f32(xy0, vLo);
1495         xy1 = vmulq_f32(xy1, vLo);
1496 
1497         float32x4x2_t zb = vuzpq_f32( z0, z1);
1498         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1499         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1500         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1501         x = vaddq_f32(x, z);
1502 
1503         uint32x4_t mask = vcltq_f32(x, minDot);
1504         minDot = vbslq_f32( mask, x, minDot);
1505         index = vbslq_u32(mask, local_index, index);
1506         local_index = vaddq_u32(local_index, four);
1507 
1508         v0 = vld1q_f32_aligned_postincrement( vv );
1509         v1 = vld1q_f32_aligned_postincrement( vv );
1510         v2 = vld1q_f32_aligned_postincrement( vv );
1511         v3 = vld1q_f32_aligned_postincrement( vv );
1512 
1513         // the next two lines should resolve to a single vswp d, d
1514         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1515         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1516         // the next two lines should resolve to a single vswp d, d
1517         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1518         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1519 
1520         xy0 = vmulq_f32(xy0, vLo);
1521         xy1 = vmulq_f32(xy1, vLo);
1522 
1523         zb = vuzpq_f32( z0, z1);
1524         z = vmulq_f32( zb.val[0], vHi);
1525         xy = vuzpq_f32( xy0, xy1);
1526         x = vaddq_f32(xy.val[0], xy.val[1]);
1527         x = vaddq_f32(x, z);
1528 
1529         mask = vcltq_f32(x, minDot);
1530         minDot = vbslq_f32( mask, x, minDot);
1531         index = vbslq_u32(mask, local_index, index);
1532         local_index = vaddq_u32(local_index, four);
1533     }
1534 
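    // Mop up one remaining full group of four vertices, if any.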
1535     for( ; i + 4 <= count; i += 4 )
1536     {
1537         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1538         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1539         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1540         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1541 
1542         // the next two lines should resolve to a single vswp d, d
1543         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1544         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1545         // the next two lines should resolve to a single vswp d, d
1546         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1547         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1548 
1549         xy0 = vmulq_f32(xy0, vLo);
1550         xy1 = vmulq_f32(xy1, vLo);
1551 
1552         float32x4x2_t zb = vuzpq_f32( z0, z1);
1553         float32x4_t z = vmulq_f32( zb.val[0], vHi);
1554         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1555         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1556         x = vaddq_f32(x, z);
1557 
1558         uint32x4_t mask = vcltq_f32(x, minDot);
1559         minDot = vbslq_f32( mask, x, minDot);
1560         index = vbslq_u32(mask, local_index, index);
1561         local_index = vaddq_u32(local_index, four);
1562     }
1563 
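    // Handle the 1-3 leftover vertices, if any.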
1564     switch (count & 3) {
1565         case 3:
1566         {
1567             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1568             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1569             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1570 
1571             // the next two lines should resolve to a single vswp d, d
1572             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1573             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1574             // the next two lines should resolve to a single vswp d, d
1575             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1576             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1577 
1578             xy0 = vmulq_f32(xy0, vLo);
1579             xy1 = vmulq_f32(xy1, vLo);
1580 
1581             float32x4x2_t zb = vuzpq_f32( z0, z1);
1582             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1583             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1584             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1585             x = vaddq_f32(x, z);
1586 
1587             uint32x4_t mask = vcltq_f32(x, minDot);
1588             minDot = vbslq_f32( mask, x, minDot);
1589             index = vbslq_u32(mask, local_index, index);
1590             local_index = vaddq_u32(local_index, four);
1591         }
1592             break;
1593 
1594         case 2:
1595         {
1596             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1597             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1598 
1599             // this combine and the z0 combine two lines below should resolve to a single vswp d, d
1600             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1601             // second half of the vswp noted above
1602             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1603 
1604             xy0 = vmulq_f32(xy0, vLo);
1605 
1606             float32x4x2_t zb = vuzpq_f32( z0, z0);
1607             float32x4_t z = vmulq_f32( zb.val[0], vHi);
1608             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1609             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1610             x = vaddq_f32(x, z);
1611 
1612             uint32x4_t mask = vcltq_f32(x, minDot);
1613             minDot = vbslq_f32( mask, x, minDot);
1614             index = vbslq_u32(mask, local_index, index);
1615             local_index = vaddq_u32(local_index, four);
1616         }
1617             break;
1618 
1619         case 1:
1620         {
1621             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1622 
1623             // duplicate the xy half of v0 into both halves of a q register
1624             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1625             // broadcast the z of v0 into all four lanes
1626             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1627 
1628             xy0 = vmulq_f32(xy0, vLo);
1629 
1630             z = vmulq_f32( z, vHi);
1631             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1632             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1633             x = vaddq_f32(x, z);
1634 
1635             uint32x4_t mask = vcltq_f32(x, minDot);
1636             minDot = vbslq_f32( mask, x, minDot);
1637             index = vbslq_u32(mask, local_index, index);
1638             local_index = vaddq_u32(local_index, four);
1639         }
1640             break;
1641 
1642         default:
1643             break;
1644     }
1645 
1646 
1647     // select best answer between hi and lo results
1648     uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
1649     float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1650     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1651 
1652     // select best answer between even and odd results
1653     float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1654     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1655     mask = vclt_f32( minDotO, minDot2 );
1656     minDot2 = vbsl_f32(mask, minDotO, minDot2);
1657     index2 = vbsl_u32(mask, indexHi, index2);
1658 
1659     *dotResult = vget_lane_f32( minDot2, 0);
1660     return vget_lane_u32(index2, 0);
1661 
1662 }
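
// Note (assumption, not stated in this file): these NEON kernels are expected to back
// the batch maxDot()/minDot() helpers declared in btVector3.h when NEON support is in
// effect; which _mindot_large variant is actually used is selected elsewhere.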
1663 
1664 #else
1665     #error Unhandled __APPLE__ arch
1666 #endif
1667 
1668 #endif  /* __APPLE__ */
1669 
1670 
1671