Searched refs:vsqrtx (Results 1 – 25 of 33) sorted by relevance

/external/XNNPACK/src/math/
sqrt-neonfma-nr3fma.c
28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr3fma() local
35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
41 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
43 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
44 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
46 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr3fma()
sqrt-neonfma-nr2fma1adj.c
28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() local
35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
41 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
46 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
47 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
49 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
sqrt-fma3-nr2fma.c
29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr2fma() local
36 __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma()
38 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr2fma()
40 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma()
41 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr2fma()
43 const __m256 vy = vsqrtx; in xnn_math_f32_sqrt__fma3_nr2fma()
sqrt-avx512f-nr2fma.c
29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr2fma() local
36 __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma()
38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr2fma()
40 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma()
41 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr2fma()
43 const __m512 vy = vsqrtx; in xnn_math_f32_sqrt__avx512f_nr2fma()
sqrt-avx512f-nr1fma1adj.c
29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() local
36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
43 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
44 vsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
46 const __m512 vy = vsqrtx; in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
sqrt-fma3-nr1fma1adj.c
29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr1fma1adj() local
36 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
38 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
43 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
44 vsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
46 const __m256 vy = vsqrtx; in xnn_math_f32_sqrt__fma3_nr1fma1adj()
sqrt-neonfma-nr2fma.c
28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr2fma() local
35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
40 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
42 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr2fma()
sqrt-neonfma-nr1rsqrts1fma1adj.c
33 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() local
40 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
42 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
47 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
48 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
50 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
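
Note: the reference kernels above all refine a reciprocal-square-root estimate into sqrt(x) with FMA-based Newton-Raphson steps. As a rough orientation only, the following scalar C sketch shows the "nr1fma1adj" variant (one Newton-Raphson step plus a final adjustment); rsqrt_approx() stands in for the hardware estimate instruction and is an assumption, not code taken from these files.

#include <math.h>

/* Scalar sketch of the nr1fma1adj scheme. rsqrt_approx() is a placeholder
 * for a low-precision hardware reciprocal square-root estimate. */
static inline float rsqrt_approx(float x) {
  return 1.0f / sqrtf(x);
}

float sqrt_nr1fma1adj(float x) {
  const float rsqrtx = rsqrt_approx(x);
  float sqrtx = rsqrtx * x;          /* estimate of sqrt(x)      */
  float halfrsqrtx = 0.5f * rsqrtx;  /* estimate of 0.5/sqrt(x)  */

  /* One Newton-Raphson step: residual = 0.5 - sqrtx*halfrsqrtx -> 0. */
  const float residual = fmaf(-sqrtx, halfrsqrtx, 0.5f);
  halfrsqrtx = fmaf(halfrsqrtx, residual, halfrsqrtx);
  sqrtx = fmaf(sqrtx, residual, sqrtx);

  /* Final adjustment: cancel the remaining error x - sqrtx*sqrtx. */
  const float adjustment = fmaf(-sqrtx, sqrtx, x);
  return fmaf(halfrsqrtx, adjustment, sqrtx);
}
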
/external/XNNPACK/src/f32-vsqrt/gen/
avx512f-nr1fma1adj-x16.c
34 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local
36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
39 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
40 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
54 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local
56 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
58 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
59 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
60 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
fma3-nr1fma1adj-x8.c
35 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local
37 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
39 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
40 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
41 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
54 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local
56 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
58 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
59 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
60 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
avx512f-nr1fma1adj-x32.c
65 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local
67 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
69 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
70 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
71 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
85 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local
87 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
89 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
90 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
91 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
fma3-nr1fma1adj-x16.c
66 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local
68 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
70 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
71 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
72 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
85 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local
87 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
89 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
90 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
91 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
avx512f-nr1fma1adj-x48.c
75 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local
77 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
79 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
80 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
81 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
95 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local
97 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
99 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
100 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
101 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
fma3-nr1fma1adj-x24.c
76 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local
78 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
80 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
81 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
82 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
95 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local
97 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
99 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
100 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
101 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
avx512f-nr1fma1adj-x64.c
85 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local
87 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
89 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
90 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
91 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
105 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local
107 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
109 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
110 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
111 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
fma3-nr1fma1adj-x32.c
86 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local
88 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
90 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
91 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
92 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
105 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local
107 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
109 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
110 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
111 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
avx512f-nr1fma1adj-x80.c
95 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() local
97 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
99 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
100 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
101 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
115 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() local
117 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
119 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
120 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
121 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
fma3-nr1fma1adj-x40.c
96 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() local
98 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
100 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
101 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
102 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
115 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() local
117 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
119 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
120 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
121 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
avx512f-nr1fma1adj-x96.c
105 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() local
107 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
109 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
110 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
111 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
125 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() local
127 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
129 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
130 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
131 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
fma3-nr1fma1adj-x48.c
106 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() local
108 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
110 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
111 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
112 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
125 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() local
127 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
129 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
130 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
131 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
avx512f-nr1fma1adj-x112.c
115 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() local
117 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
119 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
120 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
121 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
135 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() local
137 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
139 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
140 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
141 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
fma3-nr1fma1adj-x56.c
116 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() local
118 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
120 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
121 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
122 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
135 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() local
137 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
139 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
140 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
141 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
avx512f-nr1fma1adj-x128.c
125 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() local
127 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
129 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
130 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
131 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
145 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() local
147 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
149 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
150 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
151 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
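
Note: the generated f32-vsqrt microkernels above apply that same update element-wise over an array, unrolled for different batch sizes (x16 through x128). A minimal AVX-512F sketch of the per-vector body, assuming _mm512_rsqrt14_ps as the seed and ignoring remainder handling, unrolling, and special inputs (zero, infinity, NaN), could look like the following; it is not the exact XNNPACK microkernel.

#include <stddef.h>
#include <immintrin.h>

/* Sketch of an nr1fma1adj-style vsqrt loop; assumes n is a multiple of 16. */
void vsqrt_nr1fma1adj_sketch(size_t n, const float* x, float* y) {
  const __m512 vhalf = _mm512_set1_ps(0.5f);
  for (size_t i = 0; i < n; i += 16) {
    const __m512 vx = _mm512_loadu_ps(x + i);
    const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx);      /* ~14-bit 1/sqrt(x) */
    __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx);
    __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf);

    /* One Newton-Raphson step on the paired (sqrt, 0.5*rsqrt) estimates. */
    const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
    vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx);
    vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);

    /* Final adjustment: y = sqrtx + halfrsqrtx * (x - sqrtx*sqrtx). */
    const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
    const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx);
    _mm512_storeu_ps(y + i, vy);
  }
}
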
/external/XNNPACK/src/f32-vsqrt/
avx512f-nr1fma1adj.c.in
40 __m512 vsqrtx${ABC[N]} = _mm512_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]});
44 … const __m512 vresidual${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf);
48 vsqrtx${ABC[N]} = _mm512_fmadd_ps(vsqrtx${ABC[N]}, vresidual${ABC[N]}, vsqrtx${ABC[N]});
51 …const __m512 vadjustment${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vsqrtx${ABC[N]}, vx${ABC[N]}…
54 … __m512 vy${ABC[N]} = _mm512_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC[N]});
66 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); variable
68 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
70 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);
71 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
72 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx);
[all …]
fma3-nr1fma1adj.c.in
41 __m256 vsqrtx${ABC[N]} = _mm256_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]});
45 … const __m256 vresidual${ABC[N]} = _mm256_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf);
49 vsqrtx${ABC[N]} = _mm256_fmadd_ps(vsqrtx${ABC[N]}, vresidual${ABC[N]}, vsqrtx${ABC[N]});
52 …const __m256 vadjustment${ABC[N]} = _mm256_fnmadd_ps(vsqrtx${ABC[N]}, vsqrtx${ABC[N]}, vx${ABC[N]}…
55 … __m256 vy${ABC[N]} = _mm256_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC[N]});
67 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); variable
69 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
71 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx);
72 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx);
73 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx);
[all …]
