Home
last modified time | relevance | path

Searched refs: vsqrtx (Results 1 – 25 of 51) sorted by relevance

123

/external/XNNPACK/src/f32-vsqrt/gen/
Dneonfma-nr2fma1adj-x4.c32 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() local
34 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
36 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
37 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
39 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
40 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
41 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
47 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() local
49 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
51 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4()
[all …]
Dneonfma-nr2fma1adj-x8.c69 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() local
71 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
73 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
74 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
76 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
77 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
78 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
84 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() local
86 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
88 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8()
[all …]
Davx512f-nr1fma1adj-x16.c34 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local
36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
39 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
40 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
54 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local
56 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
58 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
59 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
60 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
Dneonfma-nr1rsqrts1fma1adj-x4.c35 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() local
37 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
39 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
40 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
41 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
50 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() local
52 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
54 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
55 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
56 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
Dfma3-nr1fma1adj-x8.c33 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local
35 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
37 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
38 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
39 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
52 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local
54 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
56 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
57 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
58 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
Dneonfma-nr2fma1adj-x12.c82 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() local
84 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
86 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
87 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
89 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
90 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
91 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
97 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() local
99 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
101 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
[all …]
Dneonfma-nr2fma1adj-x16.c95 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() local
97 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
99 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
100 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
102 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
103 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
104 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
110 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() local
112 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
114 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
[all …]
Dneonfma-nr2fma1adj-x40.c70 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() local
82 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
103 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
114 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
135 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
146 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
157 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
173 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() local
175 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
177 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
[all …]
Davx512f-nr1fma1adj-x32.c65 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local
67 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
69 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
70 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
71 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
85 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local
87 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
89 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
90 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
91 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
Dneonfma-nr2fma1adj-x20.c108 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() local
110 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
112 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
113 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
115 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
116 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
117 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
123 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() local
125 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
127 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
[all …]
Dfma3-nr1fma1adj-x16.c64 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local
66 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
68 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
69 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
70 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
83 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local
85 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
87 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
88 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
89 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
Dneonfma-nr1rsqrts1fma1adj-x8.c73 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() local
75 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
77 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
78 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
79 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
88 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() local
90 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
92 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
93 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
94 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
Dfma3-nr1fma1adj-x24.c74 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local
76 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
78 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
79 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
80 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
93 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local
95 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
97 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
98 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
99 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
/external/XNNPACK/src/f32-vsqrt/
Dneonfma-nr2fma1adj.c.in37 float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]});
41 …float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]…
45vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]});
48 vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]});
52vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]});
55 …at32x4_t vadjustment${ABC[N:N+4]} = vfmsq_f32(vx${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}, vsqrtx${ABC[N:…
58 …const float32x4_t vy${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}, vadju…
66 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); variable
68 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx);
70 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx);
[all …]
Dfma3-nr1fma1adj.c.in39 __m256 vsqrtx${ABC[N]} = _mm256_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]});
43 … const __m256 vresidual${ABC[N]} = _mm256_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf);
47 vsqrtx${ABC[N]} = _mm256_fmadd_ps(vsqrtx${ABC[N]}, vresidual${ABC[N]}, vsqrtx${ABC[N]});
50 …const __m256 vadjustment${ABC[N]} = _mm256_fnmadd_ps(vsqrtx${ABC[N]}, vsqrtx${ABC[N]}, vx${ABC[N]}…
53 … __m256 vy${ABC[N]} = _mm256_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC[N]});
65 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); variable
67 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
69 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx);
70 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx);
71 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx);
[all …]
Davx512f-nr1fma1adj.c.in40 __m512 vsqrtx${ABC[N]} = _mm512_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]});
44 … const __m512 vresidual${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf);
48 vsqrtx${ABC[N]} = _mm512_fmadd_ps(vsqrtx${ABC[N]}, vresidual${ABC[N]}, vsqrtx${ABC[N]});
51 …const __m512 vadjustment${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vsqrtx${ABC[N]}, vx${ABC[N]}…
54 … __m512 vy${ABC[N]} = _mm512_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC[N]});
66 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); variable
68 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
70 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);
71 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
72 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx);
[all …]
Dneonfma-nr1rsqrts1fma1adj.c.in46 float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]});
50 …const float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[…
54vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]});
57 …at32x4_t vadjustment${ABC[N:N+4]} = vfmsq_f32(vx${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}, vsqrtx${ABC[N:…
60 …const float32x4_t vy${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}, vadju…
71 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); variable
73 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx);
75 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx);
76 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx);
77 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment);
[all …]
/external/XNNPACK/src/math/
Dsqrt-neonfma-nr3fma.c28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr3fma() local
35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
41 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
43 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
44 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
46 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr3fma()
Dsqrt-neonfma-nr2fma1adj.c28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() local
35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
41 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
46 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
47 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
49 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
Dsqrt-fma3-nr1fma1adj.c29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr1fma1adj() local
36 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
38 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
43 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
44 vsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
46 const __m256 vy = vsqrtx; in xnn_math_f32_sqrt__fma3_nr1fma1adj()
Dsqrt-avx512f-nr1fma1adj.c29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() local
36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
43 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
44 vsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
46 const __m512 vy = vsqrtx; in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
Dsqrt-neonfma-nr2fma.c28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr2fma() local
35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
40 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
42 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr2fma()
Dsqrt-fma3-nr2fma.c29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr2fma() local
36 __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma()
38 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr2fma()
40 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma()
41 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr2fma()
43 const __m256 vy = vsqrtx; in xnn_math_f32_sqrt__fma3_nr2fma()
Dsqrt-avx512f-nr2fma.c29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr2fma() local
36 __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma()
38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr2fma()
40 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma()
41 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr2fma()
43 const __m512 vy = vsqrtx; in xnn_math_f32_sqrt__avx512f_nr2fma()
Dsqrt-neonfma-nr1rsqrts1fma1adj.c33 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() local
40 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
42 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
47 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
48 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
50 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()

123