| /external/XNNPACK/src/f32-vsqrt/gen/ |
| D | neonfma-nr2fma1adj-x4.c | 32 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() local 34 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() 36 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() 37 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() 39 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() 40 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() 41 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() 47 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() local 49 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() 51 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4() [all …]
|
| D | neonfma-nr2fma1adj-x8.c | 69 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() local 71 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() 73 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() 74 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() 76 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() 77 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() 78 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() 84 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() local 86 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() 88 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8() [all …]
|
| D | avx512f-nr1fma1adj-x16.c | 34 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local 36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 39 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 40 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 54 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local 56 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 58 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 59 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 60 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
|
| D | neonfma-nr1rsqrts1fma1adj-x4.c | 35 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() local 37 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() 39 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() 40 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() 41 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() 50 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() local 52 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() 54 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() 55 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4() 56 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4()
|
| D | fma3-nr1fma1adj-x8.c | 33 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local 35 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 37 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 38 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 39 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 52 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local 54 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 56 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 57 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 58 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
|
| D | neonfma-nr2fma1adj-x12.c | 82 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() local 84 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 86 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 87 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 89 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 90 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 91 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 97 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() local 99 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 101 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() [all …]
|
| D | neonfma-nr2fma1adj-x16.c | 95 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() local 97 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 99 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 100 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 102 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 103 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 104 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 110 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() local 112 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 114 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() [all …]
|
| D | neonfma-nr2fma1adj-x40.c | 70 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() local 82 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 103 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 114 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 135 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 146 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 157 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 173 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() local 175 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 177 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() [all …]
|
| D | avx512f-nr1fma1adj-x32.c | 65 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local 67 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 69 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 70 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 71 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 85 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local 87 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 89 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 90 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 91 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
|
| D | neonfma-nr2fma1adj-x20.c | 108 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() local 110 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 112 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 113 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 115 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 116 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 117 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 123 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() local 125 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 127 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() [all …]
|
| D | fma3-nr1fma1adj-x16.c | 64 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local 66 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 68 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 69 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 70 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 83 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local 85 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 87 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 88 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 89 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
|
| D | neonfma-nr1rsqrts1fma1adj-x8.c | 73 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() local 75 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() 77 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() 78 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() 79 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() 88 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() local 90 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() 92 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() 93 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8() 94 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8()
|
| D | fma3-nr1fma1adj-x24.c | 74 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local 76 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 78 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 79 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 80 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 93 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local 95 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 97 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 98 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 99 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
|
| /external/XNNPACK/src/f32-vsqrt/ |
| D | neonfma-nr2fma1adj.c.in | 37 float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]}); 41 …float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]… 45 … vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}); 48 vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}); 52 … vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}); 55 …at32x4_t vadjustment${ABC[N:N+4]} = vfmsq_f32(vx${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}, vsqrtx${ABC[N:… 58 …const float32x4_t vy${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}, vadju… 66 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); variable 68 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); 70 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); [all …]
|
| D | fma3-nr1fma1adj.c.in | 39 __m256 vsqrtx${ABC[N]} = _mm256_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]}); 43 … const __m256 vresidual${ABC[N]} = _mm256_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf); 47 vsqrtx${ABC[N]} = _mm256_fmadd_ps(vsqrtx${ABC[N]}, vresidual${ABC[N]}, vsqrtx${ABC[N]}); 50 …const __m256 vadjustment${ABC[N]} = _mm256_fnmadd_ps(vsqrtx${ABC[N]}, vsqrtx${ABC[N]}, vx${ABC[N]}… 53 … __m256 vy${ABC[N]} = _mm256_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC[N]}); 65 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); variable 67 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); 69 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); 70 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); 71 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); [all …]
|
| D | avx512f-nr1fma1adj.c.in | 40 __m512 vsqrtx${ABC[N]} = _mm512_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]}); 44 … const __m512 vresidual${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf); 48 vsqrtx${ABC[N]} = _mm512_fmadd_ps(vsqrtx${ABC[N]}, vresidual${ABC[N]}, vsqrtx${ABC[N]}); 51 …const __m512 vadjustment${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vsqrtx${ABC[N]}, vx${ABC[N]}… 54 … __m512 vy${ABC[N]} = _mm512_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC[N]}); 66 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); variable 68 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); 70 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); 71 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); 72 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); [all …]
|
| D | neonfma-nr1rsqrts1fma1adj.c.in | 46 float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]}); 50 …const float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[… 54 … vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}); 57 …at32x4_t vadjustment${ABC[N:N+4]} = vfmsq_f32(vx${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}, vsqrtx${ABC[N:… 60 …const float32x4_t vy${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}, vadju… 71 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); variable 73 const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); 75 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); 76 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); 77 const float32x4_t vy = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); [all …]
|
| /external/XNNPACK/src/math/ |
| D | sqrt-neonfma-nr3fma.c | 28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr3fma() local 35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 41 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 43 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 44 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 46 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr3fma()
|
| D | sqrt-neonfma-nr2fma1adj.c | 28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() local 35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 41 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 46 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 47 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 49 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
|
| D | sqrt-fma3-nr1fma1adj.c | 29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr1fma1adj() local 36 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr1fma1adj() 38 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj() 43 const __m256 vadjustment = _mm256_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr1fma1adj() 44 vsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj() 46 const __m256 vy = vsqrtx; in xnn_math_f32_sqrt__fma3_nr1fma1adj()
|
| D | sqrt-avx512f-nr1fma1adj.c | 29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() local 36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() 38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() 43 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() 44 vsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() 46 const __m512 vy = vsqrtx; in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
|
| D | sqrt-neonfma-nr2fma.c | 28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr2fma() local 35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma() 37 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma() 39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma() 40 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma() 42 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr2fma()
|
| D | sqrt-fma3-nr2fma.c | 29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr2fma() local 36 __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma() 38 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr2fma() 40 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma() 41 vsqrtx = _mm256_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__fma3_nr2fma() 43 const __m256 vy = vsqrtx; in xnn_math_f32_sqrt__fma3_nr2fma()
|
| D | sqrt-avx512f-nr2fma.c | 29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr2fma() local 36 __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma() 38 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr2fma() 40 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma() 41 vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr2fma() 43 const __m512 vy = vsqrtx; in xnn_math_f32_sqrt__avx512f_nr2fma()
|
| D | sqrt-neonfma-nr1rsqrts1fma1adj.c | 33 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() local 40 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 42 vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 47 const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 48 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 50 const float32x4_t vy = vsqrtx; in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
|