/external/XNNPACK/src/f32-vsqrt/gen/
D | avx512f-nr1fma1adj-x96.c
    35  const __m512 vx5 = _mm512_loadu_ps(x + 80);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() local
    43  const __m512 vrsqrtx5 = _mm512_rsqrt14_ps(vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
    55  __m512 vsqrtx5 = _mm512_mul_ps(vrsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
    83  const __m512 vadjustment5 = _mm512_fnmadd_ps(vsqrtx5, vsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()

D | fma3-nr1fma1adj-x48.c
    36  const __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() local
    44  const __m256 vrsqrtx5 = _mm256_rsqrt_ps(vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
    56  __m256 vsqrtx5 = _mm256_mul_ps(vrsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
    84  const __m256 vadjustment5 = _mm256_fnmadd_ps(vsqrtx5, vsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()

D | avx512f-nr1fma1adj-x112.c
    35  const __m512 vx5 = _mm512_loadu_ps(x + 80);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() local
    44  const __m512 vrsqrtx5 = _mm512_rsqrt14_ps(vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
    57  __m512 vsqrtx5 = _mm512_mul_ps(vrsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
    90  const __m512 vadjustment5 = _mm512_fnmadd_ps(vsqrtx5, vsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()

D | fma3-nr1fma1adj-x56.c
    36  const __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() local
    45  const __m256 vrsqrtx5 = _mm256_rsqrt_ps(vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
    58  __m256 vsqrtx5 = _mm256_mul_ps(vrsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
    91  const __m256 vadjustment5 = _mm256_fnmadd_ps(vsqrtx5, vsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()

D | avx512f-nr1fma1adj-x128.c
    35  const __m512 vx5 = _mm512_loadu_ps(x + 80);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() local
    45  const __m512 vrsqrtx5 = _mm512_rsqrt14_ps(vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
    59  __m512 vsqrtx5 = _mm512_mul_ps(vrsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
    97  const __m512 vadjustment5 = _mm512_fnmadd_ps(vsqrtx5, vsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()

D | fma3-nr1fma1adj-x64.c
    36  const __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() local
    46  const __m256 vrsqrtx5 = _mm256_rsqrt_ps(vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
    60  __m256 vsqrtx5 = _mm256_mul_ps(vrsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
    98  const __m256 vadjustment5 = _mm256_fnmadd_ps(vsqrtx5, vsqrtx5, vx5);  in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
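The four references per file above are the per-vector steps of the "nr1fma1adj" square-root scheme: a hardware reciprocal-square-root estimate (_mm512_rsqrt14_ps / _mm256_rsqrt_ps), an initial sqrt(x) estimate formed as rsqrtx * x, and a final correction term x - sqrtx * sqrtx. A minimal single-vector sketch of that sequence for the AVX-512 variants follows; the constants and intermediate names between the listed lines (vhalf, vhalfrsqrtx, vresidual, and the Newton-Raphson step itself) are assumptions inferred from the kernel naming, not lines quoted by this listing.

    #include <immintrin.h>

    // One 16-lane block: rsqrt14 estimate, one FMA-based Newton-Raphson step,
    // then one final adjustment step ("nr1fma1adj").
    static void f32_vsqrt_nr1fma1adj_x16(const float* x, float* y) {
      const __m512 vhalf = _mm512_set1_ps(0.5f);

      const __m512 vx = _mm512_loadu_ps(x);          // cf. the vx5 loads above
      const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx);  // ~14-bit estimate of 1/sqrt(x)
      __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx);    // initial sqrt(x) estimate
      __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf);

      // One Newton-Raphson step refining sqrt(x) and 0.5/sqrt(x) together:
      // residual = 0.5 - sqrtx * halfrsqrtx.
      const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
      vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx);
      vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);

      // Final adjustment (the vadjustment5 lines above):
      // y = sqrtx + halfrsqrtx * (x - sqrtx * sqrtx).
      const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
      _mm512_storeu_ps(y, _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx));
    }
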
/external/XNNPACK/src/f32-velu/gen/
D | velu-avx2-rr1-lut16-p3-gather-x48.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48() local
    56  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48()
    148  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48()
    155  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48()

D | velu-avx2-rr1-lut8-p4-perm-x48.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48() local
    56  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48()
    148  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48()
    155  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48()

D | velu-avx2-rr1-lut4-p4-perm-x48.c
    49  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48() local
    57  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48()
    149  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48()
    156  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48()

D | velu-avx2-rr1-p6-x48.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x48() local
    56  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_p6_x48()
    150  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x48()
    157  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x48()

D | velu-scalar-rr2-lut16-p3-x6.c
    49  float vx5 = x[5];  in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6() local
    57  const float vz5 = vx5 * vprescale;  in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6()
    174  float vy5 = vx5 * vbeta;  in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6()
    191  if XNN_UNPREDICTABLE(vx5 < 0.0f) {  in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6()

D | velu-scalar-rr2-p6-x6.c
    49  float vx5 = x[5];  in xnn_f32_velu_ukernel__scalar_rr2_p6_x6() local
    57  const float vz5 = vx5 * vprescale;  in xnn_f32_velu_ukernel__scalar_rr2_p6_x6()
    184  float vy5 = vx5 * vbeta;  in xnn_f32_velu_ukernel__scalar_rr2_p6_x6()
    201  if XNN_UNPREDICTABLE(vx5 < 0.0f) {  in xnn_f32_velu_ukernel__scalar_rr2_p6_x6()

D | velu-avx2-rr1-lut4-p4-perm-x56.c
    49  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56() local
    58  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56()
    163  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56()
    172  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56()

D | velu-avx2-rr1-lut8-p4-perm-x56.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56() local
    57  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56()
    162  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56()
    171  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56()

D | velu-avx2-rr1-lut16-p3-gather-x56.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56() local
    57  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56()
    162  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56()
    171  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56()

D | velu-avx2-rr1-p6-x56.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x56() local
    57  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_p6_x56()
    164  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x56()
    173  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x56()

D | velu-avx2-rr1-p6-x64.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x64() local
    58  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_p6_x64()
    178  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x64()
    189  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x64()

D | velu-avx2-rr1-lut4-p4-perm-x64.c
    49  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64() local
    59  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64()
    177  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64()
    188  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64()

D | velu-avx2-rr1-lut8-p4-perm-x64.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64() local
    58  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64()
    176  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64()
    187  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64()

D | velu-avx2-rr1-lut16-p3-gather-x64.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64() local
    58  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64()
    176  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64()
    187  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64()

D | velu-avx2-rr1-p6-x72.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x72() local
    59  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_p6_x72()
    192  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x72()
    205  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_p6_x72()

D | velu-avx2-rr1-lut4-p4-perm-x72.c
    49  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72() local
    60  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72()
    191  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72()
    204  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72()

D | velu-avx-rr2-lut4-p4-perm-x48.c
    52  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48() local
    60  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()
    179  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()
    186  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()

D | velu-avx2-rr1-lut16-p3-gather-x72.c
    48  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72() local
    59  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72()
    190  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72()
    203  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72()

D | velu-avx-rr2-p6-x48.c
    50  __m256 vx5 = _mm256_loadu_ps(x + 40);  in xnn_f32_velu_ukernel__avx_rr2_p6_x48() local
    58  const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));  in xnn_f32_velu_ukernel__avx_rr2_p6_x48()
    172  vx5 = _mm256_mul_ps(vx5, vbeta);  in xnn_f32_velu_ukernel__avx_rr2_p6_x48()
    179  const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);  in xnn_f32_velu_ukernel__avx_rr2_p6_x48()
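Every f32-velu reference above shows the same four per-element steps: the input is pre-scaled and clamped (vz5), a negative-branch value ve5 (alpha * expm1 of the clamped value, evaluated in these kernels with a degree-6 polynomial or a small lookup table) is computed elsewhere in the function, the positive branch is vx5 * vbeta, and the result is selected by the sign of the input, via _mm256_blendv_ps in the AVX/AVX2 variants and an XNN_UNPREDICTABLE branch in the scalar variants. A scalar reference sketch of that structure follows; libm's expm1f stands in for the kernels' approximation, and the parameters are taken as plain arguments (an assumption, not the kernels' params struct).

    #include <math.h>

    // ELU structure shared by the referenced kernels: y = beta * x for x >= 0,
    // alpha * (exp(prescale * x) - 1) for x < 0, with the pre-scaled input
    // clamped at sat_cutoff so the exponential saturates cleanly.
    static float f32_velu_ref(float vx, float vprescale, float valpha,
                              float vbeta, float vsat_cutoff) {
      // Pre-scale and clamp, cf. vz5 = _mm256_max_ps(vsat_cutoff, vx5 * vprescale).
      const float vz = fmaxf(vx * vprescale, vsat_cutoff);
      // Negative branch: alpha * expm1(z); the kernels use a p6 polynomial or a
      // lut4/lut8/lut16 table plus a low-degree polynomial instead of expm1f.
      const float ve = valpha * expm1f(vz);
      // Positive branch, cf. vx5 = _mm256_mul_ps(vx5, vbeta).
      float vy = vx * vbeta;
      // Sign-based select, cf. _mm256_blendv_ps(vx5, ve5, vx5) in the SIMD kernels
      // or the scalar kernels' `if XNN_UNPREDICTABLE(vx5 < 0.0f)` branch.
      if (vx < 0.0f) {
        vy = ve;
      }
      return vy;
    }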