/external/XNNPACK/src/f32-vsqrt/gen/ |
D | avx512f-nr1fma1adj-x80.c | in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80():
      34  const __m512 vx4 = _mm512_loadu_ps(x + 64);  (local)
      41  const __m512 vrsqrtx4 = _mm512_rsqrt14_ps(vx4);
      51  __m512 vsqrtx4 = _mm512_mul_ps(vrsqrtx4, vx4);
      75  const __m512 vadjustment4 = _mm512_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
D | avx512f-nr1fma1adj-x96.c | in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96():
      34  const __m512 vx4 = _mm512_loadu_ps(x + 64);  (local)
      42  const __m512 vrsqrtx4 = _mm512_rsqrt14_ps(vx4);
      53  __m512 vsqrtx4 = _mm512_mul_ps(vrsqrtx4, vx4);
      82  const __m512 vadjustment4 = _mm512_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
D | fma3-nr1fma1adj-x40.c | in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40():
      33  const __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      40  const __m256 vrsqrtx4 = _mm256_rsqrt_ps(vx4);
      50  __m256 vsqrtx4 = _mm256_mul_ps(vrsqrtx4, vx4);
      74  const __m256 vadjustment4 = _mm256_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
D | fma3-nr1fma1adj-x48.c | in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48():
      33  const __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      41  const __m256 vrsqrtx4 = _mm256_rsqrt_ps(vx4);
      52  __m256 vsqrtx4 = _mm256_mul_ps(vrsqrtx4, vx4);
      81  const __m256 vadjustment4 = _mm256_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
D | avx512f-nr1fma1adj-x112.c | in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112():
      34  const __m512 vx4 = _mm512_loadu_ps(x + 64);  (local)
      43  const __m512 vrsqrtx4 = _mm512_rsqrt14_ps(vx4);
      55  __m512 vsqrtx4 = _mm512_mul_ps(vrsqrtx4, vx4);
      89  const __m512 vadjustment4 = _mm512_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
D | fma3-nr1fma1adj-x56.c | in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56():
      33  const __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      42  const __m256 vrsqrtx4 = _mm256_rsqrt_ps(vx4);
      54  __m256 vsqrtx4 = _mm256_mul_ps(vrsqrtx4, vx4);
      88  const __m256 vadjustment4 = _mm256_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
D | avx512f-nr1fma1adj-x128.c | in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128():
      34  const __m512 vx4 = _mm512_loadu_ps(x + 64);  (local)
      44  const __m512 vrsqrtx4 = _mm512_rsqrt14_ps(vx4);
      57  __m512 vsqrtx4 = _mm512_mul_ps(vrsqrtx4, vx4);
      96  const __m512 vadjustment4 = _mm512_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
D | fma3-nr1fma1adj-x64.c | in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64():
      33  const __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      43  const __m256 vrsqrtx4 = _mm256_rsqrt_ps(vx4);
      56  __m256 vsqrtx4 = _mm256_mul_ps(vrsqrtx4, vx4);
      95  const __m256 vadjustment4 = _mm256_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
|
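All of the f32-vsqrt references above trace the same nr1fma1adj scheme: load a block of inputs, take a hardware reciprocal-square-root estimate (_mm512_rsqrt14_ps / _mm256_rsqrt_ps), multiply it by x to get an initial sqrt(x) estimate, and finish with an FMA-based adjustment, where _mm*_fnmadd_ps(vsqrtx, vsqrtx, vx) computes the residual x - sqrtx*sqrtx. The vx4 block is simply the fifth vector of each unrolled iteration (offset 64 floats for __m512, 32 for __m256). A minimal scalar sketch of the scheme follows, for orientation only; the intermediate Newton-Raphson bookkeeping and the names below are assumptions, not copied from the generated kernels.

    #include <math.h>

    /* Scalar sketch of the nr1fma1adj square-root scheme.  Only the load,
     * rsqrt-estimate, multiply, and fnmadd-adjustment steps appear in the
     * references above; the half-rsqrt / Newton-Raphson bookkeeping here
     * is an assumption. */
    static float f32_sqrt_nr1fma1adj(float x) {
      float rsqrtx = 1.0f / sqrtf(x);                  /* stands in for the rsqrt estimate intrinsic */
      float sqrtx = rsqrtx * x;                        /* initial sqrt(x) estimate (the mul step) */
      float halfrsqrtx = 0.5f * rsqrtx;
      const float residual = fmaf(-sqrtx, halfrsqrtx, 0.5f);  /* one Newton-Raphson correction term */
      halfrsqrtx = fmaf(halfrsqrtx, residual, halfrsqrtx);
      sqrtx = fmaf(sqrtx, residual, sqrtx);            /* refined estimate after one NR step ("nr1") */
      const float adjustment = fmaf(-sqrtx, sqrtx, x); /* the fnmadd step: x - sqrtx * sqrtx */
      return fmaf(halfrsqrtx, adjustment, sqrtx);      /* one final adjustment ("1adj") */
    }

The vector kernels apply the same steps per 256-bit or 512-bit lane, unrolled by the element count in the file name (x80, x96, ...).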
/external/XNNPACK/src/f32-velu/gen/ |
D | velu-avx2-rr1-lut8-p4-perm-x40.c | in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x40():
      43  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      50  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     128  vx4 = _mm256_mul_ps(vx4, vbeta);
     134  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-p6-x40.c | in xnn_f32_velu_ukernel__avx2_rr1_p6_x40():
      44  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      51  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     131  vx4 = _mm256_mul_ps(vx4, vbeta);
     137  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-lut16-p3-gather-x40.c | in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x40():
      44  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      51  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     129  vx4 = _mm256_mul_ps(vx4, vbeta);
     135  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-lut4-p4-perm-x40.c | in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x40():
      43  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      50  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     128  vx4 = _mm256_mul_ps(vx4, vbeta);
     134  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-scalar-rr2-lut16-p3-x5.c | in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x5():
      46  float vx4 = x[4];  (local)
      53  const float vz4 = vx4 * vprescale;
     152  float vy4 = vx4 * vbeta;
     166  if XNN_UNPREDICTABLE(vx4 < 0.0f) {
|
D | velu-scalar-rr2-p6-x5.c | in xnn_f32_velu_ukernel__scalar_rr2_p6_x5():
      46  float vx4 = x[4];  (local)
      53  const float vz4 = vx4 * vprescale;
     161  float vy4 = vx4 * vbeta;
     175  if XNN_UNPREDICTABLE(vx4 < 0.0f) {
|
D | velu-avx2-rr1-lut8-p4-perm-x48.c | in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48():
      43  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      51  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     142  vx4 = _mm256_mul_ps(vx4, vbeta);
     150  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-lut16-p3-gather-x48.c | in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48():
      44  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      52  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     143  vx4 = _mm256_mul_ps(vx4, vbeta);
     151  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-lut4-p4-perm-x48.c | in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48():
      43  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      51  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     142  vx4 = _mm256_mul_ps(vx4, vbeta);
     150  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-p6-x48.c | in xnn_f32_velu_ukernel__avx2_rr1_p6_x48():
      44  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      52  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     145  vx4 = _mm256_mul_ps(vx4, vbeta);
     153  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-scalar-rr2-lut16-p3-x6.c | in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6():
      46  float vx4 = x[4];  (local)
      54  const float vz4 = vx4 * vprescale;
     170  float vy4 = vx4 * vbeta;
     186  if XNN_UNPREDICTABLE(vx4 < 0.0f) {
|
D | velu-avx2-rr1-lut16-p3-gather-x56.c | in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56():
      44  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      53  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     157  vx4 = _mm256_mul_ps(vx4, vbeta);
     167  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-p6-x56.c | in xnn_f32_velu_ukernel__avx2_rr1_p6_x56():
      44  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      53  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     159  vx4 = _mm256_mul_ps(vx4, vbeta);
     169  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx-rr2-p6-x40.c | in xnn_f32_velu_ukernel__avx_rr2_p6_x40():
      46  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      53  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     150  vx4 = _mm256_mul_ps(vx4, vbeta);
     156  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-lut8-p4-perm-x56.c | in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56():
      43  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      52  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     156  vx4 = _mm256_mul_ps(vx4, vbeta);
     166  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
D | velu-avx2-rr1-lut4-p4-perm-x56.c | in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56():
      43  __m256 vx4 = _mm256_loadu_ps(x + 32);  (local)
      52  const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
     156  vx4 = _mm256_mul_ps(vx4, vbeta);
     166  const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
|
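The f32-velu references above follow a common ELU pattern: prescale the input and clamp it at a saturation cutoff (vz = max(vsat_cutoff, vx * vprescale)), evaluate exp(vz) - 1 via a polynomial or LUT and scale it by alpha (the ve values), scale the original input by beta for the positive side (vx * vbeta), and blend the two on the sign of vx. In the AVX paths, _mm256_blendv_ps(vx, ve, vx) selects ve for lanes whose sign bit is set; the scalar paths branch on vx < 0.0f. A scalar sketch under those assumptions follows; expm1f stands in for the kernels' rr1/rr2 range reduction and polynomial/LUT approximation, and the cutoff is passed in rather than hard-coded.

    #include <math.h>

    /* Scalar sketch of the ELU pattern traced by the f32-velu references.
     * prescale / alpha / beta mirror the kernel parameters; sat_cutoff is
     * the clamp applied to the prescaled input.  expm1f replaces the
     * kernels' polynomial/LUT evaluation of exp(z) - 1. */
    static float f32_elu(float x, float prescale, float alpha, float beta,
                         float sat_cutoff) {
      float z = x * prescale;
      if (z < sat_cutoff) {
        z = sat_cutoff;                   /* _mm256_max_ps(vsat_cutoff, ...) in the listing */
      }
      const float e = alpha * expm1f(z);  /* negative-side result (the ve values) */
      const float y_pos = x * beta;       /* positive-side result (vx * vbeta in the listing) */
      return (x < 0.0f) ? e : y_pos;      /* blendv on the sign of vx / scalar vx < 0.0f branch */
    }

The clamp matters because the negative-side approximation saturates: once z is below the cutoff, exp(z) - 1 is effectively -1, so clamping avoids feeding out-of-range values into the polynomial or table.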
/external/XNNPACK/src/x8-lut/gen/ |
D | lut-scalar-x8.c | in xnn_x8_lut_ukernel__scalar_x8():
      31  const size_t vx4 = (size_t) x[4];  (local)
      41  const uint32_t vt4 = (uint32_t) t[vx4];
|
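The single x8-lut reference shows the scalar lookup pattern: each input byte is widened to a size_t index and replaced by the corresponding table entry. A sketch under that assumption; the function name and signature below are illustrative, not XNNPACK's actual ukernel prototype.

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the scalar x8 LUT pattern from the last entry: widen each
     * byte to an index, then look it up in a 256-entry table. */
    static void x8_lut_scalar(size_t n, const uint8_t* x, uint8_t* y,
                              const uint8_t t[256]) {
      for (size_t i = 0; i < n; i++) {
        const size_t vx = (size_t) x[i];  /* "const size_t vx4 = (size_t) x[4]" in the listing */
        y[i] = t[vx];                     /* "t[vx4]" in the listing */
      }
    }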