/external/XNNPACK/src/f32-raddextexp/ |
D | avx512f-p5-scalef.c.in |
  41 __m512 vaccv${K} = _mm512_setzero_ps();
  102 vaccv${K} = _mm512_scalef_ps(vaccv${K}, vdelta_acce${K});
  104 …vaccv${N % ACCUMULATORS} = _mm512_add_ps(vaccv${N % ACCUMULATORS}, _mm512_scalef_ps(vp${N}, vdelta…
  127 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0);
  129 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv${K}, vdelta_acce${K}));
  132 __m512 vaccv = vaccv0;
  159 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce);
  160 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e));
  191 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce);
  192 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e));
  [all …]
|
D | avx2-p5.c.in |
  45 __m256 vaccv${K} = _mm256_setzero_ps();
  120 vaccv${K} = _mm256_mul_ps(vaccv${K}, vaccs${K});
  122 vaccv${N % ACCUMULATORS} = _mm256_fmadd_ps(vp${N}, vs${N}, vaccv${N % ACCUMULATORS});
  148 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0);
  150 vaccv = _mm256_fmadd_ps(vaccv${K}, vaccs${K}, vaccv);
  153 __m256 vaccv = vaccv0;
  188 vaccv = _mm256_mul_ps(vaccv, vaccs);
  189 vaccv = _mm256_fmadd_ps(vp, vs, vaccv);
  232 vaccv = _mm256_mul_ps(vaccv, vaccs);
  233 vaccv = _mm256_fmadd_ps(vp, vs, vaccv);
  [all …]
|
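The avx512f-p5-scalef.c.in template listed above keeps each partial sum in an extended-exponent form: a mantissa vector (vaccv${K}) paired with a separate exponent vector (vacce${K}), so the running value is vaccv * 2^vacce and never overflows a single float. The matches at template lines 127-129 are the step that merges the partial accumulators with _mm512_scalef_ps. Below is a minimal stand-alone sketch of that merge; it mirrors the names in the listing but is not the actual XNNPACK code, and the helper function itself is hypothetical.

#include <immintrin.h>

/* Sketch: merge two extended-exponent partial accumulators
   (vaccv0, vacce0) and (vaccv1, vacce1), each representing vaccv * 2^vacce. */
static inline __m512 merge_extext_accumulators(
    __m512 vaccv0, __m512 vacce0,
    __m512 vaccv1, __m512 vacce1,
    __m512* out_acce)
{
  /* The common exponent is the larger of the two accumulator exponents. */
  const __m512 vmax_acce = _mm512_max_ps(vacce0, vacce1);
  const __m512 vdelta_acce0 = _mm512_sub_ps(vacce0, vmax_acce);
  const __m512 vdelta_acce1 = _mm512_sub_ps(vacce1, vmax_acce);

  /* Rescale both mantissas to the common exponent, then add them. */
  __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0);
  vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1));

  *out_acce = vmax_acce;  /* merged value is vaccv * 2^vmax_acce */
  return vaccv;
}

The generated AVX512F kernels in gen/ below finish in the same way: remainder elements are folded in with the masked variants (_mm512_mask_scalef_ps / _mm512_mask_add_ps), and the scalar result comes from scaling the accumulator one last time and reducing it horizontally, as in their sum[0] = _mm512_reduce_add_ps(...) lines.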
/external/XNNPACK/src/f32-raddextexp/gen/ |
D | avx512f-p5-scalef-x192-acc6.c |
  260 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() local
  261 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  262 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  263 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv3, vdelta_acce3)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  264 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv4, vdelta_acce4)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  265 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv5, vdelta_acce5)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  292 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  293 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  324 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  325 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
  [all …]
|
D | avx512f-p5-scalef-x128-acc4.c |
  198 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() local
  199 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
  200 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
  201 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv3, vdelta_acce3)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
  228 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
  229 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
  260 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
  261 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
  269 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
|
D | avx512f-p5-scalef-x160-acc5.c |
  230 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() local
  231 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  232 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  233 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv3, vdelta_acce3)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  234 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv4, vdelta_acce4)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  261 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  262 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  293 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  294 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
  302 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
|
D | avx2-p5-x64-acc4.c |
  229 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() local
  230 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  231 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  232 vaccv = _mm256_fmadd_ps(vaccv3, vaccs3, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  267 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  268 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  311 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  312 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  324 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
  325 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
|
D | avx512f-p5-scalef-x144-acc3.c |
  204 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() local
  205 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
  206 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
  233 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
  234 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
  265 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
  266 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
  274 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
|
D | avx512f-p5-scalef-x128-acc2.c |
  184 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() local
  185 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
  212 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
  213 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
  244 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
  245 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
  253 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
|
D | avx2-p5-x96-acc6.c |
  299 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6() local
  300 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  301 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  302 vaccv = _mm256_fmadd_ps(vaccv3, vaccs3, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  303 vaccv = _mm256_fmadd_ps(vaccv4, vaccs4, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  304 vaccv = _mm256_fmadd_ps(vaccv5, vaccs5, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  339 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  340 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  383 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  384 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
  [all …]
|
D | avx2-p5-x80-acc5.c |
  265 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() local
  266 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  267 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  268 vaccv = _mm256_fmadd_ps(vaccv3, vaccs3, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  269 vaccv = _mm256_fmadd_ps(vaccv4, vaccs4, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  304 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  305 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  348 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  349 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  361 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
  [all …]
|
D | avx2-p5-x72-acc3.c |
  234 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() local
  235 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
  236 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
  271 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
  272 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
  315 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
  316 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
  328 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
  329 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
|
D | avx512f-p5-scalef-x160-acc2.c |
  208 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() local
  209 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
  236 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
  237 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
  268 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
  269 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
  277 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
|
D | avx512f-p5-scalef-x128.c |
  174 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() local
  201 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
  202 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
  233 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
  234 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
  242 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
|
D | avx512f-p5-scalef-x192-acc3.c |
  240 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() local
  241 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
  242 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
  269 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
  270 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
  301 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
  302 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
  310 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
|
D | avx2-p5-x64-acc2.c |
  211 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() local
  212 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
  247 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
  248 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
  291 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
  292 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
  304 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
  305 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
|
D | avx512f-p5-scalef-x144.c |
  186 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() local
  213 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
  214 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
  245 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
  246 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
  254 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
|
D | avx2-p5-x80-acc2.c |
  237 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() local
  238 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
  273 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
  274 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
  317 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
  318 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
  330 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
  331 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
|
D | avx512f-p5-scalef-x192-acc2.c |
  232 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() local
  233 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
  260 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
  261 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
  292 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
  293 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
  301 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
|
D | avx2-p5-x64.c |
  197 __m256 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx2_p5_x64() local
  232 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
  233 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
  276 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
  277 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
  289 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
  290 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
|
D | avx2-p5-x96-acc3.c |
  273 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() local
  274 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
  275 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
  310 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
  311 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
  354 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
  355 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
  367 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
  368 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
|
D | avx512f-p5-scalef-x160.c |
  198 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() local
  225 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
  226 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
  257 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
  258 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
  266 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
|
D | avx2-p5-x72.c |
  210 __m256 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx2_p5_x72() local
  245 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
  246 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
  289 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
  290 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
  302 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
  303 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
|
D | avx2-p5-x80.c |
  223 __m256 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx2_p5_x80() local
  258 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
  259 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
  302 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
  303 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
  315 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
  316 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
|
D | avx512f-p5-scalef-x192.c |
  222 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() local
  249 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
  250 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
  281 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
  282 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
  290 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
|
D | avx2-p5-x96-acc2.c |
  263 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() local
  264 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
  299 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
  300 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
  343 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
  344 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
  356 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
  357 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
|
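The avx2-p5 kernels above implement the same extended-exponent accumulation without the AVX512 scalef instruction: when a new partial term vp (with its exponent delta already encoded as a scale factor vs) is folded in, the old accumulator mantissa is first rescaled by vaccs and the new term is then added with an FMA, which is the recurring mul/fmadd pair in the listing (vaccv = _mm256_mul_ps(vaccv, vaccs); vaccv = _mm256_fmadd_ps(vp, vs, vaccv)). The sketch below shows one plausible way to build such scale factors and apply the update; the exp2_int_ps helper and the exact construction of vaccs and vs are illustrative assumptions, not the kernel's actual code.

#include <immintrin.h>

/* Hypothetical helper: build 2^e for an integer-valued float vector e by
   placing the biased exponent into the float's exponent field (assumes e
   stays within the normal exponent range, roughly -126 ... 127). */
static inline __m256 exp2_int_ps(__m256 ve) {
  const __m256i vbias = _mm256_set1_epi32(127);
  const __m256i vexp = _mm256_add_epi32(_mm256_cvtps_epi32(ve), vbias);
  return _mm256_castsi256_ps(_mm256_slli_epi32(vexp, 23));
}

/* Sketch: fold a new extended-exponent term (vp, ve) into the running
   accumulator (vaccv, vacce); mirrors the mul + fmadd pattern in the
   listing above but is not the actual XNNPACK implementation. */
static inline void accumulate_extext(__m256* vaccv, __m256* vacce,
                                     __m256 vp, __m256 ve) {
  const __m256 vmax_e = _mm256_max_ps(*vacce, ve);
  const __m256 vaccs = exp2_int_ps(_mm256_sub_ps(*vacce, vmax_e));  /* 2^(acce - max) */
  const __m256 vs    = exp2_int_ps(_mm256_sub_ps(ve, vmax_e));      /* 2^(e - max)    */
  *vaccv = _mm256_mul_ps(*vaccv, vaccs);     /* rescale old accumulator */
  *vaccv = _mm256_fmadd_ps(vp, vs, *vaccv);  /* add rescaled new term   */
  *vacce = vmax_e;
}

At the end of each kernel the eight lanes of vaccv are collapsed to a scalar by adding the upper and lower 128-bit halves (the vaccv_sum lines in the listing) and then reducing within the 128-bit vector. Compiling a sketch like this requires AVX2 and FMA support (for example -mavx2 -mfma with GCC or Clang).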