/external/XNNPACK/src/f16-raddstoreexpminusmax/gen/
D | neonfp16arith-rr2-p2-x96.c |
    77  float16x8_t vnA = vfmaq_f16(vmagic_bias, vxA, vlog2e);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96() local
    90  const float16x8_t vsA = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vnA), 10));  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96()
   103  vnA = vsubq_f16(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96()
   116  float16x8_t vtA = vfmaq_f16(vxA, vnA, vminus_ln2_hi);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96()
   129  vtA = vfmaq_f16(vtA, vnA, vminus_ln2_lo);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96()
D | neonfp16arith-rr2-p2-x96-acc3.c |
    79  float16x8_t vnA = vfmaq_f16(vmagic_bias, vxA, vlog2e);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3() local
    92  const float16x8_t vsA = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vnA), 10));  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3()
   105  vnA = vsubq_f16(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3()
   118  float16x8_t vtA = vfmaq_f16(vxA, vnA, vminus_ln2_hi);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3()
   131  vtA = vfmaq_f16(vtA, vnA, vminus_ln2_lo);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3()
D | neonfp16arith-rr2-p2-x96-acc2.c |
    78  float16x8_t vnA = vfmaq_f16(vmagic_bias, vxA, vlog2e);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2() local
    91  const float16x8_t vsA = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vnA), 10));  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2()
   104  vnA = vsubq_f16(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2()
   117  float16x8_t vtA = vfmaq_f16(vxA, vnA, vminus_ln2_hi);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2()
   130  vtA = vfmaq_f16(vtA, vnA, vminus_ln2_lo);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2()
D | neonfp16arith-rr2-p2-x96-acc6.c |
    82  float16x8_t vnA = vfmaq_f16(vmagic_bias, vxA, vlog2e);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6() local
    95  const float16x8_t vsA = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vnA), 10));  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6()
   108  vnA = vsubq_f16(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6()
   121  float16x8_t vtA = vfmaq_f16(vxA, vnA, vminus_ln2_hi);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6()
   134  vtA = vfmaq_f16(vtA, vnA, vminus_ln2_lo);  in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6()
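All of the neonfp16arith hits above show the same unrolled exp core for one register of inputs: round n = x*log2(e) with a magic bias, build 2^n by shifting the integer bits into the fp16 exponent field, then reduce with a two-constant (rr2) split of -ln(2) before a short polynomial. Below is a minimal standalone sketch of just that core, under stated assumptions: the function name is mine, the polynomial coefficients are naive Taylor values rather than the tuned ones in the generated kernels, and it omits the max-subtraction and accumulation that surround these lines in the real ukernels.

// Sketch of the fp16 "rr2 p2" exp core visible in the listed lines.
// Assumptions: exp_rr2_p2_sketch is a hypothetical name; vc1/vc2 are
// illustrative Taylor coefficients, not XNNPACK's minimax values.
#include <arm_neon.h>  // requires ARMv8.2-A FP16 arithmetic (+fp16)

static float16x8_t exp_rr2_p2_sketch(float16x8_t vx) {
  // 1.5*2^10 + 15 = 1551: adding it rounds n into the low mantissa bits
  // with the fp16 exponent bias (15) already folded in.
  const float16x8_t vmagic_bias   = vdupq_n_f16(0x1.83Cp+10f);
  const float16x8_t vlog2e        = vdupq_n_f16(0x1.714p+0f);   // ~log2(e) in fp16
  const float16x8_t vminus_ln2_hi = vdupq_n_f16(-0x1.630p-1f);  // high part of -ln(2)
  const float16x8_t vminus_ln2_lo = vdupq_n_f16(0x1.BD0p-13f);  // residual so hi+lo ~ -ln(2)
  const float16x8_t vc2           = vdupq_n_f16(0x1.0p-1f);     // illustrative 1/2
  const float16x8_t vc1           = vdupq_n_f16(0x1.0p+0f);     // illustrative 1

  // n = round(x * log2(e)), still carrying the magic bias.
  float16x8_t vn = vfmaq_f16(vmagic_bias, vx, vlog2e);
  // s = 2^n: shift the integer (bias included) into the 5-bit exponent field.
  const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
  // Undo the bias so n is a plain number again.
  vn = vsubq_f16(vn, vmagic_bias);
  // t = x - n*ln(2), done in two fused steps (the "rr2" extended reduction).
  float16x8_t vt = vfmaq_f16(vx, vn, vminus_ln2_hi);
  vt = vfmaq_f16(vt, vn, vminus_ln2_lo);
  // p(t) ~ (e^t - 1)/t to low degree, then e^x ~ s + (s*t)*p(t).
  const float16x8_t vp = vfmaq_f16(vc1, vc2, vt);
  vt = vmulq_f16(vt, vs);
  return vfmaq_f16(vs, vp, vt);
}

The hi/lo split of -ln(2) compensates for fp16's 10-bit significand; the avx2 rows below do the same reduction in fp32, where a single -ln(2) constant (rr1) is precise enough.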
D | avx2-rr1-p2-x96-acc6.c |
    82  __m256 vnA = _mm256_fmadd_ps(vxA, vlog2e, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6() local
    95  const __m256 vsA = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vnA), 23));  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6()
   108  vnA = _mm256_sub_ps(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6()
   121  __m256 vtA = _mm256_fmadd_ps(vnA, vminus_ln2, vxA);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6()
D | avx2-rr1-p2-x96-acc3.c |
    79  __m256 vnA = _mm256_fmadd_ps(vxA, vlog2e, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3() local
    92  const __m256 vsA = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vnA), 23));  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3()
   105  vnA = _mm256_sub_ps(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3()
   118  __m256 vtA = _mm256_fmadd_ps(vnA, vminus_ln2, vxA);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3()
D | avx2-rr1-p2-x96-acc2.c |
    78  __m256 vnA = _mm256_fmadd_ps(vxA, vlog2e, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2() local
    91  const __m256 vsA = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vnA), 23));  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2()
   104  vnA = _mm256_sub_ps(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2()
   117  __m256 vtA = _mm256_fmadd_ps(vnA, vminus_ln2, vxA);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2()
D | avx2-rr1-p2-x96.c |
    77  __m256 vnA = _mm256_fmadd_ps(vxA, vlog2e, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96() local
    90  const __m256 vsA = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vnA), 23));  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96()
   103  vnA = _mm256_sub_ps(vnA, vmagic_bias);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96()
   116  __m256 vtA = _mm256_fmadd_ps(vnA, vminus_ln2, vxA);  in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96()
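The avx2 rows are the same pattern in single precision: the vnA lines round n = x*log2(e) with a magic bias, the vsA lines shift the integer into the 23-bit-mantissa/8-bit-exponent fp32 layout, and the vtA lines do a one-constant (rr1) -ln(2) reduction. A minimal sketch of that core follows, again under assumptions: the function name is mine, the polynomial coefficients are naive Taylor values, and the fp16 load/convert and accumulation steps of the real ukernels are omitted.

// Sketch of the fp32 "rr1 p2" exp core visible in the avx2 lines above.
// Assumptions: exp_rr1_p2_sketch is a hypothetical name; vc1/vc2 are
// illustrative Taylor coefficients, not XNNPACK's minimax values.
#include <immintrin.h>  // compile with -mavx2 -mfma

static __m256 exp_rr1_p2_sketch(__m256 vx) {
  // 1.5*2^23 + 127: rounds n into the low mantissa bits with the
  // fp32 exponent bias (127) already folded in.
  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp+23f);
  const __m256 vlog2e      = _mm256_set1_ps(0x1.715476p+0f);  // log2(e)
  const __m256 vminus_ln2  = _mm256_set1_ps(-0x1.62E43p-1f);  // -ln(2)
  const __m256 vc2         = _mm256_set1_ps(0x1.0p-1f);       // illustrative 1/2
  const __m256 vc1         = _mm256_set1_ps(0x1.0p+0f);       // illustrative 1

  // n = round(x * log2(e)), still carrying the magic bias.
  __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
  // s = 2^n: shift the biased integer into the 8-bit exponent field.
  const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
  vn = _mm256_sub_ps(vn, vmagic_bias);
  // Single fused reduction step: t = x - n*ln(2) is accurate enough in fp32.
  __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);
  // p(t) ~ (e^t - 1)/t to low degree, then e^x ~ s + (s*t)*p(t).
  const __m256 vp = _mm256_fmadd_ps(vc2, vt, vc1);
  vt = _mm256_mul_ps(vt, vs);
  return _mm256_fmadd_ps(vp, vt, vs);
}

The shift count differs from the NEON variants (23 instead of 10) only because fp32 has 23 significand bits, and the pre-added bias is 127 instead of 15 for the same reason.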