/external/XNNPACK/src/f32-raddstoreexpminusmax/gen/
D | scalar-lut64-p2-x2.c | in xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2():
      60  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;   (local)
      73  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
      77  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      84  vn0 -= vmagic_bias;
      89  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
      92  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-lut64-p2-x2-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2_acc2():
      61  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;   (local)
      74  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
      78  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      85  vn0 -= vmagic_bias;
      90  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
      93  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-p5-x2.c | in xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2():
      57  float vn0 = vx0 * vlog2e + vmagic_bias;   (local)
      62  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
      66  vn0 -= vmagic_bias;
      71  float vt0 = vn0 * vminus_ln2_hi + vx0;
      74  vt0 = vn0 * vminus_ln2_lo + vt0;

D | scalar-p5-x2-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2_acc2():
      58  float vn0 = vx0 * vlog2e + vmagic_bias;   (local)
      63  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
      67  vn0 -= vmagic_bias;
      72  float vt0 = vn0 * vminus_ln2_hi + vx0;
      75  vt0 = vn0 * vminus_ln2_lo + vt0;

D | scalar-lut64-p2-x4.c | in xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4():
      64  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;   (local)
      79  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
      85  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      96  vn0 -= vmagic_bias;
     103  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
     108  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-lut64-p2-x4-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc2():
      65  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;   (local)
      80  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
      86  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      97  vn0 -= vmagic_bias;
     104  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
     109  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-lut64-p2-x4-acc4.c | in xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc4():
      67  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;   (local)
      82  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
      88  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      99  vn0 -= vmagic_bias;
     106  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
     111  vt0 = vn0 * vminus_ln2_o64_lo + vt0;
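Every row above is the same range reduction applied to a different unrolling of the softmax-style kernel (as the directory name suggests, these kernels compute exp(x - max), store it, and accumulate the sum). A compact scalar sketch of the p5 rows may make the side-by-side reading easier. The hexfloat constants and the fp32 bit-cast helpers below are written out locally and are my reading of typical p5 exp kernels, not values quoted from these files; the short Taylor stand-in at the end replaces the tuned degree-5 polynomial the generated code actually evaluates. The lut64 rows differ only in that they scale by 64*log2(e) (vlog2e_x64), keep the low six bits of the rounded value as a table index (the `& vindex_mask` lines), and shift the remaining bits toward the exponent field (the mask-then-`<< 17` lines).

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t fp32_to_bits(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
    static inline float fp32_from_bits(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

    /* exp(x) ~= 2**n * exp(t), with n = round(x * log2(e)) recovered via the magic-bias
     * trick and t = x - n*ln(2) computed in two steps (the "rr2" / hi+lo part).
     * Only meant for the non-positive x these kernels see after subtracting the row max. */
    static float exp_rr2_p5_sketch(float vx) {
      const float vmagic_bias   = 0x1.8000FEp23f;   /* 1.5*2**23 + 127: rounds n and parks n+127 in the low mantissa bits */
      const float vlog2e        = 0x1.715476p+0f;
      const float vminus_ln2_hi = -0x1.62E43p-1f;   /* -ln(2), split into a high part ... */
      const float vminus_ln2_lo = 0x1.05C61p-29f;   /* ... and a small low correction */

      float vn = vx * vlog2e + vmagic_bias;                      /* n + magic bias */
      const float vs = fp32_from_bits(fp32_to_bits(vn) << 23);   /* vs = 2**n: shift n+127 into the exponent field */
      vn -= vmagic_bias;                                         /* recover n as a float */
      float vt = vn * vminus_ln2_hi + vx;                        /* t = x - n*ln2, high part first ... */
      vt = vn * vminus_ln2_lo + vt;                              /* ... then the low correction */
      const float vp = 1.0f + vt * (1.0f + vt * (0.5f + vt * (1.0f / 6.0f)));  /* stand-in for the tuned p5 polynomial */
      return vs * vp;
    }

The x2/x4 and accN suffixes in the file names change only how many elements are processed per loop iteration and how many partial sums are kept, which is why the snippets above differ only in source line numbers.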
/external/XNNPACK/src/f32-sigmoid/gen/ |
D | scalar-lut2048-p1-div-x2.c | in xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x2():
      47  float vn0 = vz0 * vminus_log2e + vmagic_bias;   (local)
      50  const uint32_t ve0 = fp32_to_bits(vn0) << 12;
      53  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      58  vn0 -= vmagic_bias;
      61  float vt0 = vn0 * vln2_hi + vz0;
      64  vt0 = vn0 * vln2_lo + vt0;

D | scalar-lut64-p2-div-x2.c | in xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2():
      47  float vn0 = vz0 * vminus_log2e + vmagic_bias;   (local)
      50  const uint32_t ve0 = fp32_to_bits(vn0) << 17;
      53  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      58  vn0 -= vmagic_bias;
      61  float vt0 = vn0 * vln2_hi + vz0;
      64  vt0 = vn0 * vln2_lo + vt0;

D | scalar-p5-div-x2.c | in xnn_f32_sigmoid_ukernel__scalar_p5_div_x2():
      47  float vn0 = vz0 * vminus_log2e + vmagic_bias;   (local)
      50  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
      53  vn0 -= vmagic_bias;
      56  float vt0 = vn0 * vln2_hi + vz0;
      59  vt0 = vn0 * vln2_lo + vt0;

D | avx512f-rr2-lut32-p2-perm2-scalef-div-x32.c | in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x32():
      54  __m512 vn0 = _mm512_fmadd_ps(vz0, vlog2e, vmagic_bias);   (local)
      57  const __m512 vl0 = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn0), vtable_hi);
      60  vn0 = _mm512_sub_ps(vn0, vmagic_bias);
      63  __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vz0);
      66  vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
      78  const __m512 ve0 = _mm512_scalef_ps(vp0, vn0);

D | scalar-lut2048-p1-div-x4.c | in xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x4():
      51  float vn0 = vz0 * vminus_log2e + vmagic_bias;   (local)
      56  const uint32_t ve0 = fp32_to_bits(vn0) << 12;
      61  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      70  vn0 -= vmagic_bias;
      75  float vt0 = vn0 * vln2_hi + vz0;
      80  vt0 = vn0 * vln2_lo + vt0;

D | scalar-lut64-p2-div-x4.c | in xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x4():
      51  float vn0 = vz0 * vminus_log2e + vmagic_bias;   (local)
      56  const uint32_t ve0 = fp32_to_bits(vn0) << 17;
      61  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      70  vn0 -= vmagic_bias;
      75  float vt0 = vn0 * vln2_hi + vz0;
      80  vt0 = vn0 * vln2_lo + vt0;

D | avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x32.c | in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_x32():
      54  __m512 vn0 = _mm512_fmadd_ps(vz0, vlog2e, vmagic_bias);   (local)
      57  const __m512 vl0 = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn0), vtable_hi);
      60  vn0 = _mm512_sub_ps(vn0, vmagic_bias);
      63  __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vz0);
      66  vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
      78  const __m512 ve0 = _mm512_scalef_ps(vp0, vn0);

D | avx512f-rr2-lut32-p2-perm2-scalef-div-x48.c | in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x48():
      56  __m512 vn0 = _mm512_fmadd_ps(vz0, vlog2e, vmagic_bias);   (local)
      60  const __m512 vl0 = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn0), vtable_hi);
      64  vn0 = _mm512_sub_ps(vn0, vmagic_bias);
      68  __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vz0);
      72  vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
      88  const __m512 ve0 = _mm512_scalef_ps(vp0, vn0);
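Two things distinguish the avx512f rows from the scalar sigmoid rows above. The scale 2**n is never rebuilt from shifted bits: vn stays a float vector and is folded in at the end with _mm512_scalef_ps (the `ve0 = _mm512_scalef_ps(vp0, vn0)` lines). And the 32-entry table lookup happens in registers via _mm512_permutex2var_ps instead of through memory. The sketch below shows only that tail plus a plain-division finish; the helper names are illustrative, not XNNPACK APIs, and vp/ve are assumed to hold the already-evaluated polynomial and the exp of the non-positive half of the input. Judging by the file names, the div kernels produce the final ratio with a true division while the nr1fma kernels use a Newton-Raphson reciprocal refined by one fma.

    #include <immintrin.h>

    /* Apply the exponent: e = vp * 2**vn in one instruction. vn is integer-valued
     * here, so the floor taken inside vscalefps changes nothing. */
    static __m512 apply_scale_with_scalef(__m512 vp, __m512 vn) {
      return _mm512_scalef_ps(vp, vn);
    }

    /* "div"-style finish, assuming ve = exp(z) for a non-positive z:
     * sigmoid(z) = e / (e + 1). */
    static __m512 sigmoid_from_exp_div(__m512 ve) {
      const __m512 vone = _mm512_set1_ps(1.0f);
      return _mm512_div_ps(ve, _mm512_add_ps(ve, vone));
    }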
/external/XNNPACK/src/f32-velu/gen/ |
D | velu-wasm-rr2-lut16-p3-x2.c | in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x2():
      51  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      54  const uint32_t ven0 = fp32_to_bits(vn0) << 19;
      55  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      56  vn0 -= vmagic_bias;
      61  float vt0 = vn0 * vminus_ln2_hi + vz0;
      66  vt0 = vn0 * vminus_ln2_lo + vt0;

D | velu-scalar-rr2-lut16-p3-x2.c | in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2():
      51  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      54  const uint32_t ven0 = fp32_to_bits(vn0) << 19;
      55  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      56  vn0 -= vmagic_bias;
      61  float vt0 = vn0 * vminus_ln2_hi + vz0;
      66  vt0 = vn0 * vminus_ln2_lo + vt0;

D | velu-wasm-rr2-lut16-p3-x3.c | in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x3():
      53  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      57  const uint32_t ven0 = fp32_to_bits(vn0) << 19;
      58  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      59  vn0 -= vmagic_bias;
      67  float vt0 = vn0 * vminus_ln2_hi + vz0;
      74  vt0 = vn0 * vminus_ln2_lo + vt0;

D | velu-scalar-rr2-lut16-p3-x3.c | in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x3():
      53  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      57  const uint32_t ven0 = fp32_to_bits(vn0) << 19;
      58  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      59  vn0 -= vmagic_bias;
      67  float vt0 = vn0 * vminus_ln2_hi + vz0;
      74  vt0 = vn0 * vminus_ln2_lo + vt0;

D | velu-wasm-rr2-p6-x2.c | in xnn_f32_velu_ukernel__wasm_rr2_p6_x2():
      51  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      54  float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
      55  vn0 -= vmagic_bias;
      59  float vt0 = vn0 * vminus_ln2_hi + vz0;
      62  vt0 = vn0 * vminus_ln2_lo + vt0;

D | velu-scalar-rr2-p6-x2.c | in xnn_f32_velu_ukernel__scalar_rr2_p6_x2():
      51  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      54  float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
      55  vn0 -= vmagic_bias;
      59  float vt0 = vn0 * vminus_ln2_hi + vz0;
      62  vt0 = vn0 * vminus_ln2_lo + vt0;

D | velu-scalar-rr2-lut16-p3-x4.c | in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4():
      55  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      60  const uint32_t ven0 = fp32_to_bits(vn0) << 19;
      61  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      62  vn0 -= vmagic_bias;
      73  float vt0 = vn0 * vminus_ln2_hi + vz0;
      82  vt0 = vn0 * vminus_ln2_lo + vt0;

D | velu-wasm-rr2-lut16-p3-x4.c | in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x4():
      55  float vn0 = vz0 * vlog2e + vmagic_bias;   (local)
      60  const uint32_t ven0 = fp32_to_bits(vn0) << 19;
      61  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
      62  vn0 -= vmagic_bias;
      73  float vt0 = vn0 * vminus_ln2_hi + vz0;
      82  vt0 = vn0 * vminus_ln2_lo + vt0;
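The velu rows run the same rr2 reduction on vz0 and differ only in how the scale is rebuilt: the lut16-p3 rows keep the low four bits of the rounded value as a table index (`& vindex_mask`) and pre-shift the remaining bits toward the exponent field (`<< 19`), while the p6 rows build the scale directly with the `<< 23` shift seen in the earlier sketch. What this path ultimately feeds is the negative half of ELU; a minimal sketch of that outer formula follows, with expf() standing in for the inlined reduction-plus-polynomial, so it shows the shape of the computation rather than the generated code.

    #include <math.h>

    /* ELU, negative half only: alpha * (exp(x) - 1) for x <= 0, identity otherwise.
     * expf() here stands in for the rr2 reduction and polynomial shown in the rows above. */
    static float elu_sketch(float x, float alpha) {
      return x > 0.0f ? x : alpha * (expf(x) - 1.0f);
    }

The wasm and scalar variants of each file show the same reduction lines at the same source line numbers; only the x2/x3/x4 unrolling shifts them.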
/external/XNNPACK/src/f32-vscaleexpminusmax/gen/ |
D | avx2-p5-x8.c | in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x8():
      54  __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias);   (local)
      58  const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
      61  vn0 = _mm256_sub_ps(vn0, vmagic_bias);
      65  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_hi, vx0);
      67  vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_lo, vt0);

D | avx2-p5-x16.c | in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x16():
      56  __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias);   (local)
      61  const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
      65  vn0 = _mm256_sub_ps(vn0, vmagic_bias);
      70  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_hi, vx0);
      73  vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_lo, vt0);
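These avx2 rows are the eight-lane version of the p5 reduction: _mm256_fmadd_ps replaces the scalar multiply-adds, and the scale is rebuilt by reinterpreting vn as integers and shifting by 23. Below is a one-line sketch of that idiom, assuming the same style of magic bias as in the scalar sketch earlier; the helper name is illustrative, not an XNNPACK function. As the directory name suggests, these kernels go on to multiply the result by a caller-supplied scale, but that happens outside the lines quoted here.

    #include <immintrin.h>

    /* vn still carries the magic bias, so its low mantissa bits hold the biased
     * exponent n+127; shifting left by 23 moves that into the exponent field,
     * giving vs = 2**n in every lane. */
    static __m256 scale_from_biased_n(__m256 vn_with_bias) {
      return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn_with_bias), 23));
    }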