/external/XNNPACK/src/f32-raddstoreexpminusmax/gen/ |
D | scalar-lut64-p2-x2-acc2.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2_acc2()
     61  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;  (local)
     74  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
     78  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
     85  vn0 -= vmagic_bias;
     90  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
     93  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-lut64-p2-x2.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x2()
     60  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;  (local)
     73  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
     77  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
     84  vn0 -= vmagic_bias;
     89  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
     92  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-p5-x2-acc2.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2_acc2()
     58  float vn0 = vx0 * vlog2e + vmagic_bias;  (local)
     63  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
     67  vn0 -= vmagic_bias;
     72  float vt0 = vn0 * vminus_ln2_hi + vx0;
     75  vt0 = vn0 * vminus_ln2_lo + vt0;

D | scalar-p5-x2.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2()
     57  float vn0 = vx0 * vlog2e + vmagic_bias;  (local)
     62  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
     66  vn0 -= vmagic_bias;
     71  float vt0 = vn0 * vminus_ln2_hi + vx0;
     74  vt0 = vn0 * vminus_ln2_lo + vt0;

D | scalar-lut64-p2-x4-acc2.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc2()
     65  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;  (local)
     80  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
     86  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
     97  vn0 -= vmagic_bias;
    104  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
    109  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-lut64-p2-x4.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4()
     64  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;  (local)
     79  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
     85  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
     96  vn0 -= vmagic_bias;
    103  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
    108  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-lut64-p2-x4-acc4.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x4_acc4()
     67  float vn0 = vx0 * vlog2e_x64 + vmagic_bias;  (local)
     82  const uint32_t ve0 = (fp32_to_bits(vn0) & UINT32_C(0xFFFFFFC0)) << 17;
     88  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
     99  vn0 -= vmagic_bias;
    106  float vt0 = vn0 * vminus_ln2_o64_hi + vx0;
    111  vt0 = vn0 * vminus_ln2_o64_lo + vt0;

D | scalar-p5-x4.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4()
     61  float vn0 = vx0 * vlog2e + vmagic_bias;  (local)
     68  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
     74  vn0 -= vmagic_bias;
     81  float vt0 = vn0 * vminus_ln2_hi + vx0;
     86  vt0 = vn0 * vminus_ln2_lo + vt0;

D | scalar-p5-x4-acc2.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2()
     62  float vn0 = vx0 * vlog2e + vmagic_bias;  (local)
     69  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
     75  vn0 -= vmagic_bias;
     82  float vt0 = vn0 * vminus_ln2_hi + vx0;
     87  vt0 = vn0 * vminus_ln2_lo + vt0;

D | scalar-p5-x4-acc4.c | xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc4()
     64  float vn0 = vx0 * vlog2e + vmagic_bias;  (local)
     71  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
     77  vn0 -= vmagic_bias;
     84  float vt0 = vn0 * vminus_ln2_hi + vx0;
     89  vt0 = vn0 * vminus_ln2_lo + vt0;
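All of the scalar kernels above share one range reduction for exp(x): vn0 = x·log2(e) + magic_bias rounds x/ln(2) to an integer held in the low mantissa bits, those bits are either shifted straight into the exponent field (p5 variants, vs0) or split into an exponent part ve0 plus a LUT index vidx0 (lut64 variants), and vt0 = x − n·ln(2) is formed with a two-term split of ln(2). Below is a minimal, self-contained sketch of the p5 flavour; it is not the generated code — the magic-bias construction is the standard one, the polynomial uses plain Taylor coefficients instead of the kernels' tuned coefficients, and the kernels' flush-to-zero handling below a denormal cutoff is omitted.

#include <stdint.h>
#include <string.h>

static inline uint32_t fp32_to_bits(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof u);
  return u;
}

static inline float fp32_from_bits(uint32_t u) {
  float f;
  memcpy(&f, &u, sizeof f);
  return f;
}

/* Approximate exp(vx) for moderately negative vx (these kernels only see
 * vx = x - max <= 0).  Illustrative only: Taylor coefficients, no
 * denormal-cutoff handling. */
static float exp_p5_sketch(float vx) {
  const float vmagic_bias   = 0x1.8000FEp23f;    /* 2^23 + 2^22 + 127 */
  const float vlog2e        = 0x1.715476p+0f;    /* log2(e) */
  const float vminus_ln2_hi = -0x1.62E400p-1f;   /* high half of -ln(2) */
  const float vminus_ln2_lo = -0x1.7F7D1Cp-20f;  /* low half of -ln(2) */

  /* n = round(vx / ln(2)), held in the low mantissa bits of vn. */
  float vn = vx * vlog2e + vmagic_bias;
  /* s = 2^n, built by shifting those bits into the exponent field. */
  const float vs = fp32_from_bits(fp32_to_bits(vn) << 23);
  vn -= vmagic_bias;

  /* t = vx - n*ln(2), with ln(2) split in two for extra precision. */
  float vt = vn * vminus_ln2_hi + vx;
  vt = vn * vminus_ln2_lo + vt;

  /* exp(t) ~ 1 + t*(1 + t/2 + t^2/6 + t^3/24 + t^4/120) on |t| <= ln(2)/2. */
  float vp = (1.0f / 120.0f) * vt + (1.0f / 24.0f);
  vp = vp * vt + (1.0f / 6.0f);
  vp = vp * vt + 0.5f;
  vp = vp * vt + 1.0f;

  vt *= vs;
  return vt * vp + vs;  /* s * exp(t) */
}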
/external/XNNPACK/src/f32-sigmoid/gen/ |
D | scalar-lut2048-p1-div-x2.c | xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x2()
     66  float vn0 = vz0 * vminus_log2e_x2048 + vmagic_bias;  (local)
     79  const uint32_t ve0 = (fp32_to_bits(vn0) & ~vindex_mask) << 12;
     83  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
     90  vn0 -= vmagic_bias;
     95  float vt0 = vn0 * vln2_o2048_hi + vz0;
     98  vt0 = vn0 * vln2_o2048_lo + vt0;

D | scalar-lut64-p2-div-x2.c | xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2()
     66  float vn0 = vz0 * vminus_log2e_x64 + vmagic_bias;  (local)
     79  const uint32_t ve0 = (fp32_to_bits(vn0) & ~vindex_mask) << 17;
     83  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
     90  vn0 -= vmagic_bias;
     95  float vt0 = vn0 * vln2_o64_hi + vz0;
     98  vt0 = vn0 * vln2_o64_lo + vt0;

D | scalar-lut2048-p1-div-x4.c | xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x4()
     70  float vn0 = vz0 * vminus_log2e_x2048 + vmagic_bias;  (local)
     85  const uint32_t ve0 = (fp32_to_bits(vn0) & ~vindex_mask) << 12;
     91  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
    102  vn0 -= vmagic_bias;
    109  float vt0 = vn0 * vln2_o2048_hi + vz0;
    114  vt0 = vn0 * vln2_o2048_lo + vt0;

D | scalar-p5-div-x2.c | xnn_f32_sigmoid_ukernel__scalar_p5_div_x2()
     64  float vn0 = vz0 * vminus_log2e + vmagic_bias;  (local)
     69  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
     73  vn0 -= vmagic_bias;
     78  float vt0 = vn0 * vln2_hi + vz0;
     81  vt0 = vn0 * vln2_lo + vt0;

D | scalar-lut64-p2-div-x4.c | xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x4()
     70  float vn0 = vz0 * vminus_log2e_x64 + vmagic_bias;  (local)
     85  const uint32_t ve0 = (fp32_to_bits(vn0) & ~vindex_mask) << 17;
     91  const uint32_t vidx0 = fp32_to_bits(vn0) & vindex_mask;
    102  vn0 -= vmagic_bias;
    109  float vt0 = vn0 * vln2_o64_hi + vz0;
    114  vt0 = vn0 * vln2_o64_lo + vt0;

D | scalar-p5-div-x4.c | xnn_f32_sigmoid_ukernel__scalar_p5_div_x4()
     68  float vn0 = vz0 * vminus_log2e + vmagic_bias;  (local)
     75  const float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
     81  vn0 -= vmagic_bias;
     88  float vt0 = vn0 * vln2_hi + vz0;
     93  vt0 = vn0 * vln2_lo + vt0;

D | avx2-rr1-p5-div-x16.c | xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x16()
     64  __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);  (local)
     69  const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
     73  vn0 = _mm256_sub_ps(vn0, vmagic_bias);
     77  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
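The sigmoid kernels above wrap the same exp reduction in a single division: the work is done on z = -|x| (hence vz0 and the vminus_log2e / +vln2 constants), e = exp(z) is built with the LUT or p5 machinery, f = e / (e + 1) is the one division the *-div-* names refer to, and a final select mirrors the result for positive inputs. A structural sketch follows, with a reference expf() standing in for the kernels' own approximation; only the shape (work on the non-positive half-line, one division, sign fix-up) mirrors the generated code.

#include <math.h>

static float sigmoid_div_sketch(float vx) {
  const float vz = fabsf(vx);  /* reduce to the non-positive half-line */
  const float ve = expf(-vz);  /* e = exp(-|x|); the kernels build this with
                                  the magic-bias/LUT reduction shown above */
  const float vd = ve + 1.0f;
  float vf = ve / vd;          /* sigmoid(-|x|) */
  if (vx > 0.0f) {
    vf = 1.0f - vf;            /* mirror for positive inputs */
  }
  return vf;
}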
/external/XNNPACK/src/f32-vscaleexpminusmax/gen/ |
D | avx2-p5-x8.c | xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x8()
     54  __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias);  (local)
     58  const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
     61  vn0 = _mm256_sub_ps(vn0, vmagic_bias);
     65  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     67  vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_lo, vt0);

D | avx2-p5-x16.c | xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x16()
     56  __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias);  (local)
     61  const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
     65  vn0 = _mm256_sub_ps(vn0, vmagic_bias);
     70  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     73  vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_lo, vt0);

D | avx2-p5-x24.c | xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24()
     58  __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias);  (local)
     64  const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
     69  vn0 = _mm256_sub_ps(vn0, vmagic_bias);
     75  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     79  vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_lo, vt0);

D | avx512f-p5-scalef-x16.c | xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16()
     50  __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0);  (local)
     54  __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     56  vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
     72  __m512 vf0 = _mm512_scalef_ps(vp0, vn0);

D | avx2-p5-x32.c | xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x32()
     60  __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias);  (local)
     67  const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
     73  vn0 = _mm256_sub_ps(vn0, vmagic_bias);
     80  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     85  vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_lo, vt0);

D | avx512f-p5-scalef-x32.c | xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x32()
     52  __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0);  (local)
     57  __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     60  vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
     82  __m512 vf0 = _mm512_scalef_ps(vp0, vn0);
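The AVX2 kernels in this group use the same magic-bias reduction as the scalar code, while the AVX-512 scalef kernels replace the bit tricks: _mm512_roundscale_ps produces n = rint(x·log2(e)) directly and _mm512_scalef_ps applies the 2^n factor at the end. A sketch of that AVX-512 shape follows; the Taylor coefficients are again illustrative rather than the kernels' tuned ones, and the vscale argument is a stand-in for the kernels' scale parameter.

#include <immintrin.h>

static __m512 scale_exp_sketch(__m512 vx, __m512 vscale) {
  const __m512 vlog2e        = _mm512_set1_ps(0x1.715476p+0f);
  const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E400p-1f);
  const __m512 vminus_ln2_lo = _mm512_set1_ps(-0x1.7F7D1Cp-20f);

  /* n = rint(x * log2(e)); imm8 = 0 keeps all fraction bits, round to nearest. */
  const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0);

  /* t = x - n*ln(2), two-step reduction as in the listing above. */
  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
  vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);

  /* p ~ exp(t) on the reduced range, degree-5 Taylor evaluation. */
  __m512 vp = _mm512_set1_ps(1.0f / 120.0f);
  vp = _mm512_fmadd_ps(vp, vt, _mm512_set1_ps(1.0f / 24.0f));
  vp = _mm512_fmadd_ps(vp, vt, _mm512_set1_ps(1.0f / 6.0f));
  vp = _mm512_fmadd_ps(vp, vt, _mm512_set1_ps(0.5f));
  vp = _mm512_fmadd_ps(vp, vt, _mm512_set1_ps(1.0f));
  vp = _mm512_fmadd_ps(vp, vt, _mm512_set1_ps(1.0f));

  /* f = scale * p * 2^n; scalef applies the exponent with no bit shifting. */
  const __m512 vf = _mm512_scalef_ps(vp, vn);
  return _mm512_mul_ps(vf, vscale);
}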
/external/XNNPACK/src/f32-vscaleextexp/gen/ |
D | avx512f-p5-scalef-x16.c | xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16()
     48  const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0);  (local)
     52  __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     54  vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
     75  const __m512 ve0 = _mm512_add_ps(vn0, vscalee);

D | avx2-p5-x8.c | xnn_f32_vscaleextexp_ukernel__avx2_p5_x8()
     54  …const __m256 vn0 = _mm256_round_ps(_mm256_mul_ps(vx0, vlog2e), _MM_FROUND_TO_NEAREST_INT | _MM_FRO…  (local)
     58  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_hi, vx0);
     60  vt0 = _mm256_fmadd_ps(vn0, vminus_ln2_lo, vt0);
     81  __m256 ve0 = _mm256_add_ps(vn0, vscalee);
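The vscaleextexp kernels differ from the previous group in what they return: instead of folding 2^n into the value (which could overflow or underflow), they keep an "extended exponent" pair, multiplying mantissa-like parts and adding exponents, which is what the ve0 = vn0 + vscalee lines above do. A scalar sketch of that contract; the struct and its field names are hypothetical, and expf() again stands in for the kernels' polynomial.

#include <math.h>

struct extexp {
  float mantissa;  /* value = mantissa * 2^exponent */
  float exponent;
};

static struct extexp scale_ext_exp_sketch(float x, float scale_m, float scale_e) {
  const float n = rintf(x * 0x1.715476p+0f);  /* n = rint(x * log2(e)) */
  float t = n * -0x1.62E400p-1f + x;          /* t = x - n*ln(2), high part */
  t = n * -0x1.7F7D1Cp-20f + t;               /* ... minus the low part */
  const float p = expf(t);                    /* stand-in for the degree-5 polynomial */
  struct extexp r;
  r.mantissa = p * scale_m;                   /* multiply mantissas ... */
  r.exponent = n + scale_e;                   /* ... and add exponents (ve0 = vn0 + vscalee) */
  return r;
}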