/external/vixl/src/aarch64/ |
D | assembler-aarch64.cc |
    315  const VRegister& vn,                                             in NEONTable() argument
    320  VIXL_ASSERT(vn.Is16B());                                         in NEONTable()
    322  Emit(op | (vd.IsQ() ? NEON_Q : 0) | Rm(vm) | Rn(vn) | Rd(vd));   in NEONTable()
    327  const VRegister& vn,                                             in tbl() argument
    330  NEONTable(vd, vn, vm, NEON_TBL_1v);                              in tbl()
    335  const VRegister& vn,                                             in tbl() argument
    340  VIXL_ASSERT(AreSameFormat(vn, vn2));                             in tbl()
    341  VIXL_ASSERT(AreConsecutive(vn, vn2));                            in tbl()
    342  NEONTable(vd, vn, vm, NEON_TBL_2v);                              in tbl()
    347  const VRegister& vn,                                             in tbl() argument
    [all …]
|
D | assembler-aarch64.h |
    565   void tbl(const VRegister& vd, const VRegister& vn, const VRegister& vm);
    569   const VRegister& vn,
    575   const VRegister& vn,
    582   const VRegister& vn,
    589   void tbx(const VRegister& vd, const VRegister& vn, const VRegister& vm);
    593   const VRegister& vn,
    599   const VRegister& vn,
    606   const VRegister& vn,
    2213  void fmov(const Register& rd, const VRegister& vn, int index);
    2216  void fadd(const VRegister& vd, const VRegister& vn, const VRegister& vm);
    [all …]
|
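Note: the tbl() overloads listed above all funnel into NEONTable(), which emits a single AArch64 TBL instruction (tbx() follows the same pattern); the assertions require a 16B table register vn and, for the multi-register forms, consecutive table registers. As a minimal model of what the emitted instruction computes for the single-register, 16-byte destination form (the helper below is illustrative C, not VIXL code):

    #include <stdint.h>

    /* Scalar model of one-register TBL/TBX: each index byte in vm selects a byte
     * from the 16-byte table vn; an out-of-range index yields 0 for TBL and
     * leaves the destination byte unchanged for TBX. */
    static void neon_tbl1_model(uint8_t vd[16], const uint8_t vn[16],
                                const uint8_t vm[16], int is_tbx) {
      for (int i = 0; i < 16; i++) {
        const uint8_t idx = vm[i];
        if (idx < 16) {
          vd[i] = vn[idx];
        } else if (!is_tbx) {
          vd[i] = 0;
        }
      }
    }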
/external/XNNPACK/src/f32-sigmoid/gen/ |
D | avx512f-rr2-lut32-p2-perm2-scalef-div-x16.c | all hits in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x16()
    52  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    53  const __m512 vl = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn), vtable_hi);
    54  vn = _mm512_sub_ps(vn, vmagic_bias);
    56  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vz);
    57  vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
    63  const __m512 ve = _mm512_scalef_ps(vp, vn);
    84  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    85  const __m512 vl = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn), vtable_hi);
    86  vn = _mm512_sub_ps(vn, vmagic_bias);
    88  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vz);
    [all …]
|
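All of the f32-sigmoid kernels indexed here share one structure: take z = -|x|, round z*log2(e) to an integer n via the vmagic_bias add (the rounded value stays in the float's mantissa), reconstruct 2^n (via _mm512_scalef_ps above, via exponent-bit shifts or a small table in the other variants), evaluate a short polynomial in the remainder t = z - n*ln2 (split into _hi/_lo parts for accuracy), and finish with e^z / (1 + e^z), mirrored for positive x. A scalar sketch of that recipe (the constants, the degree-5 Taylor polynomial, and the clamp are illustrative; the generated kernels use tuned minimax coefficients and a denormal-cutoff mask instead):

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar analogue of the vectorized sigmoid kernels above. */
    static float sigmoid_sketch(float x) {
      const float log2e  = 0x1.715476p+0f;   /* log2(e) */
      const float magic  = 0x1.8p23f;        /* 2^23 + 2^22: adding it rounds to the nearest int */
      const float ln2_hi = 0x1.62E400p-1f;   /* high part of ln(2) */
      const float ln2_lo = 0x1.7F7D1Cp-20f;  /* low part of ln(2) */

      float z = -fabsf(x);                   /* work on the non-positive half */
      if (z < -87.0f) z = -87.0f;            /* crude clamp; real kernels mask a denormal cutoff */

      float vn = z * log2e + magic;          /* n = round(z / ln2), still held as a float */
      uint32_t nbits;
      memcpy(&nbits, &vn, sizeof nbits);
      const int32_t n = (int32_t) nbits - 0x4B400000;  /* 0x4B400000 is the bit pattern of magic */
      vn -= magic;                           /* vn is now (float) n */

      float vt = vn * -ln2_hi + z;           /* t = z - n*ln2, in two steps (Cody-Waite) */
      vt = vn * -ln2_lo + vt;

      /* e^t for |t| <= ln(2)/2; degree-5 Taylor series stands in for the tuned p2/p5 polynomials. */
      const float vp = 1.0f + vt * (1.0f + vt * (0.5f + vt * (0x1.555556p-3f
                           + vt * (0x1.555556p-5f + vt * 0x1.111112p-7f))));

      const float ve = ldexpf(vp, n);        /* e^z = 2^n * e^t; scalef_ps does this per lane */
      const float vf = ve / (ve + 1.0f);     /* sigmoid(-|x|) */
      return x > 0.0f ? 1.0f - vf : vf;      /* mirror for positive inputs */
    }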
D | avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x16.c | all hits in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_x16()
    52  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    53  const __m512 vl = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn), vtable_hi);
    54  vn = _mm512_sub_ps(vn, vmagic_bias);
    56  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vz);
    57  vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
    63  const __m512 ve = _mm512_scalef_ps(vp, vn);
    87  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    88  const __m512 vl = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn), vtable_hi);
    89  vn = _mm512_sub_ps(vn, vmagic_bias);
    91  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vz);
    [all …]
|
D | avx512f-rr1-lut16-p3-perm-scalef-div-x16.c | all hits in xnn_f32_sigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_x16()
    46  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    47  const __m512 vl = _mm512_permutexvar_ps(_mm512_castps_si512(vn), vtable);
    48  vn = _mm512_sub_ps(vn, vmagic_bias);
    50  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vz);
    57  const __m512 ve = _mm512_scalef_ps(vp, vn);
    78  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    79  const __m512 vl = _mm512_permutexvar_ps(_mm512_castps_si512(vn), vtable);
    80  vn = _mm512_sub_ps(vn, vmagic_bias);
    82  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vz);
    89  const __m512 ve = _mm512_scalef_ps(vp, vn);
|
D | wasmsimd-lut64-p2-div-x4.c | all hits in xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x4()
    43  v128_t vn = wasm_f32x4_add(vmagic_bias, wasm_f32x4_mul(vz, vminus_log2e));   local
    44  const v128_t ve = wasm_i32x4_shl(vn, 17);
    46  const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
    56  vn = wasm_f32x4_sub(vn, vmagic_bias);
    58  v128_t vt = wasm_f32x4_add(vz, wasm_f32x4_mul(vn, vln2_hi));
    59  vt = wasm_f32x4_add(vt, wasm_f32x4_mul(vn, vln2_lo));
    79  v128_t vn = wasm_f32x4_add(vmagic_bias, wasm_f32x4_mul(vz, vminus_log2e));   local
    80  const v128_t ve = wasm_i32x4_shl(vn, 17);
    82  const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
    92  vn = wasm_f32x4_sub(vn, vmagic_bias);
    [all …]
|
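The lut64 (and lut2048) variants above go one step further: n is effectively round(z * 64 * log2(e)) (the multiplier and magic bias are scaled so the low 6 bits of the rounded value are a fractional index), those low bits select an entry from a table of 2^(i/64), and the remaining high bits supply the power-of-two exponent, which is what the (bits(vn) << 17) / (vn & vindex_mask) pair extracts at the bit level. The arithmetic identity behind it, in illustrative scalar form:

    #include <math.h>
    #include <stdint.h>

    /* Illustrative: 2^(n/64) decomposed into an integer exponent plus a 64-entry
     * table lookup -- the identity the shift/mask pair above exploits. */
    static float table64[64];   /* table64[i] = 2^(i/64) */

    static void init_table64(void) {
      for (int i = 0; i < 64; i++) table64[i] = exp2f((float) i / 64.0f);
    }

    static float exp2_n_over_64(int32_t n) {
      const int32_t e = n >> 6;        /* floor(n / 64), arithmetic shift */
      const int32_t i = n & 63;        /* n mod 64, always in [0, 63] */
      return ldexpf(table64[i], e);    /* 2^(n/64) = 2^e * 2^(i/64) */
    }

The lut2048 kernels are the same idea with an 11-bit index (2048 entries) and a << 12 shift, trading a larger table for a cheaper degree-1 polynomial.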
D | avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x16.c | all hits in xnn_f32_sigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_x16()
    46  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    47  const __m512 vl = _mm512_permutexvar_ps(_mm512_castps_si512(vn), vtable);
    48  vn = _mm512_sub_ps(vn, vmagic_bias);
    50  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vz);
    57  const __m512 ve = _mm512_scalef_ps(vp, vn);
    81  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    82  const __m512 vl = _mm512_permutexvar_ps(_mm512_castps_si512(vn), vtable);
    83  vn = _mm512_sub_ps(vn, vmagic_bias);
    85  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vz);
    92  const __m512 ve = _mm512_scalef_ps(vp, vn);
|
D | avx-rr2-p5-div-x8.c | all hits in xnn_f32_sigmoid_ukernel__avx_rr2_p5_div_x8()
    47  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    49  …const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)),…
    50  …const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)…
    53  vn = _mm256_sub_ps(vn, vmagic_bias);
    55  __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
    56  vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
    84  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    85  …const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)),…
    86  …const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)…
    89  vn = _mm256_sub_ps(vn, vmagic_bias);
    [all …]
|
D | wasmsimd-p5-div-x4.c | all hits in xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x4()
    44  v128_t vn = wasm_f32x4_add(vmagic_bias, wasm_f32x4_mul(vz, vminus_log2e));   local
    45  const v128_t vs = wasm_i32x4_shl(vn, 23);
    46  vn = wasm_f32x4_sub(vn, vmagic_bias);
    48  v128_t vt = wasm_f32x4_add(vz, wasm_f32x4_mul(vn, vln2_hi));
    49  vt = wasm_f32x4_add(vt, wasm_f32x4_mul(vn, vln2_lo));
    72  v128_t vn = wasm_f32x4_add(vmagic_bias, wasm_f32x4_mul(vz, vminus_log2e));   local
    73  const v128_t vs = wasm_i32x4_shl(vn, 23);
    74  vn = wasm_f32x4_sub(vn, vmagic_bias);
    76  v128_t vt = wasm_f32x4_add(vz, wasm_f32x4_mul(vn, vln2_hi));
    77  vt = wasm_f32x4_add(vt, wasm_f32x4_mul(vn, vln2_lo));
|
D | neon-rr2-lut2048-p1-nr2recps-x4.c | all hits in xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x4()
    42  float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);   local
    43  const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 12);
    45  … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    55  vn = vsubq_f32(vn, vmagic_bias);
    56  float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
    57  vt = vmlaq_f32(vt, vn, vln2_lo);
    80  float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);   local
    81  const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 12);
    83  … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    93  vn = vsubq_f32(vn, vmagic_bias);
    [all …]
|
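The nr2recps / nr1fma / nr2 suffixes mark division-free variants: instead of computing e/(e+1) with a divide, they take a hardware reciprocal estimate of the denominator (vrecpeq_f32 on NEON, an rcp-style estimate on x86) and sharpen it with Newton-Raphson steps, r <- r*(2 - d*r); vrecpsq_f32 supplies the (2 - d*r) factor. A scalar sketch of the iteration:

    #include <stdio.h>

    /* One Newton-Raphson step for 1/d: the relative error roughly squares each time. */
    static float recip_step(float d, float r) {
      return r * (2.0f - d * r);
    }

    int main(void) {
      const float d = 1.0f + 0.7f;   /* e.g. the sigmoid denominator e^z + 1 */
      float r = 0.5f;                /* stand-in for a coarse hardware estimate of 1/d */
      r = recip_step(d, r);          /* first step  ("nr1") */
      r = recip_step(d, r);          /* second step ("nr2") */
      printf("refined %.9f  exact %.9f\n", r, 1.0f / d);
      return 0;
    }

With the hardware estimate's roughly 8-bit accuracy as a starting point, two steps are enough to approach full single precision.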
D | avx-rr2-p5-nr2-x8.c | all hits in xnn_f32_sigmoid_ukernel__avx_rr2_p5_nr2_x8()
    48  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    50  …const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)),…
    51  …const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)…
    54  vn = _mm256_sub_ps(vn, vmagic_bias);
    56  __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
    57  vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
    88  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    89  …const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)),…
    90  …const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)…
    93  vn = _mm256_sub_ps(vn, vmagic_bias);
    [all …]
|
D | neon-rr2-lut64-p2-nr2recps-x4.c | all hits in xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x4()
    42  float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);   local
    43  const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17);
    45  … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    55  vn = vsubq_f32(vn, vmagic_bias);
    56  float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
    57  vt = vmlaq_f32(vt, vn, vln2_lo);
    81  float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);   local
    82  const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17);
    84  … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    94  vn = vsubq_f32(vn, vmagic_bias);
    [all …]
|
D | sse41-p5-div-x4.c | all hits in xnn_f32_sigmoid_ukernel__sse41_p5_div_x4()
    44  __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);   local
    45  const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
    46  vn = _mm_sub_ps(vn, vmagic_bias);
    48  __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
    49  vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
    75  __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);   local
    76  const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
    77  vn = _mm_sub_ps(vn, vmagic_bias);
    79  __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
    80  vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
|
/external/XNNPACK/src/f32-sigmoid/ |
D | scalar-lut2048-p1-div.c.in |
    47   float vn${N} = vz${N} * vminus_log2e + vmagic_bias;
    50   const uint32_t ve${N} = fp32_to_bits(vn${N}) << 12;
    53   const uint32_t vidx${N} = fp32_to_bits(vn${N}) & vindex_mask;
    57   vn${N} -= vmagic_bias;
    60   float vt${N} = vn${N} * vln2_hi + vz${N};
    63   vt${N} = vn${N} * vln2_lo + vt${N};
    97   float vn = vz * vminus_log2e + vmagic_bias;   variable
    98   const uint32_t ve = fp32_to_bits(vn) << 12;
    99   const uint32_t vidx = fp32_to_bits(vn) & vindex_mask;
    101  vn -= vmagic_bias;
    [all …]
|
D | scalar-lut64-p2-div.c.in |
    47   float vn${N} = vz${N} * vminus_log2e + vmagic_bias;
    50   const uint32_t ve${N} = fp32_to_bits(vn${N}) << 17;
    53   const uint32_t vidx${N} = fp32_to_bits(vn${N}) & vindex_mask;
    57   vn${N} -= vmagic_bias;
    60   float vt${N} = vn${N} * vln2_hi + vz${N};
    63   vt${N} = vn${N} * vln2_lo + vt${N};
    100  float vn = vz * vminus_log2e + vmagic_bias;   variable
    101  const uint32_t ve = fp32_to_bits(vn) << 17;
    102  const uint32_t vidx = fp32_to_bits(vn) & vindex_mask;
    104  vn -= vmagic_bias;
    [all …]
|
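These .c.in files are the templates from which the gen/ kernels above are produced; the ${N} placeholders are filled in per unrolled element by XNNPACK's kernel generator. fp32_to_bits() reinterprets the float vn as its IEEE-754 bit pattern so the << 12 / << 17 shift and the index mask can act on it; a portable stand-in, if you want to replay the scalar recipe outside XNNPACK, is a memcpy-based bit cast:

    #include <stdint.h>
    #include <string.h>

    /* Portable float -> uint32_t bit cast (stand-in for XNNPACK's fp32_to_bits helper). */
    static inline uint32_t fp32_to_bits_sketch(float f) {
      uint32_t bits;
      memcpy(&bits, &f, sizeof bits);   /* well-defined, unlike a pointer type-pun */
      return bits;
    }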
/external/XNNPACK/src/f32-raddstoreexpminusmax/gen/ |
D | neonfma-lut64-p2-x4.c | all hits in xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x4()
    57   float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e_x64);   local
    69   …const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))),…
    72   … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    84   vn = vsubq_f32(vn, vmagic_bias);
    88   float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2_o64_hi);
    89   vt = vfmaq_f32(vt, vn, vminus_ln2_o64_lo);
    133  float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e_x64);   local
    145  …const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))),…
    148  … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    160  vn = vsubq_f32(vn, vmagic_bias);
    [all …]
|
D | neon-lut64-p2-x4.c | all hits in xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x4()
    58   float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e_x64);   local
    70   …const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))),…
    73   … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    85   vn = vsubq_f32(vn, vmagic_bias);
    89   float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_o64_hi);
    90   vt = vmlaq_f32(vt, vn, vminus_ln2_o64_lo);
    134  float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e_x64);   local
    146  …const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))),…
    149  … const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    161  vn = vsubq_f32(vn, vmagic_bias);
    [all …]
|
D | neonfma-p5-x4.c | all hits in xnn_f32_raddstoreexpminusmax_ukernel__neonfma_p5_x4()
    56   float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e);   local
    60   const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
    63   vn = vsubq_f32(vn, vmagic_bias);
    67   float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2_hi);
    68   vt = vfmaq_f32(vt, vn, vminus_ln2_lo);
    113  float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e);   local
    117  const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
    120  vn = vsubq_f32(vn, vmagic_bias);
    124  float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2_hi);
    125  vt = vfmaq_f32(vt, vn, vminus_ln2_lo);
|
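f32-raddstoreexpminusmax is the softmax helper: it stores exp(x[i] - max) for every element and reduce-adds those values into a running sum (the softmax denominator). The vn lines above are the same exp range reduction as in the sigmoid kernels, scaled by 64 in the lut64 variants (vlog2e_x64, vminus_ln2_o64_hi/lo). A scalar sketch of the contract (signature simplified; the real microkernel takes a byte count, a pointer to max, and an output pointer for the sum):

    #include <math.h>
    #include <stddef.h>

    /* Illustrative scalar raddstoreexpminusmax: out[i] = exp(in[i] - max), returns the sum. */
    static float raddstoreexpminusmax_sketch(size_t n, const float* in, float max, float* out) {
      float sum = 0.0f;
      for (size_t i = 0; i < n; i++) {
        const float e = expf(in[i] - max);   /* subtracting the row max keeps expf in range */
        out[i] = e;
        sum += e;
      }
      return sum;
    }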
/external/XNNPACK/src/f32-velu/gen/ |
D | velu-avx-rr2-p6-x8.c | all hits in xnn_f32_velu_ukernel__avx_rr2_p6_x8()
    50  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    51  …const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)),…
    52  …const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)…
    53  vn = _mm256_sub_ps(vn, vmagic_bias);
    55  __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
    57  vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
    85  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    86  …const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)),…
    87  …const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)…
    88  vn = _mm256_sub_ps(vn, vmagic_bias);
    [all …]
|
D | velu-avx512f-rr1-lut16-p3-perm-x16.c | all hits in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x16()
    49  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    50  const __m512i ven = _mm512_slli_epi32(_mm512_castps_si512(vn), 19);
    51  const __m512i vl = _mm512_permutexvar_epi32(_mm512_castps_si512(vn), vtable);
    53  vn = _mm512_sub_ps(vn, vmagic_bias);
    55  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vz);
    82  __m512 vn = _mm512_fmadd_ps(vz, vlog2e, vmagic_bias);   local
    83  const __m512i ven = _mm512_slli_epi32(_mm512_castps_si512(vn), 19);
    84  const __m512i vl = _mm512_permutexvar_epi32(_mm512_castps_si512(vn), vtable);
    86  vn = _mm512_sub_ps(vn, vmagic_bias);
    88  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vz);
|
D | velu-neon-rr2-lut16-p3-x4.c | all hits in xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x4()
    50  float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vlog2e);   local
    51  …const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vin…
    52  const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19);
    61  vn = vsubq_f32(vn, vmagic_bias);
    64  float32x4_t vt = vmlaq_f32(vz, vn, vminus_ln2_hi);
    65  vt = vmlaq_f32(vt, vn, vminus_ln2_lo);
    87  float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vlog2e);   local
    88  …const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vin…
    89  const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19);
    98  vn = vsubq_f32(vn, vmagic_bias);
    [all …]
|
D | velu-avx-rr2-lut4-p4-perm-x8.c | all hits in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x8()
    52  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    53  __m256 ven = _mm256_andnot_ps(vindex_mask, vn);
    54  const __m256 vl = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn));
    56  vn = _mm256_sub_ps(vn, vmagic_bias);
    59  __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
    61  vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
    88  __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);   local
    89  __m256 ven = _mm256_andnot_ps(vindex_mask, vn);
    90  const __m256 vl = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn));
    92  vn = _mm256_sub_ps(vn, vmagic_bias);
    [all …]
|
D | velu-wasmsimd-arm-rr2-lut16-p3-x4.c | all hits in xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x4()
    51   v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);   local
    52   const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
    53   const v128_t ven = wasm_i32x4_shl(vn, 19);
    64   vn = wasm_f32x4_sub(vn, vmagic_bias);
    66   v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz);
    67   vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt);
    89   v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);   local
    90   const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
    91   const v128_t ven = wasm_i32x4_shl(vn, 19);
    102  vn = wasm_f32x4_sub(vn, vmagic_bias);
    [all …]
|
D | velu-wasmsimd-x86-rr2-lut16-p3-x4.c | all hits in xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x4()
    51   v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);   local
    52   const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
    53   const v128_t ven = wasm_i32x4_shl(vn, 19);
    64   vn = wasm_f32x4_sub(vn, vmagic_bias);
    66   v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz);
    68   vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt);
    92   v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);   local
    93   const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
    94   const v128_t ven = wasm_i32x4_shl(vn, 19);
    105  vn = wasm_f32x4_sub(vn, vmagic_bias);
    [all …]
|
/external/XNNPACK/src/f32-velu/ |
D | scalar-rr2-lut16-p3.c.in |
    53   float vn${N} = vz${N} * vlog2e + vmagic_bias;
    56   const uint32_t ven${N} = fp32_to_bits(vn${N}) << 19;
    57   const uint32_t vidx${N} = fp32_to_bits(vn${N}) & vindex_mask;
    58   vn${N} -= vmagic_bias;
    61   float vt${N} = vn${N} * vminus_ln2_hi + vz${N};
    65   vt${N} = vn${N} * vminus_ln2_lo + vt${N};
    114  float vn = vz * vlog2e + vmagic_bias;
    115  const uint32_t ven = fp32_to_bits(vn) << 19;
    116  const uint32_t vidx = fp32_to_bits(vn) & vindex_mask;
    117  vn -= vmagic_bias;
    [all …]
|
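Finally, the f32-velu kernels and template vectorize ELU: x passes through unchanged for x > 0 and becomes alpha*(e^x - 1) for x <= 0, with the negative branch reusing the same magic-bias / LUT exp machinery indexed above (the generated kernels also fold in prescale and beta scale factors, omitted here). A scalar sketch:

    #include <math.h>
    #include <stddef.h>

    /* Illustrative scalar ELU; expm1f stands in for the kernels' table + polynomial e^x - 1. */
    static void elu_sketch(size_t n, const float* in, float* out, float alpha) {
      for (size_t i = 0; i < n; i++) {
        const float x = in[i];
        out[i] = x > 0.0f ? x : alpha * expm1f(x);
      }
    }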