/external/XNNPACK/src/f32-velu/gen/ |
D | velu-avx-rr2-lut4-p4-perm-x40.c | 78 __m256 ven4 = _mm256_andnot_ps(vindex_mask, vn4); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40() local 80 …128 ven4_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven4)), 21)); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40() 91 …8 ven4_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven4, 1)), 21)); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40() 102 ven4 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven4_lo), ven4_hi, 1); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40() 113 __m256 vs4 = _mm256_mul_ps(vl4, ven4); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40()
|
D | velu-avx-rr2-lut4-p4-perm-x48.c | 81 __m256 ven4 = _mm256_andnot_ps(vindex_mask, vn4); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48() local 83 …128 ven4_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven4)), 21)); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48() 97 …8 ven4_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven4, 1)), 21)); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48() 110 ven4 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven4_lo), ven4_hi, 1); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48() 123 __m256 vs4 = _mm256_mul_ps(vl4, ven4); in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()
|
D | velu-scalar-rr2-lut16-p3-x5.c | 75 const uint32_t ven4 = fp32_to_bits(vn4) << 19; in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x5() local 88 float vs4 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx4] + ven4); in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x5()
|
D | velu-wasm-rr2-lut16-p3-x5.c | 75 const uint32_t ven4 = fp32_to_bits(vn4) << 19; in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x5() local 88 float vs4 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx4] + ven4); in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x5()
|
D | velu-scalar-rr2-lut16-p3-x6.c | 78 const uint32_t ven4 = fp32_to_bits(vn4) << 19; in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6() local 94 float vs4 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx4] + ven4); in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6()
|
D | velu-wasm-rr2-lut16-p3-x6.c | 78 const uint32_t ven4 = fp32_to_bits(vn4) << 19; in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x6() local 94 float vs4 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx4] + ven4); in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x6()
|
D | velu-avx512f-rr1-lut16-p3-perm-x80.c | 70 const __m512i ven4 = _mm512_slli_epi32(_mm512_castps_si512(vn4), 19); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x80() local 81 __m512 vs4 = _mm512_castsi512_ps(_mm512_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x80()
|
D | velu-avx2-rr1-lut16-p3-gather-x40.c | 81 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 19); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x40() local 92 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x40()
|
D | velu-avx2-rr1-lut8-p4-perm-x40.c | 74 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 20); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x40() local 86 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x40()
|
D | velu-avx2-rr1-lut4-p4-perm-x40.c | 75 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x40() local 87 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x40()
|
D | velu-avx2-rr1-lut16-p3-gather-x48.c | 86 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 19); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48() local 99 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48()
|
D | velu-avx512f-rr1-lut16-p3-perm-x96.c | 73 const __m512i ven4 = _mm512_slli_epi32(_mm512_castps_si512(vn4), 19); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x96() local 86 __m512 vs4 = _mm512_castsi512_ps(_mm512_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x96()
|
D | velu-avx2-rr1-lut8-p4-perm-x48.c | 77 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 20); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48() local 92 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48()
|
D | velu-avx2-rr1-lut4-p4-perm-x48.c | 78 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48() local 93 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48()
|
D | velu-avx512f-rr1-lut16-p3-perm-x112.c | 76 const __m512i ven4 = _mm512_slli_epi32(_mm512_castps_si512(vn4), 19); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x112() local 91 __m512 vs4 = _mm512_castsi512_ps(_mm512_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x112()
|
D | velu-avx512f-rr1-lut16-p3-perm-x128.c | 79 const __m512i ven4 = _mm512_slli_epi32(_mm512_castps_si512(vn4), 19); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128() local 96 __m512 vs4 = _mm512_castsi512_ps(_mm512_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128()
|
D | velu-avx2-rr1-lut4-p4-perm-x56.c | 81 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56() local 99 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56()
|
D | velu-avx2-rr1-lut8-p4-perm-x56.c | 80 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 20); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56() local 98 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56()
|
D | velu-avx2-rr1-lut16-p3-gather-x56.c | 91 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 19); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56() local 106 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56()
|
D | velu-avx2-rr1-lut4-p4-perm-x64.c | 84 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64() local 105 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64()
|
D | velu-avx2-rr1-lut8-p4-perm-x64.c | 83 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 20); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64() local 104 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64()
|
D | velu-avx2-rr1-lut16-p3-gather-x64.c | 96 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 19); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64() local 113 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64()
|
D | velu-avx2-rr1-lut4-p4-perm-x72.c | 87 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72() local 111 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72()
|
D | velu-avx2-rr1-lut16-p3-gather-x72.c | 101 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 19); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72() local 120 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72()
|
D | velu-avx2-rr1-lut8-p4-perm-x72.c | 86 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 20); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x72() local 110 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x72()
|