/external/XNNPACK/src/f32-velu/gen/ |
D | velu-avx-rr2-lut4-p4-perm-x16.c |
    60   __m256 ven1 = _mm256_andnot_ps(vindex_mask, vn1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x16() local
    62   …128 ven1_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x16()
    67   …8 ven1_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven1, 1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x16()
    72   ven1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven1_lo), ven1_hi, 1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x16()
    77   __m256 vs1 = _mm256_mul_ps(vl1, ven1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x16()
|
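The avx-rr2-lut4-p4-perm entries in this listing all follow the same reconstruction step: ven1 keeps the non-index bits of vn1, each 128-bit half is shifted left by 21 so those bits land in the float exponent field (2 index bits + 21 = bit 23), the halves are reassembled, and the result scales the permuted LUT value vl1 into vs1. Below is a minimal sketch of that step under those assumptions; the helper name and parameters are illustrative, only the intrinsic sequence is taken from the listing.

    /*
     * Sketch of the ven1/vs1 step above, assuming vn already holds n + magic_bias,
     * vl is the permuted 4-entry LUT value, and vindex_mask covers the 2 low
     * index bits.  Illustrative helper, not XNNPACK's own API.
     */
    #include <immintrin.h>

    static inline __m256 velu_avx_reconstruct_scale(__m256 vn, __m256 vl, __m256 vindex_mask) {
      /* Keep everything except the LUT index bits. */
      __m256 ven = _mm256_andnot_ps(vindex_mask, vn);
      /* AVX1 has no 256-bit integer shift, so shift each 128-bit half by 21
         (2 index bits + 21 == bit 23, the exponent LSB) and reassemble. */
      const __m128 ven_lo = _mm_castsi128_ps(
          _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven)), 21));
      const __m128 ven_hi = _mm_castsi128_ps(
          _mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven, 1)), 21));
      ven = _mm256_insertf128_ps(_mm256_castps128_ps256(ven_lo), ven_hi, 1);
      /* The shifted bits act as a power-of-two factor that scales the LUT entry into vs. */
      return _mm256_mul_ps(vl, ven);
    }

The x24/x32/x40/x48 variants of this kernel repeat the same block once per 8-element group; only the register suffix and the source line numbers change.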
D | velu-avx-rr2-lut4-p4-perm-x24.c |
    63   __m256 ven1 = _mm256_andnot_ps(vindex_mask, vn1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x24() local
    65   …128 ven1_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x24()
    73   …8 ven1_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven1, 1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x24()
    80   ven1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven1_lo), ven1_hi, 1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x24()
    87   __m256 vs1 = _mm256_mul_ps(vl1, ven1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x24()
|
D | velu-avx-rr2-lut4-p4-perm-x32.c |
    66   __m256 ven1 = _mm256_andnot_ps(vindex_mask, vn1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32() local
    68   …128 ven1_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32()
    79   …8 ven1_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven1, 1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32()
    88   ven1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven1_lo), ven1_hi, 1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32()
    97   __m256 vs1 = _mm256_mul_ps(vl1, ven1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32()
|
D | velu-wasm-rr2-lut16-p3-x2.c |
    57   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x2() local
    64   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x2()
|
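The scalar and WASM rr2-lut16-p3 entries (x2 through x6 in this listing) use the same two lines per element: the definition shifts the bit pattern of vn1 left by 19 (4 index bits + 19 = bit 23), and the use adds that to the 16-entry exp2 table value before reinterpreting the sum as a float. A minimal sketch under those assumptions follows; the helper names are illustrative and the table parameter stands in for xnn_table_exp2minus_k_over_16.

    /*
     * Sketch of the scalar/WASM rr2-lut16-p3 step, assuming vn already holds
     * n + magic_bias and vidx is the 4-bit table index extracted earlier.
     * fp32_to_bits/fp32_from_bits are re-declared here only to keep the sketch
     * self-contained.
     */
    #include <stdint.h>
    #include <string.h>

    static inline uint32_t fp32_to_bits(float f) { uint32_t w; memcpy(&w, &f, sizeof w); return w; }
    static inline float fp32_from_bits(uint32_t w) { float f; memcpy(&f, &w, sizeof f); return f; }

    static inline float velu_scalar_reconstruct_scale(float vn, uint32_t vidx, const uint32_t table[16]) {
      /* 4 index bits + 19 == bit 23: the integer part of n moves into the exponent field. */
      const uint32_t ven = fp32_to_bits(vn) << 19;
      /* Adding ven bumps the exponent of the table entry, i.e. scales it by a power of two. */
      return fp32_from_bits(table[vidx] + ven);
    }

The x3 through x6 variants below simply repeat this pair of lines once per element, which is why only the line numbers differ between entries.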
D | velu-avx-rr2-lut4-p4-perm-x40.c |
    69    __m256 ven1 = _mm256_andnot_ps(vindex_mask, vn1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40() local
    71    …128 ven1_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40()
    85    …8 ven1_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven1, 1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40()
    96    ven1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven1_lo), ven1_hi, 1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40()
    107   __m256 vs1 = _mm256_mul_ps(vl1, ven1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40()
|
D | velu-scalar-rr2-lut16-p3-x2.c |
    57   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2() local
    64   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2()
|
D | velu-wasm-rr2-lut16-p3-x3.c |
    60   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x3() local
    70   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x3()
|
D | velu-scalar-rr2-lut16-p3-x3.c |
    60   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x3() local
    70   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x3()
|
D | velu-avx-rr2-lut4-p4-perm-x48.c |
    72    __m256 ven1 = _mm256_andnot_ps(vindex_mask, vn1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48() local
    74    …128 ven1_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()
    91    …8 ven1_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven1, 1)), 21));   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()
    104   ven1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven1_lo), ven1_hi, 1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()
    117   __m256 vs1 = _mm256_mul_ps(vl1, ven1);   in xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48()
|
D | velu-scalar-rr2-lut16-p3-x4.c |
    63   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4() local
    76   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4()
|
D | velu-wasm-rr2-lut16-p3-x4.c |
    63   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x4() local
    76   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x4()
|
D | velu-avx512f-rr1-lut16-p3-perm-x32.c |
    55   const __m512i ven1 = _mm512_slli_epi32(_mm512_castps_si512(vn1), 19);   in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x32() local
    60   __m512 vs1 = _mm512_castsi512_ps(_mm512_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x32()
|
D | velu-scalar-rr2-lut16-p3-x5.c |
    66   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x5() local
    82   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x5()
|
D | velu-wasm-rr2-lut16-p3-x5.c |
    66   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x5() local
    82   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x5()
|
D | velu-avx2-rr1-lut16-p3-gather-x16.c |
    60   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 19);   in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x16() local
    65   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x16()
|
D | velu-avx2-rr1-lut8-p4-perm-x16.c |
    56   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 20);   in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x16() local
    62   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x16()
|
D | velu-scalar-rr2-lut16-p3-x6.c |
    69   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6() local
    88   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6()
|
D | velu-avx2-rr1-lut4-p4-perm-x16.c |
    57   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 21);   in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x16() local
    63   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x16()
|
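The rr1 entries above and below (avx512f-rr1-lut16-p3-perm and the avx2-rr1-* kernels) reconstruct the scale with a single shift plus an integer add: the shift is 23 minus the number of index bits, so 19 for the 16-entry table, 20 for the 8-entry table, and 21 for the 4-entry table. A minimal sketch for the 16-entry case on AVX2 follows; the helper name and parameters are illustrative, and vl is assumed to be the gathered or permuted LUT entry as an integer bit pattern.

    /*
     * Sketch of the rr1 ven1/vs1 step, 16-entry table on AVX2, assuming vn
     * already holds n + magic_bias.  For the lut8 kernels the shift is 20,
     * for lut4 it is 21.
     */
    #include <immintrin.h>

    static inline __m256 velu_avx2_reconstruct_scale_lut16(__m256 vn, __m256i vl) {
      /* Move the integer part of n into the float exponent field (4 + 19 == 23). */
      const __m256i ven = _mm256_slli_epi32(_mm256_castps_si256(vn), 19);
      /* The integer add adjusts the exponent of the LUT entry, scaling it by a power of two. */
      return _mm256_castsi256_ps(_mm256_add_epi32(vl, ven));
    }

The avx512f-rr1-lut16-p3-perm entries are identical apart from the 512-bit types (_mm512_slli_epi32 / _mm512_add_epi32 and __m512/__m512i).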
D | velu-wasm-rr2-lut16-p3-x6.c |
    69   const uint32_t ven1 = fp32_to_bits(vn1) << 19;   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x6() local
    88   float vs1 = fp32_from_bits(xnn_table_exp2minus_k_over_16[vidx1] + ven1);   in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x6()
|
D | velu-avx512f-rr1-lut16-p3-perm-x48.c |
    58   const __m512i ven1 = _mm512_slli_epi32(_mm512_castps_si512(vn1), 19);   in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x48() local
    65   __m512 vs1 = _mm512_castsi512_ps(_mm512_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x48()
|
D | velu-avx2-rr1-lut16-p3-gather-x24.c |
    65   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 19);   in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x24() local
    72   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x24()
|
D | velu-avx2-rr1-lut4-p4-perm-x24.c |
    60   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 21);   in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x24() local
    69   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x24()
|
D | velu-avx2-rr1-lut8-p4-perm-x24.c |
    59   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 20);   in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x24() local
    68   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x24()
|
D | velu-avx2-rr1-lut4-p4-perm-x32.c |
    63   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 21);   in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x32() local
    75   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x32()
|
D | velu-avx2-rr1-lut16-p3-gather-x32.c |
    70   const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 19);   in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x32() local
    79   __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));   in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x32()
|