/external/XNNPACK/src/f32-velu/gen/ |
D | velu-avx-rr2-lut16-p3-x8.c | in xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x8():
    52   const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    55   const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    56   const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    68   …(const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_cvtsi128_si32(vidx_lo))));
    69   …nst int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 2))));
    72   … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 1))), 1);
    73   … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 3))), 1);
    118  const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    121  const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    122  const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    [all …]
|
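These AVX kernels predate AVX2, so there is no vgatherdps: `vidx_lo` holds the low 128-bit half of the per-lane table offsets (element indices shifted left by 2 into byte offsets), each entry is fetched with scalar extracts and reassembled with _mm_insert_epi32, and the listing also shows a path that goes through 64-bit extracts (`vidx_ll`/`vidx_lh`). The following stand-alone sketch reproduces the 32-bit-extract flavor of that gather; the `table` array and the `gather8` helper are illustrative stand-ins, not XNNPACK code (compile with -mavx).

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Stand-in for a LUT such as xnn_table_exp2minus_k_over_16 (values arbitrary).
static const float table[16] = {
  1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
  9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
};

// Gather 8 floats from `table` given 8 32-bit element indices, AVX-only style:
// scale indices to byte offsets, pull them out with scalar extracts, and
// reassemble the loaded 32-bit values with _mm_cvtsi32_si128/_mm_insert_epi32.
static __m256 gather8(__m256i vidx) {
  const __m128i vidx_lo = _mm_slli_epi32(_mm256_castsi256_si128(vidx), 2);
  const __m128i vidx_hi = _mm_slli_epi32(_mm256_extractf128_si256(vidx, 1), 2);

  __m128i vl_lo = _mm_cvtsi32_si128(
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_cvtsi128_si32(vidx_lo)));
  vl_lo = _mm_insert_epi32(vl_lo,
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_extract_epi32(vidx_lo, 1)), 1);
  vl_lo = _mm_insert_epi32(vl_lo,
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_extract_epi32(vidx_lo, 2)), 2);
  vl_lo = _mm_insert_epi32(vl_lo,
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_extract_epi32(vidx_lo, 3)), 3);

  __m128i vl_hi = _mm_cvtsi32_si128(
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_cvtsi128_si32(vidx_hi)));
  vl_hi = _mm_insert_epi32(vl_hi,
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_extract_epi32(vidx_hi, 1)), 1);
  vl_hi = _mm_insert_epi32(vl_hi,
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_extract_epi32(vidx_hi, 2)), 2);
  vl_hi = _mm_insert_epi32(vl_hi,
      *(const int*) ((uintptr_t) table + (uint32_t) _mm_extract_epi32(vidx_hi, 3)), 3);

  return _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(vl_lo)),
                              _mm_castsi128_ps(vl_hi), 1);
}

int main(void) {
  const uint32_t idx[8] = {0, 3, 5, 7, 8, 10, 12, 15};
  float out[8];
  _mm256_storeu_ps(out, gather8(_mm256_loadu_si256((const __m256i*) idx)));
  for (int i = 0; i < 8; i++) printf("%g ", out[i]);  // 1 4 6 8 9 11 13 16
  printf("\n");
  return 0;
}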
D | velu-avx-rr2-lut16-p3-x16.c | in xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x16():
    167  const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    170  const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    171  const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    183  …(const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_cvtsi128_si32(vidx_lo))));
    184  …nst int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 2))));
    187  … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 1))), 1);
    188  … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 3))), 1);
    233  const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    236  const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    237  const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    [all …]
|
D | velu-wasmsimd-x86-rr2-lut16-p3-x4.c | in xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x4():
    54   const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);  (local)
    56   …nst float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    57   …t vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)));
    95   const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);  (local)
    97   …nst float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    98   …t vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)));
|
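WebAssembly SIMD has no gather instruction either: the kernels pull the 64-bit lanes of the byte-offset vector out with wasm_i64x2_extract_lane, do scalar float loads, and rebuild the vector. A minimal sketch of that shape follows, assuming a dummy `table` and a hypothetical `gather4` helper (build with a wasm toolchain and -msimd128); it is not the XNNPACK code itself.

#include <wasm_simd128.h>
#include <stdint.h>
#include <stdio.h>

// Stand-in table (the real kernels use xnn_table_exp2minus_k_over_16 etc.).
static const float table[16] = {
  1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
  9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
};

// Gather 4 floats given a v128 of 4 byte offsets (element index * 4),
// mirroring the wasm_i64x2_extract_lane pattern in the hits above.
static v128_t gather4(v128_t vidx) {
  const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);
  const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1);
  const float vl0 = *(const float*) ((uintptr_t) table + (uint32_t) vidx_lo);
  const float vl1 = *(const float*) ((uintptr_t) table + (uint32_t) (vidx_lo >> 32));
  const float vl2 = *(const float*) ((uintptr_t) table + (uint32_t) vidx_hi);
  const float vl3 = *(const float*) ((uintptr_t) table + (uint32_t) (vidx_hi >> 32));
  return wasm_f32x4_make(vl0, vl1, vl2, vl3);
}

int main(void) {
  // Byte offsets for elements 2, 0, 15, 7.
  const v128_t vidx = wasm_u32x4_make(2 * 4, 0 * 4, 15 * 4, 7 * 4);
  float out[4];
  wasm_v128_store(out, gather4(vidx));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 3 1 16 8
  return 0;
}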
D | velu-neon-rr2-lut16-p3-x4.c | in xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x4():
    53   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    55   … = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    57   …const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1…
    90   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    92   … = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    94   …const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1…
|
D | velu-neonfma-rr1-lut16-p3-x4.c | in xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x4():
    52   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    54   … = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    56   …const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1…
    88   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    90   … = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    92   …const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1…
|
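The NEON velu kernels above move each 64-bit half of the offset vector into a general register with vgetq_lane_u64, then load the two table entries per half with vld1_dup_s32/vld1_lane_s32; the entries are loaded as int32, presumably because the kernel then combines them with the exponent bits in the integer domain. A hedged sketch of that gather, with an illustrative 4-entry table and a made-up `gather4` helper (not XNNPACK code):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Stand-in for a float LUT such as xnn_table_exp2minus_k_over_16.
static const float table[4] = {1.0f, 2.0f, 4.0f, 8.0f};

// Gather 4 entries given a vector of byte offsets (element index * 4),
// loading them as int32 the way the velu kernels do, then reinterpreting.
static float32x4_t gather4(uint32x4_t vidx_bytes) {
  const uint64x2_t vidx64 = vreinterpretq_u64_u32(vidx_bytes);
  const uint64_t vidx_lo = vgetq_lane_u64(vidx64, 0);
  const uint64_t vidx_hi = vgetq_lane_u64(vidx64, 1);
  int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) table + (uint32_t) vidx_lo));
  vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) table + (uint32_t) (vidx_lo >> 32)), vl_lo, 1);
  int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) table + (uint32_t) vidx_hi));
  vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) table + (uint32_t) (vidx_hi >> 32)), vl_hi, 1);
  return vreinterpretq_f32_s32(vcombine_s32(vl_lo, vl_hi));
}

int main(void) {
  const uint32_t offsets[4] = {3 * 4, 0 * 4, 2 * 4, 1 * 4};
  float out[4];
  vst1q_f32(out, gather4(vld1q_u32(offsets)));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 8 1 4 2
  return 0;
}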
D | velu-wasmsimd-arm-rr2-lut16-p3-x4.c | in xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x4():
    54   const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);  (local)
    56   …nst float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    57   …t vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)));
    92   const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);  (local)
    94   …nst float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
    95   …t vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)));
|
D | velu-sse41-rr2-lut16-p3-x4.c | in xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x4():
    55   const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);  (local)
    57   …mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)));
    59   …l_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32))), 1);
    98   const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);  (local)
    100  …mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)));
    102  …l_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32))), 1);
|
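In the SSE4.1 kernels `vidx_lo` is the low 64 bits of the byte-offset vector, read straight into a general register with _mm_cvtsi128_si64 (a 64-bit-only intrinsic). A self-contained sketch of the resulting gather, with a stand-in table and helper name that are not part of XNNPACK (compile with -msse4.1 on x86-64):

#include <smmintrin.h>  // SSE4.1
#include <stdint.h>
#include <stdio.h>

// Stand-in table (the kernels use xnn_table_exp2minus_k_over_16 / _64).
static const float table[8] = {1, 2, 3, 4, 5, 6, 7, 8};

// Gather 4 floats given a vector of byte offsets: extract the two 64-bit
// halves into general registers, load each 32-bit entry, and reassemble
// with _mm_insert_epi32 / _mm_unpacklo_epi64.
static __m128 gather4(__m128i vidx) {
  const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
  const uint64_t vidx_hi = (uint64_t) _mm_extract_epi64(vidx, 1);
  __m128i vl_lo = _mm_cvtsi32_si128(*(const int*) ((uintptr_t) table + (uint32_t) vidx_lo));
  vl_lo = _mm_insert_epi32(vl_lo, *(const int*) ((uintptr_t) table + (uint32_t) (vidx_lo >> 32)), 1);
  __m128i vl_hi = _mm_cvtsi32_si128(*(const int*) ((uintptr_t) table + (uint32_t) vidx_hi));
  vl_hi = _mm_insert_epi32(vl_hi, *(const int*) ((uintptr_t) table + (uint32_t) (vidx_hi >> 32)), 1);
  return _mm_castsi128_ps(_mm_unpacklo_epi64(vl_lo, vl_hi));
}

int main(void) {
  const uint32_t offsets[4] = {5 * 4, 0 * 4, 7 * 4, 3 * 4};
  float out[4];
  _mm_storeu_ps(out, gather4(_mm_loadu_si128((const __m128i*) offsets)));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 6 1 8 4
  return 0;
}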
D | velu-avx-rr2-lut16-p3-x24.c | in xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x24():
    216  const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    219  const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    220  const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    232  …(const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_cvtsi128_si32(vidx_lo))));
    233  …nst int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 2))));
    236  … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 1))), 1);
    237  … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 3))), 1);
    282  const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    285  const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    286  const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    [all …]
|
/external/XNNPACK/src/math/ |
D | expm1minus-f32-avx-rr2-lut16-p3.c | in xnn_math_f32_expm1minus__avx_rr2_lut16_p3():
    73   const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    76   const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    77   const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    89   …(const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_cvtsi128_si32(vidx_lo))));
    90   …nst int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 2))));
    93   … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 1))), 1);
    94   … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 3))), 1);
|
D | sigmoid-f32-avx-rr2-lut64-p2-div.c | in xnn_math_f32_sigmoid__avx_rr2_lut64_p2_div():
    77   const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);  (local)
    80   const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    81   const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    93   …(const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_cvtsi128_si32(vidx_lo))));
    94   …nst int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi32(vidx_lo, 2))));
    97   … int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi32(vidx_lo, 1))), 1);
    98   … int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi32(vidx_lo, 3))), 1);
|
/external/XNNPACK/src/f32-vsigmoid/gen/ |
D | vsigmoid-neonfma-rr1-lut2048-p1-div-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    78   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    80   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    82   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
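The vsigmoid and raddstoreexpminusmax NEON kernels in this group keep plain element indices in `vidx` and index the float tables directly, rather than using byte offsets and int32 loads as the velu kernels above do. A short sketch of that variant, again with an illustrative table and helper name rather than the XNNPACK ones:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Stand-in for xnn_table_exp2minus_k_over_64 / _2048 (values arbitrary).
static const float table[8] = {1, 2, 3, 4, 5, 6, 7, 8};

// Gather 4 floats given two packed pairs of 32-bit element indices,
// mirroring the vld1_dup_f32 / vld1_lane_f32 pattern in the hits above.
static float32x4_t gather4(uint64x2_t vidx) {
  const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
  const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
  float32x2_t vl_lo = vld1_dup_f32(&table[(uint32_t) vidx_lo]);
  vl_lo = vld1_lane_f32(&table[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
  float32x2_t vl_hi = vld1_dup_f32(&table[(uint32_t) vidx_hi]);
  vl_hi = vld1_lane_f32(&table[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
  return vcombine_f32(vl_lo, vl_hi);
}

int main(void) {
  const uint32_t idx[4] = {7, 1, 4, 2};
  float out[4];
  vst1q_f32(out, gather4(vreinterpretq_u64_u32(vld1q_u32(idx))));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 8 2 5 3
  return 0;
}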
D | vsigmoid-wasmsimd-rr2-lut64-p2-div-x4.c | in xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_x4():
    47   const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);  (local)
    49   …nst float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
    50   …t vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)));
    83   const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);  (local)
    85   …nst float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
    86   …t vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)));
|
D | vsigmoid-neonfma-rr1-lut64-p2-div-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_div_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    79   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    81   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    83   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    83   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    85   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    87   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    82   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    84   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    86   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    83   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    85   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    87   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neon-rr2-lut2048-p1-nr2recps-x4.c | in xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x4():
    46   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    48   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    50   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    84   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    86   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    88   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    82   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    84   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    86   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neon-rr2-lut64-p2-nr2recps-x4.c | in xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x4():
    46   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    48   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    50   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    85   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    87   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    89   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    82   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    84   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) vidx_lo]);
    86   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x4.c | in xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x4():
    45   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    47   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    49   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    83   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    85   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
    87   vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | vsigmoid-sse41-rr2-lut64-p2-div-x4.c | in xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x4():
    49   const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);  (local)
    51   …mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
    53   …l_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))), 1);
    91   const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);  (local)
    93   …mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
    95   …l_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))), 1);
|
/external/XNNPACK/src/f32-raddstoreexpminusmax/gen/ |
D | neonfma-rr1-lut64-p2-x4.c | in xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x4():
    49   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    51   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]);
    53   vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    90   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    92   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]);
    94   vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
D | neon-rr2-lut64-p2-x4.c | in xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x4():
    50   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    52   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]);
    54   vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    92   const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  (local)
    94   float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]);
    96   vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
|
/external/XNNPACK/src/f32-velu/ |
D | avx-rr2-lut16-p3.c.in | (kernel template, no enclosing function markers):
    134  const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);
    137  const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    138  const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    150  …(const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_cvtsi128_si32(vidx_lo))));
    151  …nst int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 2))));
    154  … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 1))), 1);
    155  … int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi32(vidx_lo, 3))), 1);
    200  const __m128i vidx_lo = _mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vidx)), 2);
    203  const uint64_t vidx_ll = (uint64_t) _mm_cvtsi128_si64(vidx_lo);
    204  const uint64_t vidx_lh = (uint64_t) _mm_extract_epi64(vidx_lo, 1);
    [all …]
|