/external/XNNPACK/src/f32-hswish/gen/ |
D | hswish-wasmsimd-x16.c | in xnn_f32_hswish_ukernel__wasmsimd_x16():
    36  v128_t vxCDEF = wasm_v128_load(x + 12);  (local)
    45  v128_t vaccCDEF = wasm_f32x4_add(vxCDEF, vthree);
    46  vxCDEF = wasm_f32x4_mul(vxCDEF, vsixth);
    61  vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF);
|
D | hswish-neon-x16.c | in xnn_f32_hswish_ukernel__neon_x16():
    36  float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    44  float32x4_t vaccCDEF = vaddq_f32(vxCDEF, vthree);
    45  vxCDEF = vmulq_f32(vxCDEF, vsixth);
    60  vaccCDEF = vmulq_f32(vaccCDEF, vxCDEF);
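Both hswish entries implement the same hard-swish formula, y = x * clamp(x + 3, 0, 6) / 6; the listed lines show the add-three, multiply-by-one-sixth, and final multiply steps, while the clamp falls on lines the listing does not capture. A minimal scalar sketch of that computation (plain C, not the vectorized kernel):

    #include <math.h>

    // Scalar reference for hard-swish: y = x * clamp(x + 3, 0, 6) / 6.
    static float hswish(float x) {
      float acc = x + 3.0f;                  // cf. vaccCDEF = vxCDEF + vthree
      acc = fminf(fmaxf(acc, 0.0f), 6.0f);   // clamp to [0, 6], not shown in the listing
      return acc * (x * (1.0f / 6.0f));      // cf. vxCDEF *= vsixth; vaccCDEF *= vxCDEF
    }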
|
/external/XNNPACK/src/f32-vsqrt/gen/ |
D | neonfma-nr1rsqrts1fma1adj-x16.c | in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x16():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    38  float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    48  const float32x4_t vcorrectionCDEF = vrsqrtsq_f32(vxCDEF, vrxCDEF);
    61  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
    81  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr1rsqrts1fma1adj-x20.c | in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x20():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    39  float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    51  const float32x4_t vcorrectionCDEF = vrsqrtsq_f32(vxCDEF, vrxCDEF);
    66  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
    91  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr2fma1adj-x16.c | in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    38  const float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    46  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
    80  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr1rsqrts1fma1adj-x24.c | in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    40  float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    54  const float32x4_t vcorrectionCDEF = vrsqrtsq_f32(vxCDEF, vrxCDEF);
    71  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
   101  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr1rsqrts1fma1adj-x28.c | in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    41  float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    57  const float32x4_t vcorrectionCDEF = vrsqrtsq_f32(vxCDEF, vrxCDEF);
    76  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
   111  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr2fma1adj-x20.c | in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    39  const float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    48  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
    90  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr1rsqrts1fma1adj-x32.c | in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    42  float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    60  const float32x4_t vcorrectionCDEF = vrsqrtsq_f32(vxCDEF, vrxCDEF);
    81  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
   121  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr2fma1adj-x24.c | in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    40  const float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    50  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
   100  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr1rsqrts1fma1adj-x36.c | in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    43  float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    63  const float32x4_t vcorrectionCDEF = vrsqrtsq_f32(vxCDEF, vrxCDEF);
    86  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
   131  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr2fma1adj-x28.c | in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    41  const float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    52  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
   110  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
|
D | neonfma-nr1rsqrts1fma1adj-x40.c | in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40():
    33  const float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    44  float32x4_t vrsqrtxCDEF = vrsqrteq_f32(vxCDEF);
    66  const float32x4_t vcorrectionCDEF = vrsqrtsq_f32(vxCDEF, vrxCDEF);
    91  float32x4_t vsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vxCDEF);
   141  const float32x4_t vadjustmentCDEF = vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF);
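Every vsqrt entry above follows the same reciprocal-square-root refinement: take a hardware estimate with vrsqrteq_f32, refine it with Newton-Raphson steps (the kernel names appear to encode the mix: nr1rsqrts1fma1adj looks like one VRSQRTS step plus one FMA step plus a final adjustment, nr2fma1adj like two FMA steps plus the adjustment), form sqrt(x) as x * rsqrt(x), and correct it once with the residual x - sqrt(x)^2. A hedged scalar sketch of the idea, assuming x > 0, with a bit-trick guess standing in for vrsqrteq_f32 and no handling of zero, infinities, or denormals:

    #include <stdint.h>
    #include <string.h>

    // Illustrative only: not the XNNPACK kernel, just the refinement scheme.
    static float sqrt_nr_adj(float x) {
      uint32_t bits;
      memcpy(&bits, &x, sizeof bits);
      bits = 0x5F3759DFu - (bits >> 1);      // coarse 1/sqrt(x) guess (stand-in for vrsqrteq_f32)
      float r;
      memcpy(&r, &bits, sizeof r);

      // Newton-Raphson for r ~ 1/sqrt(x): r <- r * (1.5 - 0.5 * x * r * r).
      r = r * (1.5f - 0.5f * x * r * r);
      r = r * (1.5f - 0.5f * x * r * r);

      float s = r * x;                       // s ~ sqrt(x), cf. vsqrtxCDEF = vrsqrtxCDEF * vxCDEF
      const float adjustment = x - s * s;    // residual, cf. vfmsq_f32(vxCDEF, vsqrtxCDEF, vsqrtxCDEF)
      return s + 0.5f * r * adjustment;      // final adjustment step ("1adj")
    }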
|
/external/XNNPACK/src/f32-velu/gen/ |
D | velu-neonfma-rr1-p6-x16.c | in xnn_f32_velu_ukernel__neonfma_rr1_p6_x16():
    48  float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    53  const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff);
   124  const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
   125  vxCDEF = vmulq_f32(vxCDEF, vbeta);
   130  const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF);
|
D | velu-wasmsimd-arm-rr2-p6-x16.c | in xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x16():
    49  v128_t vxCDEF = wasm_v128_load(x + 12);  (local)
    55  const v128_t vzCDEF = wasm_f32x4_max(wasm_f32x4_mul(vxCDEF, vprescale), vsat_cutoff);
   132  const v128_t vsignmCDEF = wasm_i32x4_shr(vxCDEF, 31);
   133  vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
   138  const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vsignmCDEF);
|
D | velu-sse41-rr2-p6-x16.c | in xnn_f32_velu_ukernel__sse41_rr2_p6_x16():
    49  __m128 vxCDEF = _mm_loadu_ps(x + 12);  (local)
    55  const __m128 vzCDEF = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vxCDEF, vprescale));
   129  vxCDEF = _mm_mul_ps(vxCDEF, vbeta);
   134  const __m128 vyCDEF = _mm_blendv_ps(vxCDEF, veCDEF, vxCDEF);
|
D | velu-neon-rr2-p6-x16.c | in xnn_f32_velu_ukernel__neon_rr2_p6_x16():
    49  float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    54  const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff);
   130  const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
   131  vxCDEF = vmulq_f32(vxCDEF, vbeta);
   136  const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF);
|
D | velu-neonfma-rr1-p6-x20.c | in xnn_f32_velu_ukernel__neonfma_rr1_p6_x20():
    48  float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    54  const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff);
   139  const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
   140  vxCDEF = vmulq_f32(vxCDEF, vbeta);
   147  const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF);
|
D | velu-sse2-rr2-p6-x16.c | in xnn_f32_velu_ukernel__sse2_rr2_p6_x16():
    49  __m128 vxCDEF = _mm_loadu_ps(x + 12);  (local)
    55  const __m128 vzCDEF = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vxCDEF, vprescale));
   132  const __m128 vmCDEF = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vxCDEF)));
   133  vxCDEF = _mm_mul_ps(vxCDEF, vbeta);
   138  const __m128 vyCDEF = _mm_or_ps(_mm_and_ps(veCDEF, vmCDEF), _mm_andnot_ps(vmCDEF, vxCDEF));
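The sse2 variant differs from the sse41 files only in the final per-lane select, since SSE2 has no _mm_blendv_ps: lines 132 and 138 above build a full-width mask from the sign bit of vx (a negative float, reinterpreted as a signed int32, compares below zero) and combine the two branches with and/andnot/or. A minimal sketch of that pattern:

    #include <emmintrin.h>  // SSE2

    // Returns ve in lanes where vx is negative, vx elsewhere.
    static inline __m128 select_by_sign(__m128 ve, __m128 vx) {
      const __m128 vm = _mm_castsi128_ps(
          _mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));  // all-ones where vx < 0
      return _mm_or_ps(_mm_and_ps(ve, vm), _mm_andnot_ps(vm, vx));      // vm ? ve : vx
    }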
|
D | velu-wasmsimd-arm-rr2-p6-x20.c | in xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20():
    49  v128_t vxCDEF = wasm_v128_load(x + 12);  (local)
    56  const v128_t vzCDEF = wasm_f32x4_max(wasm_f32x4_mul(vxCDEF, vprescale), vsat_cutoff);
   148  const v128_t vsignmCDEF = wasm_i32x4_shr(vxCDEF, 31);
   149  vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
   156  const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vsignmCDEF);
|
D | velu-sse41-rr2-p6-x20.c | in xnn_f32_velu_ukernel__sse41_rr2_p6_x20():
    49  __m128 vxCDEF = _mm_loadu_ps(x + 12);  (local)
    56  const __m128 vzCDEF = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vxCDEF, vprescale));
   145  vxCDEF = _mm_mul_ps(vxCDEF, vbeta);
   151  const __m128 vyCDEF = _mm_blendv_ps(vxCDEF, veCDEF, vxCDEF);
|
D | velu-neon-rr2-p6-x20.c | in xnn_f32_velu_ukernel__neon_rr2_p6_x20():
    49  float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    55  const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff);
   146  const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
   147  vxCDEF = vmulq_f32(vxCDEF, vbeta);
   154  const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF);
|
D | velu-wasmsimd-x86-rr2-p6-x16.c | in xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x16():
    49  v128_t vxCDEF = wasm_v128_load(x + 12);  (local)
    55  const v128_t vzCDEF = wasm_f32x4_mul(vxCDEF, vprescale);
   144  const v128_t vsignmCDEF = wasm_i32x4_shr(vxCDEF, 31);
   145  vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
   150  const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vsignmCDEF);
|
D | velu-neon-rr2-p6-x24.c | in xnn_f32_velu_ukernel__neon_rr2_p6_x24():
    49  float32x4_t vxCDEF = vld1q_f32(x); x += 4;  (local)
    56  const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff);
   162  const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
   163  vxCDEF = vmulq_f32(vxCDEF, vbeta);
   172  const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF);
|
D | velu-wasmsimd-arm-rr2-lut16-p3-x16.c | in xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x16():
    49  v128_t vxCDEF = wasm_v128_load(x + 12);  (local)
    55  const v128_t vzCDEF = wasm_f32x4_max(wasm_f32x4_mul(vxCDEF, vprescale), vsat_cutoff);
   154  const v128_t vsignmCDEF = wasm_i32x4_shr(vxCDEF, 31);
   155  vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
   160  const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vsignmCDEF);
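All of the velu entries share one shape: scale the input by prescale, clamp the scaled value from below at a saturation cutoff (vsat_cutoff), approximate the exponential on the negative branch (a degree-6 polynomial in the p6 files, a 16-entry table plus cubic in the lut16-p3 file), scale the positive branch by beta, and select between the branches by the sign of x. A hedged scalar sketch of what that computes, where alpha is the usual ELU coefficient assumed to be folded into the exponential branch; alpha is not visible in the listed lines and the real kernels clamp prescale * x before their polynomial/LUT exponential, while libm's expm1f needs no such clamp:

    #include <math.h>

    // Hypothetical scalar reference: y = beta * x for x >= 0,
    // alpha * expm1(prescale * x) for x < 0.
    static float elu(float x, float prescale, float alpha, float beta) {
      if (x < 0.0f) {
        return alpha * expm1f(x * prescale);  // negative branch, cf. veCDEF
      }
      return x * beta;                        // positive branch, cf. vxCDEF * vbeta
    }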
|