/external/XNNPACK/src/f32-vsqrt/gen/

D | neonfma-nr1rsqrts1fma1adj-x24.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     42  float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     56  const float32x4_t vcorrectionKLMN = vrsqrtsq_f32(vxKLMN, vrxKLMN);
     75  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    103  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr1rsqrts1fma1adj-x28.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     43  float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     59  const float32x4_t vcorrectionKLMN = vrsqrtsq_f32(vxKLMN, vrxKLMN);
     80  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    113  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr1rsqrts1fma1adj-x32.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     44  float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     62  const float32x4_t vcorrectionKLMN = vrsqrtsq_f32(vxKLMN, vrxKLMN);
     85  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    123  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr2fma1adj-x24.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     42  const float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     54  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    102  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr1rsqrts1fma1adj-x36.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     45  float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     65  const float32x4_t vcorrectionKLMN = vrsqrtsq_f32(vxKLMN, vrxKLMN);
     90  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    133  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr2fma1adj-x28.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     43  const float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     56  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    112  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr1rsqrts1fma1adj-x40.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     46  float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     68  const float32x4_t vcorrectionKLMN = vrsqrtsq_f32(vxKLMN, vrxKLMN);
     95  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    143  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

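The five nr1rsqrts1fma1adj rows (x24 through x40) unroll one per-vector recipe that the indexed lines trace: a vrsqrteq_f32 estimate, one vrsqrtsq_f32 Newton-Raphson step, one FMA-coupled step, and a final adjustment. A minimal one-vector sketch of that recipe, reconstructed from the lines above; the helper name, the vhalf constant, and the vrx = r*r product feeding vrsqrtsq_f32 are assumptions, not copied from the files:

#include <arm_neon.h>

// Sketch: one 4-lane block of the nr1rsqrts1fma1adj scheme.
static inline float32x4_t f32_sqrt_nr1rsqrts1fma1adj(float32x4_t vx) {
  const float32x4_t vhalf = vmovq_n_f32(0.5f);

  // ~8-bit reciprocal-sqrt estimate, refined once via vrsqrtsq_f32:
  // vrsqrtsq_f32(a, b) returns (3 - a*b) / 2, so with b = r*r it yields
  // the Newton-Raphson correction (3 - x*r^2) / 2 for r ~= 1/sqrt(x).
  float32x4_t vrsqrtx = vrsqrteq_f32(vx);
  const float32x4_t vrx = vmulq_f32(vrsqrtx, vrsqrtx);    // assumed r*r
  const float32x4_t vcorrection = vrsqrtsq_f32(vx, vrx);
  vrsqrtx = vmulq_f32(vrsqrtx, vcorrection);

  // Couple sqrt(x) ~= x*r with 0.5/sqrt(x) ~= 0.5*r and take one
  // FMA-based Newton-Raphson step on the pair.
  float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx);
  float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf);
  const float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx);
  vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx);
  vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx);

  // Final adjustment: fold the leftover error x - sqrtx^2 back in
  // (the vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx) match above).
  const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx);
  return vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment);
}
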
D | neonfma-nr2fma1adj-x32.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     44  const float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     58  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    122  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr2fma1adj-x36.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     45  const float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     60  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    132  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

D | neonfma-nr2fma1adj-x40.c | all in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40():
     35  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     46  const float32x4_t vrsqrtxKLMN = vrsqrteq_f32(vxKLMN);
     62  float32x4_t vsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vxKLMN);
    142  const float32x4_t vadjustmentKLMN = vfmsq_f32(vxKLMN, vsqrtxKLMN, vsqrtxKLMN);

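The interleaved nr2fma1adj rows swap the vrsqrtsq_f32 step for a second FMA-coupled step, which is why their indexed lines have no vcorrection match. A matching sketch under the same assumptions as above:

#include <arm_neon.h>

// Sketch: one 4-lane block of the nr2fma1adj scheme.
static inline float32x4_t f32_sqrt_nr2fma1adj(float32x4_t vx) {
  const float32x4_t vhalf = vmovq_n_f32(0.5f);

  const float32x4_t vrsqrtx = vrsqrteq_f32(vx);          // r ~= 1/sqrt(x)
  float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx);           // ~sqrt(x)
  float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf);   // ~0.5/sqrt(x)

  // Two coupled Newton-Raphson steps; residual = 0.5 - sqrtx*halfrsqrtx.
  float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx);
  vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx);
  vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx);
  vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx);
  vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx);
  vsqrtx = vfmaq_f32(vsqrtx, vresidual, vsqrtx);

  // Final adjustment against the residual error x - sqrtx^2.
  const float32x4_t vadjustment = vfmsq_f32(vx, vsqrtx, vsqrtx);
  return vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment);
}
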
/external/XNNPACK/src/f32-velu/gen/

D | velu-neon-rr2-p6-x24.c | all in xnn_f32_velu_ukernel__neon_rr2_p6_x24():
     51  float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     58  const float32x4_t vzKLMN = vmaxq_f32(vmulq_f32(vxKLMN, vprescale), vsat_cutoff);
    166  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
    167  vxKLMN = vmulq_f32(vxKLMN, vbeta);
    174  const float32x4_t vyKLMN = vbslq_f32(vmKLMN, veKLMN, vxKLMN);

D | velu-neonfma-rr1-p6-x24.c | all in xnn_f32_velu_ukernel__neonfma_rr1_p6_x24():
     50  float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     57  const float32x4_t vzKLMN = vmaxq_f32(vmulq_f32(vxKLMN, vprescale), vsat_cutoff);
    158  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
    159  vxKLMN = vmulq_f32(vxKLMN, vbeta);
    166  const float32x4_t vyKLMN = vbslq_f32(vmKLMN, veKLMN, vxKLMN);

D | velu-sse41-rr2-p6-x24.c | all in xnn_f32_velu_ukernel__sse41_rr2_p6_x24():
     51  __m128 vxKLMN = _mm_loadu_ps(x + 20);   (local)
     59  const __m128 vzKLMN = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vxKLMN, vprescale));
    163  vxKLMN = _mm_mul_ps(vxKLMN, vbeta);
    170  const __m128 vyKLMN = _mm_blendv_ps(vxKLMN, veKLMN, vxKLMN);

D | velu-wasmsimd-arm-rr2-p6-x24.c | all in xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x24():
     51  v128_t vxKLMN = wasm_v128_load(x + 20);   (local)
     59  const v128_t vzKLMN = wasm_f32x4_max(wasm_f32x4_mul(vxKLMN, vprescale), vsat_cutoff);
    168  const v128_t vsignmKLMN = wasm_i32x4_shr(vxKLMN, 31);
    169  vxKLMN = wasm_f32x4_mul(vxKLMN, vbeta);
    176  const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vsignmKLMN);

D | velu-wasmsimd-x86-rr2-p6-x24.c | all in xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x24():
     51  v128_t vxKLMN = wasm_v128_load(x + 20);   (local)
     59  const v128_t vzKLMN = wasm_f32x4_mul(vxKLMN, vprescale);
    186  const v128_t vsignmKLMN = wasm_i32x4_shr(vxKLMN, 31);
    187  vxKLMN = wasm_f32x4_mul(vxKLMN, vbeta);
    194  const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vsignmKLMN);

D | velu-sse2-rr2-p6-x24.c | all in xnn_f32_velu_ukernel__sse2_rr2_p6_x24():
     51  __m128 vxKLMN = _mm_loadu_ps(x + 20);   (local)
     59  const __m128 vzKLMN = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vxKLMN, vprescale));
    168  const __m128 vmKLMN = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vxKLMN)));
    169  vxKLMN = _mm_mul_ps(vxKLMN, vbeta);
    176  const __m128 vyKLMN = _mm_or_ps(_mm_and_ps(veKLMN, vmKLMN), _mm_andnot_ps(vmKLMN, vxKLMN));

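Across the six p6 rows the last indexed lines are one and the same epilogue in different ISAs: build a lane mask from x < 0, scale the positive branch by vbeta, and select between it and the alpha-scaled expm1 result. A NEON sketch (the helper name is assumed; ve stands for the value the kernels compute earlier):

#include <arm_neon.h>

// Sketch: shared ELU select step from the rows above.
static inline float32x4_t f32_elu_select(float32x4_t vx, float32x4_t ve,
                                         float32x4_t vbeta) {
  const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); // lanes with x < 0
  const float32x4_t vpos = vmulq_f32(vx, vbeta);          // identity branch
  return vbslq_f32(vm, ve, vpos);                         // m ? e : x*beta
}

The other ISAs express the same mask differently: the SSE2 row tests the sign bit with _mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)), the wasmsimd rows shift it out with wasm_i32x4_shr(vx, 31) and feed wasm_v128_bitselect, and the SSE4.1 rows lean on _mm_blendv_ps selecting by the sign bit of its third operand, which is why vxKLMN itself serves as the blend mask in those rows.
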
D | velu-wasmsimd-arm-rr2-lut16-p3-x24.c | all in xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x24():
     51  v128_t vxKLMN = wasm_v128_load(x + 20);   (local)
     59  const v128_t vzKLMN = wasm_f32x4_max(wasm_f32x4_mul(vxKLMN, vprescale), vsat_cutoff);
    202  const v128_t vsignmKLMN = wasm_i32x4_shr(vxKLMN, 31);
    203  vxKLMN = wasm_f32x4_mul(vxKLMN, vbeta);
    210  const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vsignmKLMN);

D | velu-neonfma-rr1-lut16-p3-x24.c | all in xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x24():
     50  float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     57  const float32x4_t vzKLMN = vmaxq_f32(vmulq_f32(vxKLMN, vprescale), vsat_cutoff);
    193  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
    194  vxKLMN = vmulq_f32(vxKLMN, vbeta);
    201  const float32x4_t vyKLMN = vbslq_f32(vmKLMN, veKLMN, vxKLMN);

D | velu-wasmsimd-x86-rr2-lut16-p3-x24.c | all in xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x24():
     51  v128_t vxKLMN = wasm_v128_load(x + 20);   (local)
     59  const v128_t vzKLMN = wasm_f32x4_mul(vxKLMN, vprescale);
    220  const v128_t vsignmKLMN = wasm_i32x4_shr(vxKLMN, 31);
    221  vxKLMN = wasm_f32x4_mul(vxKLMN, vbeta);
    228  const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vsignmKLMN);

D | velu-neon-rr2-lut16-p3-x24.c | all in xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x24():
     51  float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     58  const float32x4_t vzKLMN = vmaxq_f32(vmulq_f32(vxKLMN, vprescale), vsat_cutoff);
    201  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
    202  vxKLMN = vmulq_f32(vxKLMN, vbeta);
    209  const float32x4_t vyKLMN = vbslq_f32(vmKLMN, veKLMN, vxKLMN);

D | velu-sse41-rr2-lut16-p3-x24.c | all in xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x24():
     51  __m128 vxKLMN = _mm_loadu_ps(x + 20);   (local)
     59  const __m128 vzKLMN = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vxKLMN, vprescale));
    254  vxKLMN = _mm_mul_ps(vxKLMN, vbeta);
    261  const __m128 vyKLMN = _mm_blendv_ps(vxKLMN, veKLMN, vxKLMN);

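The lut16-p3 rows differ from the p6 rows only in how exp is approximated between the first and last indexed lines: the exponent gets split so that four bits index a 16-entry table of 2^(k/16), leaving a range short enough for a degree-3 polynomial. A scalar sketch of that reduction; the structure follows the file names, but the helper name and the plain-Taylor coefficients are placeholders, not XNNPACK's tuned constants:

#include <math.h>

// Sketch: lut16-p3 style evaluation of exp(z) for moderate z.
static float exp_lut16_p3_sketch(float z) {
  // n = round(16*z/ln2), so exp(z) = 2^(n/16) * exp(t), t the remainder.
  const float n = nearbyintf(z * (16.0f / logf(2.0f)));
  const float t = z - n * 0x1.62E430p-5f;        // z - n*ln(2)/16
  // Low 4 bits of n index the 2^(k/16) table; the rest set the exponent.
  const int k = (int)n & 15;
  const float scale = ldexpf(exp2f((float)k / 16.0f), ((int)n - k) >> 4);
  // Degree-3 polynomial for exp(t) on |t| <= ln(2)/32 (Taylor here).
  const float p = 1.0f + t * (1.0f + t * (0.5f + t * (1.0f / 6.0f)));
  return scale * p;
}

The rr2 in several file names indicates that the real kernels perform the t reduction with a two-constant (hi/lo) split of the reduction constant for extra accuracy; the sketch uses a single constant.
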
/external/XNNPACK/src/f32-sigmoid/gen/

D | neonfma-rr1-p5-div-x24.c | all in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x24():
     43  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     50  const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
    141  vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
    148  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));

D | neonfma-rr1-p5-nr2recps-x24.c | all in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x24():
     43  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     50  const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
    162  vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
    169  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));

D | neonfma-rr1-p5-nr1recps1fma-x24.c | all in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x24():
     43  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     50  const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
    162  vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
    169  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));

D | neonfma-rr1-p5-nr2fma-x24.c | all in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x24():
     43  const float32x4_t vxKLMN = vld1q_f32(x); x += 4;   (local)
     50  const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
    162  vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
    169  const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));

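All four sigmoid rows share the structure their indexed lines trace: vz = vabsq_f32(vx) so the exp path only ever sees -|x|; then f = e/(e + 1) = sigmoid(-|x|); lanes where |x| exceeds vdenorm_cutoff are flushed to zero (the vbicq_u32 line, since the exp path underflows there); and positive inputs are recovered via sigmoid(x) = 1 - sigmoid(-x) (the vcltq_f32 mask). A sketch of that epilogue with an explicit division; the helper name is an assumption, and the div/nr2recps/nr1recps1fma/nr2fma suffixes name what the real kernels use in place of vdivq_f32 to form 1/(e + 1):

#include <arm_neon.h>

// Sketch: shared sigmoid epilogue, given ve ~= exp(-|vx|).
static inline float32x4_t f32_sigmoid_epilogue(float32x4_t vx, float32x4_t ve,
                                               float32x4_t vdenorm_cutoff) {
  const float32x4_t vone = vmovq_n_f32(1.0f);
  float32x4_t vf = vdivq_f32(ve, vaddq_f32(ve, vone));    // sigmoid(-|x|)
  // vcagtq_f32 tests |x| > cutoff; vbicq_u32 zeroes those lanes of f.
  vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf),
                                       vcagtq_f32(vx, vdenorm_cutoff)));
  const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); // x < 0 lanes
  return vbslq_f32(vm, vf, vsubq_f32(vone, vf));          // else 1 - f
}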