/external/XNNPACK/src/f32-sigmoid/gen/ |
D | psimd-p5-div-x24.c | 165 psimd_f32 vfKLMN = psimd_div_f32(veKLMN, psimd_add_f32(veKLMN, vone)); in xnn_f32_sigmoid_ukernel__psimd_p5_div_x24() local 174 vfKLMN = psimd_andnotmask_f32(vzKLMN > vdenorm_cutoff, vfKLMN); in xnn_f32_sigmoid_ukernel__psimd_p5_div_x24() 182 vfKLMN = psimd_signblend_f32(vxKLMN, vfKLMN, psimd_sub_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__psimd_p5_div_x24() 189 psimd_store_f32(y + 20, vfKLMN); in xnn_f32_sigmoid_ukernel__psimd_p5_div_x24()
|
D | neonfma-rr1-p5-div-x24.c | 161 float32x4_t vfKLMN = vdivq_f32(veKLMN, vdKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x24() local 170 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x24() 185 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x24() 192 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x24()
|
D | sse41-p5-div-x24.c | 172 __m128 vfKLMN = _mm_div_ps(veKLMN, vdKLMN); in xnn_f32_sigmoid_ukernel__sse41_p5_div_x24() local 181 vfKLMN = _mm_andnot_ps(_mm_cmplt_ps(vzKLMN, vdenorm_cutoff), vfKLMN); in xnn_f32_sigmoid_ukernel__sse41_p5_div_x24() 189 vfKLMN = _mm_blendv_ps(_mm_sub_ps(vone, vfKLMN), vfKLMN, vxKLMN); in xnn_f32_sigmoid_ukernel__sse41_p5_div_x24() 196 _mm_storeu_ps(y + 20, vfKLMN); in xnn_f32_sigmoid_ukernel__sse41_p5_div_x24()
|
D | neonfma-rr1-p5-nr1recps1fma-x24.c | 185 float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x24() local 194 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x24() 209 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x24() 216 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x24()
|
D | neonfma-rr1-p5-nr2recps-x24.c | 185 float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x24() local 194 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x24() 209 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x24() 216 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x24()
|
D | sse2-p5-div-x24.c | 172 __m128 vfKLMN = _mm_div_ps(veKLMN, vdKLMN); in xnn_f32_sigmoid_ukernel__sse2_p5_div_x24() local 181 vfKLMN = _mm_andnot_ps(_mm_cmplt_ps(vzKLMN, vdenorm_cutoff), vfKLMN); in xnn_f32_sigmoid_ukernel__sse2_p5_div_x24() 196 vfKLMN = _mm_or_ps(_mm_and_ps(vfKLMN, vmKLMN), _mm_andnot_ps(vmKLMN, _mm_sub_ps(vone, vfKLMN))); in xnn_f32_sigmoid_ukernel__sse2_p5_div_x24() 203 _mm_storeu_ps(y + 20, vfKLMN); in xnn_f32_sigmoid_ukernel__sse2_p5_div_x24()
|
D | neonfma-rr1-p5-nr2fma-x24.c | 185 float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x24() local 194 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x24() 209 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x24() 216 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x24()
|
D | neon-rr2-p5-nr2recps-x24.c | 195 float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x24() local 204 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x24() 219 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x24() 226 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x24()
|
D | neonfma-rr1-lut64-p2-div-x24.c | 211 float32x4_t vfKLMN = vdivq_f32(vyKLMN, vdKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x24() local 220 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x24() 235 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x24() 242 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x24()
|
D | neonfma-rr1-lut2048-p1-div-x24.c | 203 float32x4_t vfKLMN = vdivq_f32(vyKLMN, vdKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x24() local 212 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x24() 227 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x24() 234 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x24()
|
D | neonfma-rr1-lut64-p2-nr1recps1fma-x24.c | 235 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x24() local 244 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x24() 259 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x24() 266 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x24()
|
D | neonfma-rr1-lut2048-p1-nr2fma-x24.c | 227 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x24() local 236 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x24() 251 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x24() 258 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x24()
|
D | neonfma-rr1-lut64-p2-nr2fma-x24.c | 235 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x24() local 244 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x24() 259 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x24() 266 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x24()
|
D | neonfma-rr1-lut2048-p1-nr2recps-x24.c | 227 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x24() local 236 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x24() 251 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x24() 258 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x24()
|
D | neonfma-rr1-lut64-p2-nr2recps-x24.c | 235 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x24() local 244 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x24() 259 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x24() 266 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x24()
|
D | neon-rr2-lut64-p2-nr2recps-x24.c | 245 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x24() local 254 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x24() 269 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x24() 276 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x24()
|
D | neon-rr2-lut2048-p1-nr2recps-x24.c | 237 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x24() local 246 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x24() 261 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x24() 268 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x24()
|
D | neonfma-rr1-lut2048-p1-nr1recps1fma-x24.c | 227 float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x24() local 236 …vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm… in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x24() 251 vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN)); in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x24() 258 vst1q_f32(y, vfKLMN); y += 4; in xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x24()
|