/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 5x8s4inc-minmax-wasmsimd-x86.c | 77 v128_t vacc4x4567 = wasm_v128_load(acc + 36); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() local 106 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 126 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 146 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 166 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 198 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 214 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 226 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 230 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 260 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86()
|
D | 5x8inc-minmax-wasmsimd-x86-splat.c | 77 v128_t vacc4x4567 = wasm_v128_load(acc + 36); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() local 111 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 130 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 149 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 168 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 199 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 215 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 227 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 231 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat() 261 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_x86_splat()
|
D | 5x8inc-minmax-neon-lane-ld64.c | 78 float32x4_t vacc4x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64() local 100 vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0); in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64() 113 vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1); in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64() 134 vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567); in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64() 146 vacc4x4567 = vminq_f32(vacc4x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64() 158 vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64() 162 vst1q_f32(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64() 193 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64()
|
D | 5x8inc-minmax-neonfma-lane-ld64.c | 78 float32x4_t vacc4x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64() local 100 vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0); in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64() 113 vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1); in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64() 134 vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567); in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64() 146 vacc4x4567 = vminq_f32(vacc4x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64() 158 vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64() 162 vst1q_f32(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64() 193 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64()
|
D | 5x8inc-minmax-wasmsimd-arm-splat.c | 79 v128_t vacc4x4567 = wasm_v128_load(acc + 36); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() local 113 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 132 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 151 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 170 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 201 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 216 vacc4x4567 = wasm_f32x4_max(vacc4x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 227 vacc4x4567 = wasm_f32x4_min(vacc4x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 231 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat() 261 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_splat()
|
D | 5x8s4inc-minmax-wasmsimd-arm.c | 79 v128_t vacc4x4567 = wasm_v128_load(acc + 36); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() local 108 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 128 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 148 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 168 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 200 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 215 vacc4x4567 = wasm_f32x4_max(vacc4x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 226 vacc4x4567 = wasm_f32x4_min(vacc4x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 230 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 260 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm()
|
D | 5x8s4inc-minmax-sse.c | 77 __m128 vacc4x4567 = _mm_load_ps(acc + 36); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() local 106 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 126 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 146 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 166 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 198 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 214 vacc4x4567 = _mm_min_ps(vacc4x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 226 vacc4x4567 = _mm_max_ps(vacc4x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 230 _mm_storeu_ps(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse() 260 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_5x8s4__sse()
|
D | 6x8s4inc-minmax-wasmsimd-x86.c | 83 v128_t vacc4x4567 = wasm_v128_load(acc + 36); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() local 117 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 140 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 163 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 186 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 222 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 240 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 254 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 262 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 295 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
|
D | 6x8inc-minmax-wasmsimd-x86-splat.c | 83 v128_t vacc4x4567 = wasm_v128_load(acc + 36); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() local 123 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 145 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 167 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 189 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 224 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 242 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 256 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 264 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat() 297 vacc4x0123 = vacc4x4567; in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 5x8s4-minmax-wasmsimd-x86.c | 75 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() local 104 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 124 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 144 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 164 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 196 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 212 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 224 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 228 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 258 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86()
|
D | 5x8s4-minmax-wasmsimd-arm.c | 77 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() local 106 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 126 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 146 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 166 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 198 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 213 vacc4x4567 = wasm_f32x4_max(vacc4x4567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 224 vacc4x4567 = wasm_f32x4_min(vacc4x4567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 228 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 258 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm()
|
D | 5x8-minmax-wasmsimd-x86-splat.c | 75 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() local 109 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 128 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 147 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 166 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 197 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 213 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 225 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 229 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 259 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat()
|
D | 5x8-minmax-neonfma-lane-ld64.c | 76 float32x4_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64() local 98 vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0); in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64() 111 vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1); in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64() 132 vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567); in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64() 144 vacc4x4567 = vminq_f32(vacc4x4567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64() 156 vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64() 160 vst1q_f32(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64() 191 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64()
|
D | 5x8-minmax-neon-lane-ld64.c | 76 float32x4_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64() local 98 vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0); in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64() 111 vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1); in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64() 132 vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567); in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64() 144 vacc4x4567 = vminq_f32(vacc4x4567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64() 156 vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64() 160 vst1q_f32(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64() 191 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64()
|
D | 5x8-minmax-sse-dup.c | 75 __m128 vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() local 110 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4c0000, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 130 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4c1111, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 150 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4c2222, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 170 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4c3333, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 201 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 217 vacc4x4567 = _mm_min_ps(vacc4x4567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 229 vacc4x4567 = _mm_max_ps(vacc4x4567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 233 _mm_storeu_ps(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup() 263 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8__sse_dup()
|
D | 5x8-minmax-wasmsimd-arm-splat.c | 77 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() local 111 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 130 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 149 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 168 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 199 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 214 vacc4x4567 = wasm_f32x4_max(vacc4x4567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 225 vacc4x4567 = wasm_f32x4_min(vacc4x4567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 229 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat() 259 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat()
|
D | 5x8s4-minmax-sse.c | 75 __m128 vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() local 104 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 124 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 144 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 164 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 196 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 212 vacc4x4567 = _mm_min_ps(vacc4x4567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 224 vacc4x4567 = _mm_max_ps(vacc4x4567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 228 _mm_storeu_ps(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 258 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_5x8s4__sse()
|
D | 6x8-minmax-wasmsimd-x86-splat.c | 81 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() local 121 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 143 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 165 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 187 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 222 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 240 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 254 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 262 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 295 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat()
|
D | 6x8s4-minmax-wasmsimd-x86.c | 81 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() local 115 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 138 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 161 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 184 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 220 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 238 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 252 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 260 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 293 vacc4x0123 = vacc4x4567; in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x8-minmax-wasmsimd-x86-splat.c | 71 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() local 134 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 153 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 172 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 191 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 222 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 239 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 251 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 255 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat() 280 vacc4x0123 = vacc4x4567; in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat()
|
D | 5x8s4-minmax-wasmsimd-x86.c | 71 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() local 129 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 149 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 169 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 189 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 221 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 238 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 250 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 254 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86() 279 vacc4x0123 = vacc4x4567; in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86()
|
D | 5x8s4-minmax-wasmsimd-arm.c | 73 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() local 131 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 151 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 171 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 191 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 223 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 239 vacc4x4567 = wasm_f32x4_max(vacc4x4567, vmin); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 250 vacc4x4567 = wasm_f32x4_min(vacc4x4567, vmax); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 254 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 279 vacc4x0123 = vacc4x4567; in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm()
|
D | 5x8s4-minmax-sse.c | 71 __m128 vacc4x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() local 129 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 149 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 169 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 189 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c3)); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 221 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567)); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 238 vacc4x4567 = _mm_min_ps(vacc4x4567, vmax); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 250 vacc4x4567 = _mm_max_ps(vacc4x4567, vmin); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 254 _mm_storeu_ps(c4 + 4, vacc4x4567); in xnn_f32_igemm_minmax_ukernel_5x8s4__sse() 279 vacc4x0123 = vacc4x4567; in xnn_f32_igemm_minmax_ukernel_5x8s4__sse()
|
D | 6x8s4-minmax-wasmsimd-x86.c | 75 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() local 143 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 166 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 189 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 212 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c3)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 247 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 267 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 281 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 289 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 316 vacc4x0123 = vacc4x4567; in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
|
D | 6x8-minmax-wasmsimd-x86-splat.c | 75 v128_t vacc4x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() local 149 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c0, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 171 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c1, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 193 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c2, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 215 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4c3, vb4567c3)); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 249 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567)); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 269 vacc4x4567 = wasm_v128_bitselect(vmin, vacc4x4567, wasm_f32x4_lt(vacc4x4567, vmin)); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 283 vacc4x4567 = wasm_v128_bitselect(vacc4x4567, vmax, wasm_f32x4_le(vacc4x4567, vmax)); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 291 wasm_v128_store(c4 + 4, vacc4x4567); in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat() 318 vacc4x0123 = vacc4x4567; in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat()
|