/external/XNNPACK/src/f32-gemm/gen/
D | 6x8-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat():
     82  v128_t vacc5x0123 = vacc0x0123;  (local)
    110  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    127  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    141  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    150  wasm_v128_store(c5, vacc5x0123);
    179  wasm_v128_store(c5, vacc5x0123);
    186  vacc5x0123 = vacc5x4567;
    201  *((double*) c5) = wasm_f64x2_extract_lane(vacc5x0123, 0);
    208  vacc5x0123 = wasm_v32x4_shuffle(vacc5x0123, vacc5x0123, 2, 3, 2, 3);
    223  *c5 = wasm_f32x4_extract_lane(vacc5x0123, 0);

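Note: the *_x86_* WasmSIMD kernels implement the output clamp (lines 127 and 141 above) with a comparison plus wasm_v128_bitselect rather than wasm_f32x4_max/min; the fully NaN-propagating wasm min/max generally lower to multi-instruction sequences on x86, while compare-and-select maps onto cheap blends. A minimal sketch of the idiom as a standalone helper (hypothetical, not part of the XNNPACK sources):

    #include <wasm_simd128.h>

    // Clamp vacc to [vmin, vmax] the way the x86-tuned kernels do:
    // take vmin wherever vacc < vmin, then keep vacc only where vacc <= vmax.
    static inline v128_t clamp_bitselect(v128_t vacc, v128_t vmin, v128_t vmax) {
      vacc = wasm_v128_bitselect(vmin, vacc, wasm_f32x4_lt(vacc, vmin));
      vacc = wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));
      return vacc;
    }
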
D | 6x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat():
     84  v128_t vacc5x0123 = vacc0x0123;  (local)
    112  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    128  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    141  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    150  wasm_v128_store(c5, vacc5x0123);
    179  wasm_v128_store(c5, vacc5x0123);
    186  vacc5x0123 = vacc5x4567;
    201  *((double*) c5) = wasm_f64x2_extract_lane(vacc5x0123, 0);
    208  vacc5x0123 = wasm_v32x4_shuffle(vacc5x0123, vacc5x0123, 2, 3, 2, 3);
    223  *c5 = wasm_f32x4_extract_lane(vacc5x0123, 0);

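Note: the *_arm_* variants clamp directly with wasm_f32x4_max/min (lines 128 and 141 above), which map one-to-one onto the AArch64 FMAX/FMIN vector instructions. The equivalent sketch, under the same caveats as above:

    #include <wasm_simd128.h>

    // Clamp vacc to [vmin, vmax] with native vector min/max, as in the ARM-tuned kernels.
    static inline v128_t clamp_minmax(v128_t vacc, v128_t vmin, v128_t vmax) {
      vacc = wasm_f32x4_max(vacc, vmin);  // enforce the lower bound
      vacc = wasm_f32x4_min(vacc, vmax);  // enforce the upper bound
      return vacc;
    }
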
D | 6x8-minmax-wasmsimd-x86-splat.c | in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat():
     82  v128_t vacc5x0123 = vacc0x0123;  (local)
    116  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c0, vb0123c0));
    138  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c1, vb0123c1));
    160  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c2, vb0123c2));
    182  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c3, vb0123c3));
    217  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    235  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    249  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    258  wasm_v128_store(c5, vacc5x0123);
    287  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8s4-minmax-wasmsimd-x86.c | in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86():
     82  v128_t vacc5x0123 = vacc0x0123;  (local)
    110  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0));
    133  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1));
    156  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2));
    179  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c3));
    215  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    233  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    247  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    256  wasm_v128_store(c5, vacc5x0123);
    285  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8-minmax-wasmsimd-arm-splat.c | in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat():
     84  v128_t vacc5x0123 = vacc0x0123;  (local)
    118  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c0, vb0123c0));
    140  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c1, vb0123c1));
    162  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c2, vb0123c2));
    184  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c3, vb0123c3));
    219  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    236  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    249  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    258  wasm_v128_store(c5, vacc5x0123);
    287  wasm_v128_store(c5, vacc5x0123);
    [all …]

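Note: in the *splat* kernels one 128-bit load of row 5's A values covers four k-steps: each lane is broadcast with wasm_v32x4_shuffle (the va5c0…va5c3 temporaries above) and multiplied into a different packed B vector. A sketch of one such group for a single 4-column accumulator, assuming B is packed 8 floats per k-step as in these 6x8 kernels (helper name and layout are assumptions):

    #include <wasm_simd128.h>

    // Four k-steps of the splat scheme for one row and one 4-column accumulator.
    static v128_t splat_4steps(v128_t vacc, const float* a5, const float* w) {
      const v128_t va5 = wasm_v128_load(a5);                        // a5[0..3]
      const v128_t va5c0 = wasm_v32x4_shuffle(va5, va5, 0, 0, 0, 0);
      const v128_t va5c1 = wasm_v32x4_shuffle(va5, va5, 1, 1, 1, 1);
      const v128_t va5c2 = wasm_v32x4_shuffle(va5, va5, 2, 2, 2, 2);
      const v128_t va5c3 = wasm_v32x4_shuffle(va5, va5, 3, 3, 3, 3);
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5c0, wasm_v128_load(w +  0)));
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5c1, wasm_v128_load(w +  8)));
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5c2, wasm_v128_load(w + 16)));
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5c3, wasm_v128_load(w + 24)));
      return vacc;
    }
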
D | 6x8s4-minmax-wasmsimd-arm.c | in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm():
     84  v128_t vacc5x0123 = vacc0x0123;  (local)
    112  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0));
    135  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1));
    158  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2));
    181  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c3));
    217  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    234  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    247  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    256  wasm_v128_store(c5, vacc5x0123);
    285  wasm_v128_store(c5, vacc5x0123);
    [all …]

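Note: the *s4* kernels trade the broadcasts away: the same A vector serves four k-steps and is rotated one lane between multiplies, with the B panel pre-packed to match the rotation. A sketch under the same packing assumption (hypothetical helper):

    #include <wasm_simd128.h>

    // Four k-steps of the s4 scheme: multiply, rotate A one lane left, repeat.
    static v128_t s4_4steps(v128_t vacc, v128_t va5, const float* w) {
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5, wasm_v128_load(w +  0)));
      va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0);
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5, wasm_v128_load(w +  8)));
      va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0);
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5, wasm_v128_load(w + 16)));
      va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0);
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va5, wasm_v128_load(w + 24)));
      return vacc;
    }
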
D | 6x8-minmax-neonfma-lane-ld128.c | in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128():
     83  float32x4_t vacc5x0123 = vacc0x0123;  (local)
    104  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    120  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    136  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    152  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    177  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
    194  vacc5x0123 = vminq_f32(vacc5x0123, vmax);
    208  vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
    217  vst1q_f32(c5, vacc5x0123);
    247  vst1q_f32(c5, vacc5x0123); c5 += 4;
    [all …]

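Note: the NEON ld128 kernels express the same four-steps-per-load structure with lane-indexed fused multiply-adds: a single vld1q_f32 of a5 feeds vfmaq_lane_f32 once per lane. A sketch for an AArch64 target with the same assumed packing (hypothetical helper):

    #include <arm_neon.h>

    // Four k-steps for one row and one 4-column accumulator, ld128 lane scheme.
    static float32x4_t fma_lane_4steps(float32x4_t vacc, const float* a5, const float* w) {
      const float32x4_t va5 = vld1q_f32(a5);
      vacc = vfmaq_lane_f32(vacc, vld1q_f32(w +  0), vget_low_f32(va5),  0);
      vacc = vfmaq_lane_f32(vacc, vld1q_f32(w +  8), vget_low_f32(va5),  1);
      vacc = vfmaq_lane_f32(vacc, vld1q_f32(w + 16), vget_high_f32(va5), 0);
      vacc = vfmaq_lane_f32(vacc, vld1q_f32(w + 24), vget_high_f32(va5), 1);
      return vacc;
    }
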
D | 6x8s4-minmax-neon.c | in xnn_f32_gemm_minmax_ukernel_6x8s4__neon():
     83  float32x4_t vacc5x0123 = vacc0x0123;  (local)
    104  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    127  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    150  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    173  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c3);
    202  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
    219  vacc5x0123 = vminq_f32(vacc5x0123, vmax);
    233  vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
    242  vst1q_f32(c5, vacc5x0123);
    272  vst1q_f32(c5, vacc5x0123); c5 += 4;
    [all …]

D | 6x8s4-minmax-neonfma.c | in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma():
     83  float32x4_t vacc5x0123 = vacc0x0123;  (local)
    104  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
    127  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
    150  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    173  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c3);
    202  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
    219  vacc5x0123 = vminq_f32(vacc5x0123, vmax);
    233  vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
    242  vst1q_f32(c5, vacc5x0123);
    272  vst1q_f32(c5, vacc5x0123); c5 += 4;
    [all …]

D | 6x8-minmax-neon-lane-ld128.c | in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128():
     83  float32x4_t vacc5x0123 = vacc0x0123;  (local)
    104  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    120  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    136  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    152  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    177  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
    194  vacc5x0123 = vminq_f32(vacc5x0123, vmax);
    208  vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
    217  vst1q_f32(c5, vacc5x0123);
    247  vst1q_f32(c5, vacc5x0123); c5 += 4;
    [all …]

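Note: the *_neon_* kernels are the non-FMA twins of the *_neonfma_* ones: vmlaq_lane_f32/vmlaq_f32 round the multiply and the add separately, while vfmaq_lane_f32/vfmaq_f32 fuse them into one rounding. The instruction mix is otherwise identical; an illustrative fragment with the same operands:

    vacc = vmlaq_lane_f32(vacc, vb0123c0, vget_low_f32(va5), 0);  // neon: two roundings
    vacc = vfmaq_lane_f32(vacc, vb0123c0, vget_low_f32(va5), 0);  // neonfma: one fused rounding
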
/external/XNNPACK/src/f32-gemm/gen-inc/
D | 6x8inc-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_loadsplat():
     84  v128_t vacc5x0123 = wasm_v128_load(acc + 40);  (local)
    112  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    129  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    143  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    152  wasm_v128_store(c5, vacc5x0123);
    181  wasm_v128_store(c5, vacc5x0123);
    188  vacc5x0123 = vacc5x4567;
    203  *((double*) c5) = wasm_f64x2_extract_lane(vacc5x0123, 0);
    210  vacc5x0123 = wasm_v32x4_shuffle(vacc5x0123, vacc5x0123, 2, 3, 2, 3);
    225  *c5 = wasm_f32x4_extract_lane(vacc5x0123, 0);

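Note: the gen-inc (GEMMINC) kernels differ from gen/ only in how the accumulators start: instead of copying the bias row (vacc5x0123 = vacc0x0123), they resume from a caller-provided partial-sum buffer. With the 6x8 tile laid out 8 floats per row, row 5 begins 5 * 8 = 40 floats in, which is the acc + 40 seen above. A sketch (hypothetical helper):

    #include <wasm_simd128.h>

    // Resume row 5's two accumulators from a 6x8 partial-sum buffer.
    static void load_row5_acc(const float* acc, v128_t* vacc5x0123, v128_t* vacc5x4567) {
      *vacc5x0123 = wasm_v128_load(acc + 40);  // row 5, columns 0..3
      *vacc5x4567 = wasm_v128_load(acc + 44);  // row 5, columns 4..7
    }
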
D | 6x8inc-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_arm_loadsplat():
     86  v128_t vacc5x0123 = wasm_v128_load(acc + 40);  (local)
    114  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    130  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    143  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    152  wasm_v128_store(c5, vacc5x0123);
    181  wasm_v128_store(c5, vacc5x0123);
    188  vacc5x0123 = vacc5x4567;
    203  *((double*) c5) = wasm_f64x2_extract_lane(vacc5x0123, 0);
    210  vacc5x0123 = wasm_v32x4_shuffle(vacc5x0123, vacc5x0123, 2, 3, 2, 3);
    225  *c5 = wasm_f32x4_extract_lane(vacc5x0123, 0);

D | 6x8s4inc-minmax-wasmsimd-x86.c | in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86():
     84  v128_t vacc5x0123 = wasm_v128_load(acc + 40);  (local)
    112  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0));
    135  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1));
    158  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2));
    181  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c3));
    217  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    235  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    249  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    258  wasm_v128_store(c5, vacc5x0123);
    287  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8inc-minmax-wasmsimd-x86-splat.c | in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_splat():
     84  v128_t vacc5x0123 = wasm_v128_load(acc + 40);  (local)
    118  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c0, vb0123c0));
    140  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c1, vb0123c1));
    162  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c2, vb0123c2));
    184  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c3, vb0123c3));
    219  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    237  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    251  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    260  wasm_v128_store(c5, vacc5x0123);
    289  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8s4inc-minmax-wasmsimd-arm.c | in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm():
     86  v128_t vacc5x0123 = wasm_v128_load(acc + 40);  (local)
    114  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0));
    137  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1));
    160  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2));
    183  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c3));
    219  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    236  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    249  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    258  wasm_v128_store(c5, vacc5x0123);
    287  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8inc-minmax-wasmsimd-arm-splat.c | in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_arm_splat():
     86  v128_t vacc5x0123 = wasm_v128_load(acc + 40);  (local)
    120  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c0, vb0123c0));
    142  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c1, vb0123c1));
    164  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c2, vb0123c2));
    186  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c3, vb0123c3));
    221  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    238  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    251  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    260  wasm_v128_store(c5, vacc5x0123);
    289  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8inc-minmax-neon-lane-ld128.c | in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128():
     85  float32x4_t vacc5x0123 = vld1q_f32(acc); acc += 4;  (local)
    106  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    122  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    138  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    154  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    179  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
    196  vacc5x0123 = vminq_f32(vacc5x0123, vmax);
    210  vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
    219  vst1q_f32(c5, vacc5x0123);
    249  vst1q_f32(c5, vacc5x0123); c5 += 4;
    [all …]

D | 6x8s4inc-minmax-neon.c | in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon():
     85  float32x4_t vacc5x0123 = vld1q_f32(acc); acc += 4;  (local)
    106  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    129  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    152  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    175  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c3);
    204  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
    221  vacc5x0123 = vminq_f32(vacc5x0123, vmax);
    235  vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
    244  vst1q_f32(c5, vacc5x0123);
    274  vst1q_f32(c5, vacc5x0123); c5 += 4;
    [all …]

/external/XNNPACK/src/f32-igemm/gen/
D | 6x8s4-minmax-wasmsimd-x86.c | in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86():
     76  v128_t vacc5x0123 = vacc0x0123;  (local)
    138  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0));
    161  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1));
    184  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2));
    207  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c3));
    248  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    262  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    276  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    285  wasm_v128_store(c5, vacc5x0123);
    308  wasm_v128_store(c5, vacc5x0123);
    [all …]

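Note: the f32-igemm kernels compute the same 6x8 microtile but read A through an indirection buffer (an array of row pointers) instead of a dense matrix, which is how XNNPACK feeds convolution patches into GEMM. A sketch of the per-row pointer setup, with names assumed from the usual XNNPACK convention rather than copied from this file:

    #include <stddef.h>
    #include <stdint.h>

    // Fetch row 5's input pointer: entries equal to `zero` point at a shared
    // zero buffer and stay put; real rows are shifted by a_offset.
    static const float* igemm_row5_ptr(const float* const* a, const float* zero, size_t a_offset) {
      const float* a5 = a[5];
      if (a5 != zero) {
        a5 = (const float*) ((uintptr_t) a5 + a_offset);
      }
      return a5;
    }
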
D | 6x8-minmax-wasmsimd-x86-splat.c | in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat():
     76  v128_t vacc5x0123 = vacc0x0123;  (local)
    144  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c0, vb0123c0));
    166  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c1, vb0123c1));
    188  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c2, vb0123c2));
    210  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c3, vb0123c3));
    250  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    264  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    278  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    287  wasm_v128_store(c5, vacc5x0123);
    310  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat():
     76  v128_t vacc5x0123 = vacc0x0123;  (local)
    143  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    156  vacc5x0123 = wasm_v128_bitselect(vmin, vacc5x0123, wasm_f32x4_lt(vacc5x0123, vmin));
    170  vacc5x0123 = wasm_v128_bitselect(vacc5x0123, vmax, wasm_f32x4_le(vacc5x0123, vmax));
    179  wasm_v128_store(c5, vacc5x0123);
    202  wasm_v128_store(c5, vacc5x0123);
    209  vacc5x0123 = vacc5x4567;
    224  *((double*) c5) = wasm_f64x2_extract_lane(vacc5x0123, 0);
    231  vacc5x0123 = wasm_v32x4_shuffle(vacc5x0123, vacc5x0123, 2, 3, 2, 3);
    246  *c5 = wasm_f32x4_extract_lane(vacc5x0123, 0);

D | 6x8s4-minmax-wasmsimd-arm.c | in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm():
     78  v128_t vacc5x0123 = vacc0x0123;  (local)
    140  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0));
    163  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1));
    186  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2));
    209  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c3));
    250  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    263  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    276  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    285  wasm_v128_store(c5, vacc5x0123);
    308  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat():
     78  v128_t vacc5x0123 = vacc0x0123;  (local)
    145  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    157  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    170  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    179  wasm_v128_store(c5, vacc5x0123);
    202  wasm_v128_store(c5, vacc5x0123);
    209  vacc5x0123 = vacc5x4567;
    224  *((double*) c5) = wasm_f64x2_extract_lane(vacc5x0123, 0);
    231  vacc5x0123 = wasm_v32x4_shuffle(vacc5x0123, vacc5x0123, 2, 3, 2, 3);
    246  *c5 = wasm_f32x4_extract_lane(vacc5x0123, 0);

D | 6x8-minmax-wasmsimd-arm-splat.c | in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat():
     78  v128_t vacc5x0123 = vacc0x0123;  (local)
    146  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c0, vb0123c0));
    168  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c1, vb0123c1));
    190  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c2, vb0123c2));
    212  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5c3, vb0123c3));
    252  vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123));
    265  vacc5x0123 = wasm_f32x4_max(vacc5x0123, vmin);
    278  vacc5x0123 = wasm_f32x4_min(vacc5x0123, vmax);
    287  wasm_v128_store(c5, vacc5x0123);
    310  wasm_v128_store(c5, vacc5x0123);
    [all …]

D | 6x8-minmax-neonfma-lane-ld128.c | in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128():
     77  float32x4_t vacc5x0123 = vacc0x0123;  (local)
    132  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    148  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    164  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    180  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    205  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
    226  vacc5x0123 = vminq_f32(vacc5x0123, vmax);
    240  vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
    249  vst1q_f32(c5, vacc5x0123);
    272  vst1q_f32(c5, vacc5x0123); c5 += 4;
    [all …]