/external/XNNPACK/src/qs8-igemm/gen/
D | 1x8-minmax-neon-mlal-lane.c  (all matches in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane())
   47:  int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  [local]
   66:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
   71:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
   76:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
   81:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
   87:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
   92:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
   97:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
  102:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
  114:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
  [all …]
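The "mlal-lane" kernels above all share one update shape: int8 inputs and weights are first sign-extended to int16 (the vxa0/vxb01234567cN vectors), then one input lane at a time is multiplied into eight weights with a widening multiply-accumulate into two int32x4 accumulators. A minimal sketch of a single step follows; the helper name and argument names are illustrative, not the XNNPACK source:

    #include <arm_neon.h>

    /* One "mlal-lane" step, assuming xa holds four sign-extended input
     * values and xb01234567 holds eight sign-extended weights.  The real
     * kernels unroll this over lanes 0..3 of both halves of the input. */
    static inline void mlal_lane_step(int32x4_t *acc0123, int32x4_t *acc4567,
                                      int16x8_t xb01234567, int16x4_t xa) {
      /* acc[n] += xb[n] * xa[0]; int16 x int16 widened into int32 */
      *acc0123 = vmlal_lane_s16(*acc0123, vget_low_s16(xb01234567), xa, 0);
      *acc4567 = vmlal_lane_s16(*acc4567, vget_high_s16(xb01234567), xa, 0);
    }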
D | 1x8-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup())
   47:  int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  [local]
   65:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
   70:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
   75:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
   80:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
   85:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
   90:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
   95:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
  100:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
  111:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
  [all …]
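The "mull-addw-dup" variant reaches the same int32 accumulators by a different route: it duplicates one input byte across an int8x8 vector, multiplies it against eight int8 weights into an int16x8 product (the vprod0x01234567cN values above), and then widen-adds each half of the product. A hedged sketch of one step, names illustrative:

    #include <arm_neon.h>

    /* One "mull-addw-dup" step: dup one input byte, widening multiply,
     * then widen-add both halves of the int16x8 product. */
    static inline void mull_addw_dup_step(int32x4_t *acc0123, int32x4_t *acc4567,
                                          int8x8_t va0, int8x8_t vb01234567) {
      const int8x8_t va0_dup = vdup_lane_s8(va0, 0);          /* broadcast lane 0     */
      const int16x8_t vprod = vmull_s8(vb01234567, va0_dup);  /* int8*int8 -> int16x8 */
      *acc0123 = vaddw_s16(*acc0123, vget_low_s16(vprod));    /* widen-add low half   */
      *acc4567 = vaddw_s16(*acc4567, vget_high_s16(vprod));   /* widen-add high half  */
    }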
D | 2x8-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup())
   51:  int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  [local]
   53:  int32x4_t vacc1x4567 = vacc0x4567;
   76:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
   84:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
   92:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
  100:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
  108:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
  116:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
  124:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
  132:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
  [all …]
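The 2x8 kernel differs from the 1x8 ones mainly in accumulator setup: as line 53 above shows, the per-channel bias is loaded from the packed weights once and copied into the second row, since every output row starts from the same bias. A sketch of that setup (illustrative wrapper, not the kernel's code):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Seed both output rows from the same per-channel bias held at the
     * head of the packed weights w. */
    static inline void init_two_rows(const void *w,
                                     int32x4_t *vacc0, int32x4_t *vacc1) {
      *vacc0 = vld1q_s32((const int32_t*) w);  /* row 0: bias          */
      *vacc1 = *vacc0;                         /* row 1: copy of row 0 */
    }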
/external/XNNPACK/src/qs8-gemm/gen/
D | 1x8-minmax-neon-mlal-lane.c  (all matches in xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane())
   44:  int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  [local]
   55:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
   60:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
   65:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
   70:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
   76:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
   81:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
   86:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
   91:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
  103:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
  [all …]
D | 1x8-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup())
   44:  int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  [local]
   54:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
   59:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
   64:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
   69:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
   74:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
   79:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
   84:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
   89:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
  100:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
  [all …]
D | 2x8-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup())
   50:  int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  [local]
   52:  int32x4_t vacc1x4567 = vacc0x4567;
   63:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
   71:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
   79:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
   87:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
   95:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
  103:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
  111:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
  119:  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
  [all …]
D | 2x8-minmax-neon-mlal-lane.c  (all matches in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane())
   50:  int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  [local]
   52:  int32x4_t vacc1x4567 = vacc0x4567;
   65:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
   72:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
   79:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
   86:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
   94:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
  101:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
  108:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
  115:  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
  [all …]
/external/XNNPACK/src/f32-gemm/gen/
D | 1x8s4-minmax-wasmsimd-x86.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86())
   43:  v128_t vacc0x4567 = wasm_v128_load(w + 4);  [local]
   56:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
   64:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
   72:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   80:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c3));
   96:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  104:  vacc0x4567 = wasm_v128_bitselect(vmin, vacc0x4567, wasm_f32x4_lt(vacc0x4567, vmin));
  108:  vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vmax, wasm_f32x4_le(vacc0x4567, vmax));
  112:  wasm_v128_store(c0 + 4, vacc0x4567);
  122:  vacc0x0123 = vacc0x4567;
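In the x86-suffixed WAsm SIMD kernels the final min/max clamp is spelled as a compare plus wasm_v128_bitselect rather than wasm_f32x4_min/max; the direct forms carry NaN-propagating semantics that lower to expensive instruction sequences on x86, while compare-plus-bitselect maps onto cheap SSE compare/blend code. A sketch of the clamp in isolation (illustrative helper name):

    #include <wasm_simd128.h>

    /* Clamp vacc into [vmin, vmax] using compare + bitselect. */
    static inline v128_t clamp_x86_style(v128_t vacc, v128_t vmin, v128_t vmax) {
      /* where vacc < vmin, take vmin, else keep vacc */
      vacc = wasm_v128_bitselect(vmin, vacc, wasm_f32x4_lt(vacc, vmin));
      /* where vacc <= vmax, keep vacc, else take vmax */
      vacc = wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));
      return vacc;
    }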
D | 1x8-minmax-wasmsimd-x86-splat.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat())
   43:  v128_t vacc0x4567 = wasm_v128_load(w + 4);  [local]
   57:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c0, vb4567c0));
   64:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c1, vb4567c1));
   71:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c2, vb4567c2));
   78:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c3, vb4567c3));
   93:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  101:  vacc0x4567 = wasm_v128_bitselect(vmin, vacc0x4567, wasm_f32x4_lt(vacc0x4567, vmin));
  105:  vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vmax, wasm_f32x4_le(vacc0x4567, vmax));
  109:  wasm_v128_store(c0 + 4, vacc0x4567);
  119:  vacc0x0123 = vacc0x4567;
D | 1x8s4-minmax-neon.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8s4__neon())
   44:  float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;  [local]
   55:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0);
   63:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1);
   71:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2);
   79:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c3);
   93:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
  100:  vacc0x4567 = vminq_f32(vacc0x4567, vmax);
  104:  vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
  108:  vst1q_f32(c0 + 4, vacc0x4567);
  119:  vacc0x0123 = vacc0x4567;
D | 1x8s4-minmax-neonfma.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma())
   44:  float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;  [local]
   55:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
   63:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
   71:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
   79:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3);
   93:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
  100:  vacc0x4567 = vminq_f32(vacc0x4567, vmax);
  104:  vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
  108:  vst1q_f32(c0 + 4, vacc0x4567);
  119:  vacc0x0123 = vacc0x4567;
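The neon and neonfma files above are otherwise identical; the only visible difference is vmlaq_f32 (separate multiply and add, two roundings) versus vfmaq_f32 (fused multiply-add, one rounding, requiring VFPv4/ARMv8). A sketch of the per-step update under that assumption:

    #include <arm_neon.h>

    /* acc += va * vb, fused when the target has FMA. */
    static inline float32x4_t f32_madd(float32x4_t acc,
                                       float32x4_t va, float32x4_t vb) {
    #if defined(__ARM_FEATURE_FMA)
      return vfmaq_f32(acc, va, vb);  /* one rounding (neonfma kernels)      */
    #else
      return vmlaq_f32(acc, va, vb);  /* two roundings (plain neon kernels)  */
    #endif
    }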
D | 1x8s4-minmax-sse.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8s4__sse())
   43:  __m128 vacc0x4567 = _mm_load_ps(w + 4);  [local]
   56:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0));
   64:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1));
   72:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2));
   80:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c3));
   96:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
  104:  vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
  108:  vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
  112:  _mm_storeu_ps(c0 + 4, vacc0x4567);
  122:  vacc0x0123 = vacc0x4567;
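The SSE kernel follows the same pattern with separate _mm_mul_ps/_mm_add_ps (baseline SSE has no FMA) and clamps with _mm_min_ps/_mm_max_ps before the unaligned store. A sketch of one update plus the final clamp (illustrative helper, not the kernel's code):

    #include <xmmintrin.h>

    /* acc += va * vb, then clamp into [vmin, vmax] as the kernel does
     * before storing with _mm_storeu_ps. */
    static inline __m128 sse_madd_clamp(__m128 acc, __m128 va, __m128 vb,
                                        __m128 vmin, __m128 vmax) {
      acc = _mm_add_ps(acc, _mm_mul_ps(va, vb));
      acc = _mm_min_ps(acc, vmax);  /* upper bound first, as in the listing */
      acc = _mm_max_ps(acc, vmin);  /* then lower bound                     */
      return acc;
    }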
D | 1x8s4-minmax-wasmsimd-arm.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm())
   45:  v128_t vacc0x4567 = wasm_v128_load(w + 4);  [local]
   58:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
   66:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
   74:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   82:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c3));
   98:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  105:  vacc0x4567 = wasm_f32x4_max(vacc0x4567, vmin);
  108:  vacc0x4567 = wasm_f32x4_min(vacc0x4567, vmax);
  112:  wasm_v128_store(c0 + 4, vacc0x4567);
  122:  vacc0x0123 = vacc0x4567;
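The arm-suffixed WAsm SIMD files clamp with the direct wasm_f32x4_max/min forms, which ARM targets lower to single fmax/fmin instructions; compare the bitselect form in the x86 files above. In isolation:

    #include <wasm_simd128.h>

    /* Clamp vacc into [vmin, vmax] with the direct min/max forms. */
    static inline v128_t clamp_arm_style(v128_t vacc, v128_t vmin, v128_t vmax) {
      vacc = wasm_f32x4_max(vacc, vmin);  /* lower bound */
      vacc = wasm_f32x4_min(vacc, vmax);  /* upper bound */
      return vacc;
    }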
D | 1x8-minmax-wasmsimd-arm-splat.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat())
   45:  v128_t vacc0x4567 = wasm_v128_load(w + 4);  [local]
   59:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c0, vb4567c0));
   66:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c1, vb4567c1));
   73:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c2, vb4567c2));
   80:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c3, vb4567c3));
   95:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  102:  vacc0x4567 = wasm_f32x4_max(vacc0x4567, vmin);
  105:  vacc0x4567 = wasm_f32x4_min(vacc0x4567, vmax);
  109:  wasm_v128_store(c0 + 4, vacc0x4567);
  119:  vacc0x0123 = vacc0x4567;
D | 1x8-minmax-sse-dup.c  (all matches in xnn_f32_gemm_minmax_ukernel_1x8__sse_dup())
   43:  __m128 vacc0x4567 = _mm_load_ps(w + 4);  [local]
   58:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c0000, vb4567c0));
   66:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c1111, vb4567c1));
   74:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c2222, vb4567c2));
   82:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c3333, vb4567c3));
   97:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
  105:  vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
  109:  vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
  113:  _mm_storeu_ps(c0 + 4, vacc0x4567);
  123:  vacc0x0123 = vacc0x4567;
/external/XNNPACK/src/f32-gemm/gen-inc/
D | 1x8s4inc-minmax-wasmsimd-x86.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86())
   45:  v128_t vacc0x4567 = wasm_v128_load(acc + 4);  [local]
   58:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
   66:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
   74:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   82:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c3));
   98:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  106:  vacc0x4567 = wasm_v128_bitselect(vmin, vacc0x4567, wasm_f32x4_lt(vacc0x4567, vmin));
  110:  vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vmax, wasm_f32x4_le(vacc0x4567, vmax));
  114:  wasm_v128_store(c0 + 4, vacc0x4567);
  124:  vacc0x0123 = vacc0x4567;
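The gen-inc (GEMMINC) kernels in this directory differ from the gen ones only in initialization: the accumulators are loaded from a separate partial-results buffer acc rather than from the bias at the head of the packed weights w, evidently so a K dimension split across multiple passes can be resumed. A sketch of the contrast (illustrative helper, not the kernels' shared code):

    #include <arm_neon.h>
    #include <stddef.h>

    /* Seed the 4..7 accumulator lane group either from a partial-result
     * buffer (gen-inc) or from the per-channel bias in the packed weights. */
    static inline float32x4_t init_acc4567(const float *w, const float *acc) {
      return acc != NULL ? vld1q_f32(acc + 4)  /* gen-inc: resume partial sums */
                         : vld1q_f32(w + 4);   /* gen: start from the bias     */
    }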
D | 1x8inc-minmax-wasmsimd-x86-splat.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8__wasmsimd_x86_splat())
   45:  v128_t vacc0x4567 = wasm_v128_load(acc + 4);  [local]
   59:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c0, vb4567c0));
   66:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c1, vb4567c1));
   73:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c2, vb4567c2));
   80:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c3, vb4567c3));
   95:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  103:  vacc0x4567 = wasm_v128_bitselect(vmin, vacc0x4567, wasm_f32x4_lt(vacc0x4567, vmin));
  107:  vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vmax, wasm_f32x4_le(vacc0x4567, vmax));
  111:  wasm_v128_store(c0 + 4, vacc0x4567);
  121:  vacc0x0123 = vacc0x4567;
D | 1x8s4inc-minmax-wasmsimd-arm.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm())
   47:  v128_t vacc0x4567 = wasm_v128_load(acc + 4);  [local]
   60:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
   68:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
   76:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   84:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c3));
  100:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  107:  vacc0x4567 = wasm_f32x4_max(vacc0x4567, vmin);
  110:  vacc0x4567 = wasm_f32x4_min(vacc0x4567, vmax);
  114:  wasm_v128_store(c0 + 4, vacc0x4567);
  124:  vacc0x0123 = vacc0x4567;
D | 1x8s4inc-minmax-neon.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon())
   46:  float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;  [local]
   57:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0);
   65:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1);
   73:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2);
   81:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c3);
   95:  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
  102:  vacc0x4567 = vminq_f32(vacc0x4567, vmax);
  106:  vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
  110:  vst1q_f32(c0 + 4, vacc0x4567);
  121:  vacc0x0123 = vacc0x4567;
D | 1x8s4inc-minmax-neonfma.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma())
   46:  float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;  [local]
   57:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
   65:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
   73:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
   81:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3);
   95:  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
  102:  vacc0x4567 = vminq_f32(vacc0x4567, vmax);
  106:  vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
  110:  vst1q_f32(c0 + 4, vacc0x4567);
  121:  vacc0x0123 = vacc0x4567;
D | 1x8inc-minmax-wasmsimd-arm-splat.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8__wasmsimd_arm_splat())
   47:  v128_t vacc0x4567 = wasm_v128_load(acc + 4);  [local]
   61:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c0, vb4567c0));
   68:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c1, vb4567c1));
   75:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c2, vb4567c2));
   82:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c3, vb4567c3));
   97:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  104:  vacc0x4567 = wasm_f32x4_max(vacc0x4567, vmin);
  107:  vacc0x4567 = wasm_f32x4_min(vacc0x4567, vmax);
  111:  wasm_v128_store(c0 + 4, vacc0x4567);
  121:  vacc0x0123 = vacc0x4567;
D | 1x8s4inc-minmax-sse.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse())
   45:  __m128 vacc0x4567 = _mm_load_ps(acc + 4);  [local]
   58:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0));
   66:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1));
   74:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2));
   82:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c3));
   98:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
  106:  vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
  110:  vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
  114:  _mm_storeu_ps(c0 + 4, vacc0x4567);
  124:  vacc0x0123 = vacc0x4567;
D | 1x8inc-minmax-sse-dup.c  (all matches in xnn_f32_gemminc_minmax_ukernel_1x8__sse_dup())
   45:  __m128 vacc0x4567 = _mm_load_ps(acc + 4);  [local]
   60:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c0000, vb4567c0));
   68:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c1111, vb4567c1));
   76:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c2222, vb4567c2));
   84:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c3333, vb4567c3));
   99:  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
  107:  vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
  111:  vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
  115:  _mm_storeu_ps(c0 + 4, vacc0x4567);
  125:  vacc0x0123 = vacc0x4567;
/external/XNNPACK/src/f32-igemm/gen/
D | 1x8-minmax-wasmsimd-x86-splat.c  (all matches in xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat())
   47:  v128_t vacc0x4567 = wasm_v128_load(w + 4);  [local]
   70:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c0, vb4567c0));
   77:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c1, vb4567c1));
   84:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c2, vb4567c2));
   91:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0c3, vb4567c3));
  106:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  115:  vacc0x4567 = wasm_v128_bitselect(vmin, vacc0x4567, wasm_f32x4_lt(vacc0x4567, vmin));
  119:  vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vmax, wasm_f32x4_le(vacc0x4567, vmax));
  123:  wasm_v128_store(c0 + 4, vacc0x4567);
  132:  vacc0x0123 = vacc0x4567;
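The f32-igemm kernels reuse the accumulate-and-clamp code of the f32-gemm kernels above; what changes is input addressing: igemm ("indirect GEMM") walks the A matrix through an array of row pointers, one per convolution tap, instead of a dense row. A scalar sketch of that loop structure, with illustrative names and no claim to match the kernel's exact signature:

    #include <stddef.h>

    /* Accumulate one output row over ks taps x kc channels, reading the
     * input through a pointer array; out8 stands in for the vacc0x0123 /
     * vacc0x4567 register pair. */
    static void igemm_row_sketch(const float **a, size_t ks, size_t kc,
                                 const float *w, float out8[8]) {
      for (size_t p = 0; p < ks; p++) {      /* one input pointer per tap */
        const float *a0 = a[p];
        for (size_t k = 0; k < kc; k++) {
          for (size_t n = 0; n < 8; n++) {
            out8[n] += a0[k] * w[n];         /* scalar stand-in for the SIMD MACs */
          }
          w += 8;                            /* packed weights: 8 outputs per k   */
        }
      }
    }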
D | 1x8s4-minmax-wasmsimd-x86.c  (all matches in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86())
   47:  v128_t vacc0x4567 = wasm_v128_load(w + 4);  [local]
   69:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
   77:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
   85:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   93:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c3));
  109:  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
  118:  vacc0x4567 = wasm_v128_bitselect(vmin, vacc0x4567, wasm_f32x4_lt(vacc0x4567, vmin));
  122:  vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vmax, wasm_f32x4_le(vacc0x4567, vmax));
  126:  wasm_v128_store(c0 + 4, vacc0x4567);
  135:  vacc0x0123 = vacc0x4567;