Home
last modified time | relevance | path

Searched refs:vacc3x4567 (Results 1 – 25 of 215) sorted by relevance

123456789

/external/XNNPACK/src/qs8-gemm/gen/
D4x8-minmax-neon-mlal-lane.c68 int32x4_t vacc3x4567 = vacc0x4567; in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() local
91 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
102 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
113 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
124 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
136 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
147 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
158 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
169 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
193 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane()
[all …]
D4x8-minmax-neon-mull-addw-dup.c68 int32x4_t vacc3x4567 = vacc0x4567; in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup() local
90 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
104 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
118 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
132 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
146 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
160 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
174 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
188 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
211 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0)); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup()
[all …]
/external/XNNPACK/src/qu8-igemm/
D4x8-minmax-neon.c62 int32x4_t vacc3x4567 = vacc0x4567; in xnn_qu8_igemm_minmax_ukernel_4x8__neon() local
106 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
120 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
134 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
148 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
162 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
176 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
190 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
204 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 3); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
230 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon()
[all …]
/external/XNNPACK/src/qu8-gemm/
D4x8-minmax-neon.c65 int32x4_t vacc3x4567 = vacc0x4567; in xnn_qu8_gemm_minmax_ukernel_4x8__neon() local
88 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
112 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
124 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
136 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
148 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
160 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
172 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
196 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon()
[all …]
/external/XNNPACK/src/qs8-igemm/gen/
D4x8-minmax-neon-mlal-lane.c65 int32x4_t vacc3x4567 = vacc0x4567; in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() local
108 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
119 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
130 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
141 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
153 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
164 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
175 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
186 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
210 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane()
[all …]
D4x8-minmax-neon-mull-addw-dup.c65 int32x4_t vacc3x4567 = vacc0x4567; in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup() local
107 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
121 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
135 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
149 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
163 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
177 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
191 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
205 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
228 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0)); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup()
[all …]
/external/XNNPACK/src/f32-gemm/gen/
D4x8-minmax-neonfma-lane-ld128.c68 float32x4_t vacc3x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128() local
88 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
100 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
112 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
124 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
143 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
156 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
166 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
170 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
196 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
D4x8-minmax-neon-lane-ld128.c68 float32x4_t vacc3x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128() local
88 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
100 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
112 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
124 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
143 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
156 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
166 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
170 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
196 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
D4x8s4-minmax-wasmsimd-x86.c67 v128_t vacc3x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86() local
92 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
109 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
126 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
143 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
171 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
185 vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
195 vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
199 wasm_v128_store(c3 + 4, vacc3x4567); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
224 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
D4x8-minmax-wasmsimd-x86-splat.c67 v128_t vacc3x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat() local
96 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
112 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c1, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
128 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c2, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
144 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c3, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
171 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
185 vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
195 vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
199 wasm_v128_store(c3 + 4, vacc3x4567); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
224 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
D4x8-minmax-neon-dup-ld128.c68 float32x4_t vacc3x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128() local
92 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
108 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c1, vb4567c1); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
124 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c2, vb4567c2); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
140 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c3, vb4567c3); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
159 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
172 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
182 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
186 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
212 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128()
D4x8s4-minmax-wasmsimd-arm.c69 v128_t vacc3x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm() local
94 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
111 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
128 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
145 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
173 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
186 vacc3x4567 = wasm_f32x4_max(vacc3x4567, vmin); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
195 vacc3x4567 = wasm_f32x4_min(vacc3x4567, vmax); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
199 wasm_v128_store(c3 + 4, vacc3x4567); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
224 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
D4x8s4-minmax-neonfma.c68 float32x4_t vacc3x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma() local
88 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
105 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
122 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
139 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
162 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
175 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
185 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
189 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
215 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma()
/external/XNNPACK/src/f32-gemm/gen-inc/
D4x8s4inc-minmax-wasmsimd-x86.c69 v128_t vacc3x4567 = wasm_v128_load(acc + 28); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86() local
94 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
111 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
128 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
145 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
173 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
187 vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
197 vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
201 wasm_v128_store(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
226 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
D4x8inc-minmax-wasmsimd-x86-splat.c69 v128_t vacc3x4567 = wasm_v128_load(acc + 28); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat() local
98 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
114 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c1, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
130 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c2, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
146 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c3, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
173 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
187 vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
197 vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
201 wasm_v128_store(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
226 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
D4x8s4inc-minmax-neonfma.c70 float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma() local
90 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
107 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
124 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
141 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
164 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
177 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
187 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
191 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
217 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma()
D4x8s4inc-minmax-wasmsimd-arm.c71 v128_t vacc3x4567 = wasm_v128_load(acc + 28); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm() local
96 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
113 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
130 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
147 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
175 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
188 vacc3x4567 = wasm_f32x4_max(vacc3x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
197 vacc3x4567 = wasm_f32x4_min(vacc3x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
201 wasm_v128_store(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
226 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
D4x8inc-minmax-neon-lane-ld128.c70 float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128() local
90 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
102 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
114 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
126 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
145 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
158 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
168 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
172 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
198 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128()
D4x8s4inc-minmax-sse.c69 __m128 vacc3x4567 = _mm_load_ps(acc + 28); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse() local
94 vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
111 vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
128 vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
145 vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c3)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
173 vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
187 vacc3x4567 = _mm_min_ps(vacc3x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
197 vacc3x4567 = _mm_max_ps(vacc3x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
201 _mm_storeu_ps(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
226 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
D4x8s4inc-minmax-neon.c70 float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon() local
90 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
107 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
124 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
141 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c3); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
164 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
177 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
187 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
191 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
217 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon()
D4x8inc-minmax-neon-dup-ld128.c70 float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128() local
94 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
110 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c1, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
126 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c2, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
142 vacc3x4567 = vmlaq_f32(vacc3x4567, va3c3, vb4567c3); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
161 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
174 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
184 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
188 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
214 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128()
/external/XNNPACK/src/f32-igemm/gen/
D4x8s4-minmax-wasmsimd-x86.c65 v128_t vacc3x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86() local
114 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
131 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
148 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
165 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
193 vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
208 vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
218 vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
222 wasm_v128_store(c3 + 4, vacc3x4567); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
243 vacc3x0123 = vacc3x4567; in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
D4x8-minmax-neon-lane-ld128.c66 float32x4_t vacc3x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128() local
110 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
122 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
134 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
146 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
165 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
182 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
192 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
196 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
217 vacc3x0123 = vacc3x4567; in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128()
D4x8s4-minmax-neonfma.c65 float32x4_t vacc3x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma() local
109 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
126 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
143 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
160 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
183 vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
200 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
210 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
214 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
235 vacc3x0123 = vacc3x4567; in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma()
D4x8s4-minmax-neon.c65 float32x4_t vacc3x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_4x8s4__neon() local
109 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c0); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
126 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c1); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
143 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c2); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
160 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c3); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
183 vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
200 vacc3x4567 = vminq_f32(vacc3x4567, vmax); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
210 vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
214 vst1q_f32(c3 + 4, vacc3x4567); in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()
235 vacc3x0123 = vacc3x4567; in xnn_f32_igemm_minmax_ukernel_4x8s4__neon()

123456789