Home
last modified time | relevance | path

Searched refs: vacc2x0123 (Results 1 – 25 of 399) sorted by relevance

Pages: 1 2 3 4 5 6 7 8 9 10 >> … 16

/external/XNNPACK/src/f32-gemm/gen/
D3x8s4-minmax-wasmsimd-x86.c58 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86() local
77 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
91 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
105 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
119 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
143 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
155 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
163 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
169 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
186 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
[all …]
D3x8-minmax-wasmsimd-x86-splat.c58 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat() local
80 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
93 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
106 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
119 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
142 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
154 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
162 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
168 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
185 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
[all …]
D3x8-minmax-sse-dup.c58 __m128 vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup() local
81 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
95 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
109 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
123 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
146 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
158 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
166 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
172 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
189 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup()
[all …]
D3x8-minmax-sse2-dup.c58 __m128 vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup() local
81 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
95 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
109 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
123 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
146 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
158 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
166 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
172 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
189 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup()
[all …]
D3x8s4-minmax-wasmsimd-arm.c60 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm() local
79 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
93 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
107 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
121 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
145 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
156 vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
163 vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
169 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
186 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm()
[all …]
D3x8s4-minmax-sse.c58 __m128 vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8s4__sse() local
77 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
91 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
105 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
119 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
143 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
155 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
163 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
169 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
186 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8s4__sse()
[all …]
D3x8-minmax-wasmsimd-arm-splat.c60 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat() local
82 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
95 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
108 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
121 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
144 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
155 vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
162 vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
168 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
185 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
[all …]
D3x8-minmax-wasmsimd-x86-loadsplat.c58 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat() local
77 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
88 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
96 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
102 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
119 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
123 vacc2x0123 = vacc2x4567; in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
132 *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
136 vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
145 *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0); in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
/external/XNNPACK/src/f32-gemm/gen-inc/
D3x8s4inc-minmax-wasmsimd-x86.c60 v128_t vacc2x0123 = wasm_v128_load(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86() local
79 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
93 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
107 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
121 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
145 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
157 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
165 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
171 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
188 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
[all …]
D3x8inc-minmax-wasmsimd-x86-splat.c60 v128_t vacc2x0123 = wasm_v128_load(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat() local
82 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
95 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
108 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
121 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
144 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
156 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
164 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
170 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
187 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat()
[all …]
D3x8s4inc-minmax-sse.c60 __m128 vacc2x0123 = _mm_load_ps(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse() local
79 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
93 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
107 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
121 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
145 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
157 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
165 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
171 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
188 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
[all …]
D3x8inc-minmax-wasmsimd-arm-splat.c62 v128_t vacc2x0123 = wasm_v128_load(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat() local
84 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
97 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
110 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
123 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
146 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
157 vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
164 vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
170 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
187 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat()
[all …]
D3x8s4inc-minmax-wasmsimd-arm.c62 v128_t vacc2x0123 = wasm_v128_load(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm() local
81 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
95 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
109 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
123 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
147 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
158 vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
165 vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
171 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
188 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm()
[all …]
D3x8inc-minmax-sse-dup.c60 __m128 vacc2x0123 = _mm_load_ps(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup() local
83 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
97 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
111 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
125 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
148 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
160 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
168 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
174 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
191 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup()
[all …]
D3x8inc-minmax-wasmsimd-x86-loadsplat.c60 v128_t vacc2x0123 = wasm_v128_load(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat() local
79 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
90 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
98 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
104 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
121 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
125 vacc2x0123 = vacc2x4567; in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
134 *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
138 vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
147 *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
D3x8inc-minmax-sse2-dup.c60 __m128 vacc2x0123 = _mm_load_ps(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup() local
83 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
97 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
111 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
125 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
148 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
160 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
168 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
174 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
191 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup()
[all …]
D3x8inc-minmax-wasmsimd-arm-loadsplat.c62 v128_t vacc2x0123 = wasm_v128_load(acc + 16); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat() local
81 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
91 vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
98 vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
104 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
121 wasm_v128_store(c2, vacc2x0123); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
125 vacc2x0123 = vacc2x4567; in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
134 *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
138 vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
147 *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0); in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat()
/external/XNNPACK/src/f32-igemm/gen/
D3x8-minmax-wasmsimd-x86-splat.c58 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat() local
99 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
112 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
125 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
138 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
163 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
174 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
182 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
188 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
202 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat()
[all …]
D3x8s4-minmax-wasmsimd-x86.c58 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86() local
96 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
110 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
124 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
138 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
164 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
175 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
183 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
189 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
203 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86()
[all …]
D3x8s4-minmax-wasmsimd-arm.c60 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm() local
98 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
112 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
126 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
140 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
166 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
176 vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
183 vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
189 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
203 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
[all …]
D3x8s4-minmax-sse.c58 __m128 vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8s4__sse() local
96 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
110 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
124 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
138 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
164 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
175 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
183 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
189 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
203 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8s4__sse()
[all …]
D3x8-minmax-wasmsimd-x86-loadsplat.c58 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat() local
98 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
108 vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
116 vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
122 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
136 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
140 vacc2x0123 = vacc2x4567; in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
149 *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
153 vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
162 *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat()
D3x8-minmax-wasmsimd-arm-splat.c60 v128_t vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat() local
101 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
114 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
127 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
140 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
165 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
175 vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
182 vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
188 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
202 wasm_v128_store(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat()
[all …]
D3x8-minmax-sse-dup.c58 __m128 vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup() local
100 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
114 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
128 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
142 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
167 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
178 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
186 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
192 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
206 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup()
[all …]
D3x8-minmax-sse2-dup.c58 __m128 vacc2x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup() local
100 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
114 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
128 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
142 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
167 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
178 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
186 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
192 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
206 _mm_storeu_ps(c2, vacc2x0123); in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup()
[all …]

Pages: 1 2 3 4 5 6 7 8 9 10 >> … 16