Home
last modified time | relevance | path

Searched refs:vacc3x0123 (Results 1 – 25 of 305) sorted by relevance

12345678910>>...13

/external/XNNPACK/src/f32-ppmm/gen/
D4x8-minmax-wasmsimd-x86-splat.c55 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat() local
76 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3333, vb0123)); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
89 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
99 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
106 wasm_v128_store(c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
125 wasm_v128_store(c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
130 vacc3x0123 = vacc3x4567; in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
141 *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
146 vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
157 *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat()
D4x8-minmax-wasmsimd-arm-splat.c57 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat() local
78 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3333, vb0123)); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
90 vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
99 vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
106 wasm_v128_store(c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
125 wasm_v128_store(c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
130 vacc3x0123 = vacc3x4567; in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
141 *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
146 vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
157 *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0); in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat()
D4x8-minmax-sse.c55 __m128 vacc3x0123 = vacc0x0123; in xnn_f32_ppmm_minmax_ukernel_4x8__sse() local
76 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3333, vb0123)); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
89 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
99 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
106 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
125 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
130 vacc3x0123 = vacc3x4567; in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
141 _mm_storel_pi((__m64*) c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
146 vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
157 _mm_store_ss(c3, vacc3x0123); in xnn_f32_ppmm_minmax_ukernel_4x8__sse()
/external/XNNPACK/src/f32-gemm/gen-inc/
D4x8s4inc-minmax-wasmsimd-x86.c68 v128_t vacc3x0123 = wasm_v128_load(acc + 24); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86() local
90 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
107 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
124 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
141 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
169 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
183 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
193 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
200 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
221 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86()
[all …]
D4x8inc-minmax-wasmsimd-x86-splat.c68 v128_t vacc3x0123 = wasm_v128_load(acc + 24); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat() local
94 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
110 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
126 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
142 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
169 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
183 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
193 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
200 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
221 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat()
[all …]
D4x8s4inc-minmax-wasmsimd-arm.c70 v128_t vacc3x0123 = wasm_v128_load(acc + 24); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm() local
92 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
109 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
126 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
143 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
171 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
184 vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
193 vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
200 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
221 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm()
[all …]
D4x8s4inc-minmax-sse.c68 __m128 vacc3x0123 = _mm_load_ps(acc + 24); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse() local
90 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
107 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
124 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
141 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
169 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123)); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
183 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
193 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
200 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
221 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse()
[all …]
D4x8inc-minmax-wasmsimd-x86-loadsplat.c68 v128_t vacc3x0123 = wasm_v128_load(acc + 24); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat() local
90 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
103 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
113 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
120 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
141 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
146 vacc3x0123 = vacc3x4567; in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
157 *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
162 vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
173 *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0); in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
D4x8inc-minmax-sse-dup.c68 __m128 vacc3x0123 = _mm_load_ps(acc + 24); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup() local
95 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
112 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
129 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
146 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3)); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
173 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123)); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
187 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
197 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
204 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
225 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup()
[all …]
/external/XNNPACK/src/f32-igemm/gen/
D4x8s4-minmax-wasmsimd-x86.c64 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86() local
110 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
127 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
144 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
161 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
192 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
204 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
214 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
221 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
238 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86()
[all …]
D4x8-minmax-wasmsimd-x86-splat.c64 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat() local
114 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
130 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
146 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
162 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
192 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
204 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
214 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
221 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
238 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
[all …]
D4x8-minmax-wasmsimd-x86-loadsplat.c64 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat() local
113 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
124 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
134 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
141 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
158 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
163 vacc3x0123 = vacc3x4567; in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
174 *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
179 vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
190 *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0); in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
D4x8-minmax-sse-dup.c64 __m128 vacc3x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup() local
115 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
132 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
149 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
166 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
196 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123)); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
208 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
218 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
225 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
242 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup()
[all …]
D4x8s4-minmax-wasmsimd-arm.c66 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm() local
112 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
129 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
146 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
163 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
194 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
205 vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
214 vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
221 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
238 wasm_v128_store(c3, vacc3x0123); in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm()
[all …]
/external/XNNPACK/src/f32-gemm/gen/
D4x8s4-minmax-wasmsimd-x86.c66 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86() local
88 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
105 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
122 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
139 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
167 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
181 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
191 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
198 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
219 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86()
[all …]
D4x8-minmax-wasmsimd-x86-splat.c66 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat() local
92 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
108 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
124 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
140 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
167 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
181 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
191 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
198 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
219 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat()
[all …]
D4x8s4-minmax-wasmsimd-arm.c68 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm() local
90 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
107 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
124 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
141 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
169 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
182 vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
191 vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
198 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
219 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm()
[all …]
D4x8s4-minmax-sse.c66 __m128 vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8s4__sse() local
88 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
105 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
122 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
139 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
167 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
181 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
191 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
198 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
219 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8s4__sse()
[all …]
D4x8-minmax-wasmsimd-x86-loadsplat.c66 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat() local
88 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
101 vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
111 vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
118 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
139 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
144 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
155 *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
160 vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
171 *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat()
D4x8-minmax-wasmsimd-arm-loadsplat.c68 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat() local
90 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
102 vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
111 vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
118 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
139 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
144 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
155 *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
160 vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
171 *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat()
D4x8-minmax-sse-load1.c66 __m128 vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1() local
88 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
101 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
111 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
118 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
139 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
144 vacc3x0123 = vacc3x4567; in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
155 _mm_storel_pi((__m64*) c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
160 vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
171 _mm_store_ss(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
D4x8-minmax-neonfma-lane-ld128.c67 float32x4_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128() local
84 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
96 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
108 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
120 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
139 vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
152 vacc3x0123 = vminq_f32(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
162 vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
169 vst1q_f32(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
191 vst1q_f32(c3, vacc3x0123); c3 += 4; in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128()
[all …]
D4x8-minmax-neon-lane-ld128.c67 float32x4_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128() local
84 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
96 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
108 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
120 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
139 vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
152 vacc3x0123 = vminq_f32(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
162 vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
169 vst1q_f32(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
191 vst1q_f32(c3, vacc3x0123); c3 += 4; in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128()
[all …]
D4x8-minmax-sse2-dup.c66 __m128 vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup() local
93 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
110 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
127 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
144 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
171 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
185 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
195 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
202 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
223 _mm_storeu_ps(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup()
[all …]
D4x8-minmax-wasmsimd-arm-splat.c68 v128_t vacc3x0123 = vacc0x0123; in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat() local
94 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
110 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
126 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
142 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
169 vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123)); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
182 vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
191 vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
198 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
219 wasm_v128_store(c3, vacc3x0123); in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat()
[all …]

12345678910>>...13