/external/XNNPACK/src/f32-gemm/gen/

D | 3x8s4-minmax-wasmsimd-x86.c | in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86():
     58  v128_t vacc2x0123 = vacc0x0123;  (local)
     77  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0));
     91  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1));
    105  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2));
    119  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3));
    143  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    155  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
    163  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    169  wasm_v128_store(c2, vacc2x0123);
    186  wasm_v128_store(c2, vacc2x0123);
    [all …]

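The x86-tuned WASM SIMD kernels in this listing clamp with wasm_v128_bitselect plus an explicit compare rather than wasm_f32x4_min/max, which lower to slower NaN-aware sequences on x86. A minimal sketch of the two clamp steps, with names as in the matches above:

    #include <wasm_simd128.h>

    // Clamp vacc to [vmin, vmax] the way the *-wasmsimd-x86 kernels do:
    // bitselect(a, b, m) takes bits from a where m is set, else from b.
    static inline v128_t clamp_minmax_x86(v128_t vacc, v128_t vmin, v128_t vmax) {
      // Lanes where vacc < vmin are replaced by vmin.
      vacc = wasm_v128_bitselect(vmin, vacc, wasm_f32x4_lt(vacc, vmin));
      // Lanes where vacc <= vmax keep vacc; the rest become vmax.
      vacc = wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));
      return vacc;
    }
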
D | 3x8-minmax-wasmsimd-x86-splat.c | in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat():
     58  v128_t vacc2x0123 = vacc0x0123;  (local)
     80  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0));
     93  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1));
    106  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    119  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3));
    142  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    154  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
    162  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    168  wasm_v128_store(c2, vacc2x0123);
    185  wasm_v128_store(c2, vacc2x0123);
    [all …]

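In the splat variants, each loaded A vector is broadcast lane by lane (va2c0 through va2c3 above) before multiplying a different packed-B block. A sketch of one such k-step, assuming the 8-floats-per-k-step packed-B layout these 3x8 kernels use:

    #include <wasm_simd128.h>

    // One k-step of the splat scheme for row 2, columns 0..3 (sketch).
    static inline v128_t splat_step_row2(v128_t vacc2x0123, const float* a2,
                                         const float* w /* packed B */) {
      const v128_t va2 = wasm_v128_load(a2);
      // Broadcast lane 0; lanes 1..3 follow the same pattern.
      const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0);
      const v128_t vb0123c0 = wasm_v128_load(w);
      vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0));
      // ...then va2c1 * w[8..11], va2c2 * w[16..19], va2c3 * w[24..27].
      return vacc2x0123;
    }
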
D | 3x8-minmax-sse-dup.c | in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup():
     58  __m128 vacc2x0123 = vacc0x0123;  (local)
     81  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
     95  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
    109  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2));
    123  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3));
    146  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    158  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    166  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    172  _mm_storeu_ps(c2, vacc2x0123);
    189  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

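The sse_dup variant forms its per-lane broadcasts (va2c0000 through va2c3333 above) with _mm_shuffle_ps on the loaded A vector; a sketch of the lane-0 case:

    #include <xmmintrin.h>

    // Broadcast lane 0 of va2 for the dup scheme (sketch); lanes 1..3
    // use _MM_SHUFFLE(1,1,1,1), (2,2,2,2), and (3,3,3,3) respectively.
    static inline __m128 dup_lane0_sse(__m128 va2) {
      return _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 0, 0, 0));  // va2c0000
    }
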
D | 3x8-minmax-sse2-dup.c | in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup():
     58  __m128 vacc2x0123 = vacc0x0123;  (local)
     81  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
     95  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
    109  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2));
    123  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3));
    146  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    158  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    166  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    172  _mm_storeu_ps(c2, vacc2x0123);
    189  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

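The sse2_dup entry has the same structure and match line numbers as sse_dup; the difference lies in how the broadcast is produced. Presumably, as in other XNNPACK sse2_dup kernels, it routes through the integer shuffle pshufd, which can be cheaper than shufps on some x86 cores; the exact broadcast lines are elided from the matches, so treat this as an assumed pattern:

    #include <emmintrin.h>

    // SSE2-style lane-0 broadcast via integer shuffle (assumed pattern).
    static inline __m128 dup_lane0_sse2(__m128 va2) {
      return _mm_castsi128_ps(
          _mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(0, 0, 0, 0)));
    }
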
D | 3x8s4-minmax-wasmsimd-arm.c | in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm():
     60  v128_t vacc2x0123 = vacc0x0123;  (local)
     79  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0));
     93  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1));
    107  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2));
    121  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3));
    145  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    156  vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin);
    163  vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax);
    169  wasm_v128_store(c2, vacc2x0123);
    186  wasm_v128_store(c2, vacc2x0123);
    [all …]

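The ARM-tuned WASM SIMD kernels clamp directly with wasm_f32x4_max/min, which map to single NEON instructions; compare the bitselect sequence in the x86 entries above:

    #include <wasm_simd128.h>

    // Clamp as the *-wasmsimd-arm kernels do it: plain vector min/max.
    static inline v128_t clamp_minmax_arm(v128_t vacc, v128_t vmin, v128_t vmax) {
      vacc = wasm_f32x4_max(vacc, vmin);  // apply lower bound
      vacc = wasm_f32x4_min(vacc, vmax);  // apply upper bound
      return vacc;
    }
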
D | 3x8s4-minmax-sse.c | in xnn_f32_gemm_minmax_ukernel_3x8s4__sse():
     58  __m128 vacc2x0123 = vacc0x0123;  (local)
     77  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0));
     91  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1));
    105  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2));
    119  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c3));
    143  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    155  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    163  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    169  _mm_storeu_ps(c2, vacc2x0123);
    186  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

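In the s4 ("shift by 4") kernels there is no per-lane broadcast at all: the same full A vector multiplies four successive B blocks (vb0123c0 through vb0123c3 above), and between sub-steps the A vector is rotated one lane so every a-value eventually lines up with every B block. The rotation lines are elided from the match list, so the sketch below is an assumption modeled on XNNPACK's SSE s4 kernels:

    #include <xmmintrin.h>

    // Rotate the four lanes of va left by one: (a0,a1,a2,a3) -> (a1,a2,a3,a0).
    static inline __m128 rotate_lanes_s4(__m128 va) {
      return _mm_shuffle_ps(va, va, _MM_SHUFFLE(0, 3, 2, 1));
    }
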
D | 3x8-minmax-wasmsimd-arm-splat.c | in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat():
     60  v128_t vacc2x0123 = vacc0x0123;  (local)
     82  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0));
     95  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1));
    108  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    121  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3));
    144  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    155  vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin);
    162  vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax);
    168  wasm_v128_store(c2, vacc2x0123);
    185  wasm_v128_store(c2, vacc2x0123);
    [all …]

D | 3x8-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat():
     58  v128_t vacc2x0123 = vacc0x0123;  (local)
     77  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
     88  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
     96  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    102  wasm_v128_store(c2, vacc2x0123);
    119  wasm_v128_store(c2, vacc2x0123);
    123  vacc2x0123 = vacc2x4567;
    132  *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0);
    136  vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3);
    145  *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0);

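The loadsplat entries also show the column tail: after full 4-float stores, the remainder is written as a 2-float store through a double extract, a lane slide, then a 1-float store. A sketch assembled from the matched lines (nc is the assumed name for the remaining column count, and the c2 += 2 step is inferred):

    #include <stddef.h>
    #include <wasm_simd128.h>

    // Store the last nc (< 4) columns of one output row (sketch).
    static void store_row2_tail(float* c2, v128_t vacc2x0123, size_t nc) {
      if (nc & 2) {
        *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0);  // 2 floats
        // Slide lanes 2,3 down so the next store sees them at lane 0.
        vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3);
        c2 += 2;
      }
      if (nc & 1) {
        *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0);  // final float
      }
    }
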
/external/XNNPACK/src/f32-gemm/gen-inc/

D | 3x8s4inc-minmax-wasmsimd-x86.c | in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86():
     60  v128_t vacc2x0123 = wasm_v128_load(acc + 16);  (local)
     79  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0));
     93  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1));
    107  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2));
    121  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3));
    145  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    157  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
    165  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    171  wasm_v128_store(c2, vacc2x0123);
    188  wasm_v128_store(c2, vacc2x0123);
    [all …]

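The gen-inc/ (gemminc) twins differ from the gen/ kernels only at accumulator setup: instead of copying row 0's bias-initialized vector (vacc2x0123 = vacc0x0123), every row vector reloads from a caller-supplied acc buffer, which lets the K dimension be split across calls. The acc + 16 offset in these matches is consistent with an 8-float row stride (row 2, columns 0..3):

    #include <wasm_simd128.h>

    // gemm-style init: row 2 starts as a copy of the bias vector.
    static inline v128_t init_row2_gemm(v128_t vacc0x0123) {
      return vacc0x0123;
    }

    // gemminc-style init: row 2 resumes from the accumulation buffer.
    static inline v128_t init_row2_gemminc(const float* acc) {
      return wasm_v128_load(acc + 16);  // rows are 8 floats apart
    }
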
D | 3x8inc-minmax-wasmsimd-x86-splat.c | in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat():
     60  v128_t vacc2x0123 = wasm_v128_load(acc + 16);  (local)
     82  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0));
     95  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1));
    108  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    121  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3));
    144  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    156  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
    164  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    170  wasm_v128_store(c2, vacc2x0123);
    187  wasm_v128_store(c2, vacc2x0123);
    [all …]

D | 3x8s4inc-minmax-sse.c | in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse():
     60  __m128 vacc2x0123 = _mm_load_ps(acc + 16);  (local)
     79  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0));
     93  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1));
    107  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2));
    121  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c3));
    145  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    157  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    165  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    171  _mm_storeu_ps(c2, vacc2x0123);
    188  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

D | 3x8inc-minmax-wasmsimd-arm-splat.c | in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat():
     62  v128_t vacc2x0123 = wasm_v128_load(acc + 16);  (local)
     84  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0));
     97  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1));
    110  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    123  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3));
    146  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    157  vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin);
    164  vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax);
    170  wasm_v128_store(c2, vacc2x0123);
    187  wasm_v128_store(c2, vacc2x0123);
    [all …]

D | 3x8s4inc-minmax-wasmsimd-arm.c | in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_arm():
     62  v128_t vacc2x0123 = wasm_v128_load(acc + 16);  (local)
     81  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0));
     95  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1));
    109  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2));
    123  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3));
    147  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    158  vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin);
    165  vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax);
    171  wasm_v128_store(c2, vacc2x0123);
    188  wasm_v128_store(c2, vacc2x0123);
    [all …]

D | 3x8inc-minmax-sse-dup.c | in xnn_f32_gemminc_minmax_ukernel_3x8__sse_dup():
     60  __m128 vacc2x0123 = _mm_load_ps(acc + 16);  (local)
     83  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
     97  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
    111  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2));
    125  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3));
    148  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    160  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    168  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    174  _mm_storeu_ps(c2, vacc2x0123);
    191  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

D | 3x8inc-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_loadsplat():
     60  v128_t vacc2x0123 = wasm_v128_load(acc + 16);  (local)
     79  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
     90  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
     98  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    104  wasm_v128_store(c2, vacc2x0123);
    121  wasm_v128_store(c2, vacc2x0123);
    125  vacc2x0123 = vacc2x4567;
    134  *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0);
    138  vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3);
    147  *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0);

D | 3x8inc-minmax-sse2-dup.c | in xnn_f32_gemminc_minmax_ukernel_3x8__sse2_dup():
     60  __m128 vacc2x0123 = _mm_load_ps(acc + 16);  (local)
     83  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
     97  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
    111  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2));
    125  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3));
    148  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    160  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    168  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    174  _mm_storeu_ps(c2, vacc2x0123);
    191  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

D | 3x8inc-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_loadsplat():
     62  v128_t vacc2x0123 = wasm_v128_load(acc + 16);  (local)
     81  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
     91  vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin);
     98  vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax);
    104  wasm_v128_store(c2, vacc2x0123);
    121  wasm_v128_store(c2, vacc2x0123);
    125  vacc2x0123 = vacc2x4567;
    134  *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0);
    138  vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3);
    147  *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0);

/external/XNNPACK/src/f32-igemm/gen/

D | 3x8-minmax-wasmsimd-x86-splat.c | in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat():
     58  v128_t vacc2x0123 = vacc0x0123;  (local)
     99  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0));
    112  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1));
    125  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    138  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3));
    163  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    174  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
    182  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    188  wasm_v128_store(c2, vacc2x0123);
    202  wasm_v128_store(c2, vacc2x0123);
    [all …]

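The igemm kernels compute the same accumulator updates but fetch A through an indirection buffer of row pointers, substituting a shared zero row for out-of-bounds taps; the pointer setup itself is elided from these matches, so the sketch below is an assumption modeled on XNNPACK's usual igemm convention (zero and a_offset are hypothetical parameter names here):

    #include <stddef.h>
    #include <stdint.h>

    // Rebase one indirect A pointer unless it is the shared zero row (sketch).
    static inline const float* igemm_rebase(const float* aN, const float* zero,
                                            size_t a_offset) {
      return (aN == zero) ? aN : (const float*) ((uintptr_t) aN + a_offset);
    }
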
D | 3x8s4-minmax-wasmsimd-x86.c | in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86():
     58  v128_t vacc2x0123 = vacc0x0123;  (local)
     96  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0));
    110  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1));
    124  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2));
    138  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3));
    164  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    175  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
    183  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    189  wasm_v128_store(c2, vacc2x0123);
    203  wasm_v128_store(c2, vacc2x0123);
    [all …]

D | 3x8s4-minmax-wasmsimd-arm.c | in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm():
     60  v128_t vacc2x0123 = vacc0x0123;  (local)
     98  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c0));
    112  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c1));
    126  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c2));
    140  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123c3));
    166  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    176  vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin);
    183  vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax);
    189  wasm_v128_store(c2, vacc2x0123);
    203  wasm_v128_store(c2, vacc2x0123);
    [all …]

D | 3x8s4-minmax-sse.c | in xnn_f32_igemm_minmax_ukernel_3x8s4__sse():
     58  __m128 vacc2x0123 = vacc0x0123;  (local)
     96  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0));
    110  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1));
    124  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2));
    138  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c3));
    164  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    175  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    183  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    189  _mm_storeu_ps(c2, vacc2x0123);
    203  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

D | 3x8-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat():
     58  v128_t vacc2x0123 = vacc0x0123;  (local)
     98  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    108  vacc2x0123 = wasm_v128_bitselect(vmin, vacc2x0123, wasm_f32x4_lt(vacc2x0123, vmin));
    116  vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vmax, wasm_f32x4_le(vacc2x0123, vmax));
    122  wasm_v128_store(c2, vacc2x0123);
    136  wasm_v128_store(c2, vacc2x0123);
    140  vacc2x0123 = vacc2x4567;
    149  *((double*) c2) = wasm_f64x2_extract_lane(vacc2x0123, 0);
    153  vacc2x0123 = wasm_v32x4_shuffle(vacc2x0123, vacc2x0123, 2, 3, 2, 3);
    162  *c2 = wasm_f32x4_extract_lane(vacc2x0123, 0);

D | 3x8-minmax-wasmsimd-arm-splat.c | in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat():
     60  v128_t vacc2x0123 = vacc0x0123;  (local)
    101  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c0, vb0123c0));
    114  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c1, vb0123c1));
    127  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    140  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c3, vb0123c3));
    165  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2, vb0123));
    175  vacc2x0123 = wasm_f32x4_max(vacc2x0123, vmin);
    182  vacc2x0123 = wasm_f32x4_min(vacc2x0123, vmax);
    188  wasm_v128_store(c2, vacc2x0123);
    202  wasm_v128_store(c2, vacc2x0123);
    [all …]

D | 3x8-minmax-sse-dup.c | in xnn_f32_igemm_minmax_ukernel_3x8__sse_dup():
     58  __m128 vacc2x0123 = vacc0x0123;  (local)
    100  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
    114  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
    128  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2));
    142  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3));
    167  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    178  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    186  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    192  _mm_storeu_ps(c2, vacc2x0123);
    206  _mm_storeu_ps(c2, vacc2x0123);
    [all …]

D | 3x8-minmax-sse2-dup.c | in xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup():
     58  __m128 vacc2x0123 = vacc0x0123;  (local)
    100  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
    114  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
    128  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2));
    142  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3));
    167  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
    178  vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    186  vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    192  _mm_storeu_ps(c2, vacc2x0123);
    206  _mm_storeu_ps(c2, vacc2x0123);
    [all …]