/external/XNNPACK/src/f32-ppmm/gen/
D | 4x8-minmax-wasmsimd-x86-splat.c | all in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat():
     55  v128_t vacc3x0123 = vacc0x0123;  (local)
     76  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3333, vb0123));
     89  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
     99  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    106  wasm_v128_store(c3, vacc3x0123);
    125  wasm_v128_store(c3, vacc3x0123);
    130  vacc3x0123 = vacc3x4567;
    141  *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0);
    146  vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3);
    157  *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0);
D | 4x8-minmax-wasmsimd-arm-splat.c | all in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat():
     57  v128_t vacc3x0123 = vacc0x0123;  (local)
     78  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3333, vb0123));
     90  vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin);
     99  vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax);
    106  wasm_v128_store(c3, vacc3x0123);
    125  wasm_v128_store(c3, vacc3x0123);
    130  vacc3x0123 = vacc3x4567;
    141  *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0);
    146  vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3);
    157  *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0);
D | 4x8-minmax-sse.c | all in xnn_f32_ppmm_minmax_ukernel_4x8__sse():
     55  __m128 vacc3x0123 = vacc0x0123;  (local)
     76  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3333, vb0123));
     89  vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
     99  vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    106  _mm_storeu_ps(c3, vacc3x0123);
    125  _mm_storeu_ps(c3, vacc3x0123);
    130  vacc3x0123 = vacc3x4567;
    141  _mm_storel_pi((__m64*) c3, vacc3x0123);
    146  vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
    157  _mm_store_ss(c3, vacc3x0123);
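The three ppmm variants above differ mainly in the clamp idiom: the arm-targeted WAsm kernel and the SSE kernel use native min/max, while the x86-targeted WAsm kernel emulates them with a compare plus bitselect, which lowers to cheaper instruction sequences on x86 engines than the NaN-propagating f32x4.min/max. A minimal C sketch of the two WAsm styles (illustrative helper names, not the XNNPACK source):

    #include <wasm_simd128.h>

    /* ARM style: rely on native f32x4.max/min. */
    static inline v128_t clamp_arm_style(v128_t vacc, v128_t vmin, v128_t vmax) {
      vacc = wasm_f32x4_max(vacc, vmin);
      return wasm_f32x4_min(vacc, vmax);
    }

    /* x86 style: take vmin where vacc < vmin, then keep vacc only where
       vacc <= vmax; equivalent for non-NaN inputs, cheaper to lower on x86. */
    static inline v128_t clamp_x86_style(v128_t vacc, v128_t vmin, v128_t vmax) {
      vacc = wasm_v128_bitselect(vmin, vacc, wasm_f32x4_lt(vacc, vmin));
      return wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));
    }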
/external/XNNPACK/src/f32-gemm/gen-inc/
D | 4x8s4inc-minmax-wasmsimd-x86.c | all in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86():
     68  v128_t vacc3x0123 = wasm_v128_load(acc + 24);  (local)
     90  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0));
    107  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1));
    124  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2));
    141  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3));
    169  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    183  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    193  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    200  wasm_v128_store(c3, vacc3x0123);
    221  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8inc-minmax-wasmsimd-x86-splat.c | all in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat():
     68  v128_t vacc3x0123 = wasm_v128_load(acc + 24);  (local)
     94  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0));
    110  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1));
    126  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2));
    142  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3));
    169  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    183  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    193  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    200  wasm_v128_store(c3, vacc3x0123);
    221  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8s4inc-minmax-wasmsimd-arm.c | all in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm():
     70  v128_t vacc3x0123 = wasm_v128_load(acc + 24);  (local)
     92  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0));
    109  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1));
    126  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2));
    143  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3));
    171  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    184  vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin);
    193  vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax);
    200  wasm_v128_store(c3, vacc3x0123);
    221  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8s4inc-minmax-sse.c | all in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse():
     68  __m128 vacc3x0123 = _mm_load_ps(acc + 24);  (local)
     90  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c0));
    107  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c1));
    124  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c2));
    141  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c3));
    169  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
    183  vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
    193  vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    200  _mm_storeu_ps(c3, vacc3x0123);
    221  _mm_storeu_ps(c3, vacc3x0123);
    [all …]
D | 4x8inc-minmax-wasmsimd-x86-loadsplat.c | all in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_loadsplat():
     68  v128_t vacc3x0123 = wasm_v128_load(acc + 24);  (local)
     90  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    103  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    113  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    120  wasm_v128_store(c3, vacc3x0123);
    141  wasm_v128_store(c3, vacc3x0123);
    146  vacc3x0123 = vacc3x4567;
    157  *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0);
    162  vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3);
    173  *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0);
D | 4x8inc-minmax-sse-dup.c | all in xnn_f32_gemminc_minmax_ukernel_4x8__sse_dup():
     68  __m128 vacc3x0123 = _mm_load_ps(acc + 24);  (local)
     95  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0));
    112  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1));
    129  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2));
    146  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3));
    173  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
    187  vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
    197  vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    204  _mm_storeu_ps(c3, vacc3x0123);
    225  _mm_storeu_ps(c3, vacc3x0123);
    [all …]
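What distinguishes the gen-inc kernels above from the plain gen ones is initialization: vacc3x0123 starts from a caller-provided accumulator buffer (acc + 24 is row 3, columns 0..3 of the 4x8 tile) instead of from the bias, so one GEMM can be computed in several passes over slices of K. A minimal sketch of that idea; accumulate_pass and its arguments are illustrative, not part of XNNPACK:

    #include <stddef.h>
    #include <wasm_simd128.h>

    /* One pass over a K slice for row 3, columns 0..3 of a 4x8 tile. */
    static void accumulate_pass(const float* a3, const float* b, size_t k,
                                float* acc /* 32 floats: 4 rows x 8 cols */) {
      v128_t vacc3x0123 = wasm_v128_load(acc + 24);   /* resume partial sums */
      for (size_t i = 0; i < k; i++) {
        const v128_t va3 = wasm_f32x4_splat(a3[i]);   /* broadcast A(3, i) */
        const v128_t vb0123 = wasm_v128_load(b + i * 8);
        vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
      }
      wasm_v128_store(acc + 24, vacc3x0123);          /* hand off to the next pass */
    }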
/external/XNNPACK/src/f32-igemm/gen/
D | 4x8s4-minmax-wasmsimd-x86.c | all in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86():
     64  v128_t vacc3x0123 = vacc0x0123;  (local)
    110  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0));
    127  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1));
    144  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2));
    161  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3));
    192  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    204  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    214  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    221  wasm_v128_store(c3, vacc3x0123);
    238  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8-minmax-wasmsimd-x86-splat.c | all in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat():
     64  v128_t vacc3x0123 = vacc0x0123;  (local)
    114  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0));
    130  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1));
    146  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2));
    162  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3));
    192  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    204  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    214  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    221  wasm_v128_store(c3, vacc3x0123);
    238  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8-minmax-wasmsimd-x86-loadsplat.c | all in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat():
     64  v128_t vacc3x0123 = vacc0x0123;  (local)
    113  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    124  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    134  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    141  wasm_v128_store(c3, vacc3x0123);
    158  wasm_v128_store(c3, vacc3x0123);
    163  vacc3x0123 = vacc3x4567;
    174  *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0);
    179  vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3);
    190  *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0);
D | 4x8-minmax-sse-dup.c | all in xnn_f32_igemm_minmax_ukernel_4x8__sse_dup():
     64  __m128 vacc3x0123 = vacc0x0123;  (local)
    115  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0));
    132  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1));
    149  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2));
    166  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3));
    196  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
    208  vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
    218  vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    225  _mm_storeu_ps(c3, vacc3x0123);
    242  _mm_storeu_ps(c3, vacc3x0123);
    [all …]
D | 4x8s4-minmax-wasmsimd-arm.c | all in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm():
     66  v128_t vacc3x0123 = vacc0x0123;  (local)
    112  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0));
    129  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1));
    146  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2));
    163  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3));
    194  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    205  vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin);
    214  vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax);
    221  wasm_v128_store(c3, vacc3x0123);
    238  wasm_v128_store(c3, vacc3x0123);
    [all …]
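In the 4x8s4 kernels above, the same va3 register feeds four consecutive multiplies (vb0123c0 through vb0123c3): after each step the A vector is rotated one lane, and the packed B panel is pre-arranged so each rotation lines up with the next column block. A minimal sketch of that inner step, assuming an 8-float B stride per sub-step; names and layout are illustrative, not the generated source (newer wasm_simd128.h spells the shuffle wasm_i32x4_shuffle):

    #include <wasm_simd128.h>

    /* One s4 macro-step: four k-iterations from a single 4-float load of A. */
    static v128_t s4_step(v128_t va3, const float* w, v128_t vacc3x0123) {
      for (int c = 0; c < 4; c++) {
        const v128_t vb0123 = wasm_v128_load(w + c * 8);   /* next column block of packed B */
        vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
        va3 = wasm_v32x4_shuffle(va3, va3, 1, 2, 3, 0);    /* rotate A lanes left by one */
      }
      return vacc3x0123;
    }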
/external/XNNPACK/src/f32-gemm/gen/
D | 4x8s4-minmax-wasmsimd-x86.c | all in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86():
     66  v128_t vacc3x0123 = vacc0x0123;  (local)
     88  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0));
    105  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1));
    122  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2));
    139  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3));
    167  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    181  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    191  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    198  wasm_v128_store(c3, vacc3x0123);
    219  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8-minmax-wasmsimd-x86-splat.c | all in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat():
     66  v128_t vacc3x0123 = vacc0x0123;  (local)
     92  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0));
    108  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1));
    124  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2));
    140  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3));
    167  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    181  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    191  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    198  wasm_v128_store(c3, vacc3x0123);
    219  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8s4-minmax-wasmsimd-arm.c | all in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm():
     68  v128_t vacc3x0123 = vacc0x0123;  (local)
     90  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c0));
    107  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c1));
    124  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c2));
    141  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123c3));
    169  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    182  vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin);
    191  vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax);
    198  wasm_v128_store(c3, vacc3x0123);
    219  wasm_v128_store(c3, vacc3x0123);
    [all …]
D | 4x8s4-minmax-sse.c | all in xnn_f32_gemm_minmax_ukernel_4x8s4__sse():
     66  __m128 vacc3x0123 = vacc0x0123;  (local)
     88  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c0));
    105  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c1));
    122  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c2));
    139  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123c3));
    167  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
    181  vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
    191  vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    198  _mm_storeu_ps(c3, vacc3x0123);
    219  _mm_storeu_ps(c3, vacc3x0123);
    [all …]
D | 4x8-minmax-wasmsimd-x86-loadsplat.c | all in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat():
     66  v128_t vacc3x0123 = vacc0x0123;  (local)
     88  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    101  vacc3x0123 = wasm_v128_bitselect(vmin, vacc3x0123, wasm_f32x4_lt(vacc3x0123, vmin));
    111  vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vmax, wasm_f32x4_le(vacc3x0123, vmax));
    118  wasm_v128_store(c3, vacc3x0123);
    139  wasm_v128_store(c3, vacc3x0123);
    144  vacc3x0123 = vacc3x4567;
    155  *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0);
    160  vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3);
    171  *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0);
D | 4x8-minmax-wasmsimd-arm-loadsplat.c | all in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat():
     68  v128_t vacc3x0123 = vacc0x0123;  (local)
     90  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    102  vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin);
    111  vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax);
    118  wasm_v128_store(c3, vacc3x0123);
    139  wasm_v128_store(c3, vacc3x0123);
    144  vacc3x0123 = vacc3x4567;
    155  *((double*) c3) = wasm_f64x2_extract_lane(vacc3x0123, 0);
    160  vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3);
    171  *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0);
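The loadsplat entries above and the splat variants elsewhere in this directory differ only in how a row-3 element of A is broadcast across lanes: splat kernels load four A elements at once and replicate lanes with shuffles, loadsplat kernels broadcast each scalar straight from memory. A sketch of the contrast; the intrinsic spellings follow the era of the indexed sources (wasm_v32x4_*), which newer wasm_simd128.h renames to wasm_i32x4_shuffle and wasm_v128_load32_splat:

    #include <wasm_simd128.h>

    /* "splat": one 4-float load of A covers four k-steps, lanes replicated per step. */
    static v128_t broadcast_splat(const float* a3) {
      const v128_t va3 = wasm_v128_load(a3);
      return wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0);   /* va3c0: lane 0 everywhere */
    }

    /* "loadsplat": one 32-bit load per k-step, replicated to all lanes. */
    static v128_t broadcast_loadsplat(const float* a3) {
      return wasm_v32x4_load_splat(a3);
    }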
D | 4x8-minmax-sse-load1.c | all in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1():
     66  __m128 vacc3x0123 = vacc0x0123;  (local)
     88  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
    101  vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
    111  vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    118  _mm_storeu_ps(c3, vacc3x0123);
    139  _mm_storeu_ps(c3, vacc3x0123);
    144  vacc3x0123 = vacc3x4567;
    155  _mm_storel_pi((__m64*) c3, vacc3x0123);
    160  vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
    171  _mm_store_ss(c3, vacc3x0123);
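Lines 144 through 171 of sse-load1 (and the matching lines in the other SSE entries) are the remainder path for fewer than four output columns: write two floats with a 64-bit store, move the high half down, then write one. A standalone rendering of that sequence; the function name and nc handling are illustrative:

    #include <stddef.h>
    #include <xmmintrin.h>

    static void store_tail_sse(float* c3, __m128 vacc3x0123, size_t nc) {
      if (nc & 2) {
        _mm_storel_pi((__m64*) c3, vacc3x0123);              /* columns 0..1 */
        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);  /* lanes 2..3 down to 0..1 */
        c3 += 2;
      }
      if (nc & 1) {
        _mm_store_ss(c3, vacc3x0123);                        /* last column */
      }
    }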
D | 4x8-minmax-neonfma-lane-ld128.c | all in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128():
     67  float32x4_t vacc3x0123 = vacc0x0123;  (local)
     84  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
     96  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
    108  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    120  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
    139  vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
    152  vacc3x0123 = vminq_f32(vacc3x0123, vmax);
    162  vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
    169  vst1q_f32(c3, vacc3x0123);
    191  vst1q_f32(c3, vacc3x0123); c3 += 4;
    [all …]
D | 4x8-minmax-neon-lane-ld128.c | all in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128():
     67  float32x4_t vacc3x0123 = vacc0x0123;  (local)
     84  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
     96  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
    108  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    120  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
    139  vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
    152  vacc3x0123 = vminq_f32(vacc3x0123, vmax);
    162  vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
    169  vst1q_f32(c3, vacc3x0123);
    191  vst1q_f32(c3, vacc3x0123); c3 += 4;
    [all …]
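Both NEON entries follow the ld128 lane pattern: one 128-bit load of A covers four k-steps, each multiplying a B vector by a single A lane with no separate broadcast, fused (vfmaq) in the neonfma variant and multiply-accumulate (vmlaq) otherwise. A sketch of one such macro-step, assuming AArch64 and an 8-float B stride per sub-step (illustrative, not the generated source):

    #include <arm_neon.h>

    /* Four k-iterations for row 3, columns 0..3, from one 128-bit load of A. */
    static float32x4_t neon_ld128_step(const float* a3, const float* w,
                                       float32x4_t vacc3x0123) {
      const float32x4_t va3 = vld1q_f32(a3);        /* A(3, k..k+3) */
      const float32x4_t vb0123c0 = vld1q_f32(w + 0);
      const float32x4_t vb0123c1 = vld1q_f32(w + 8);
      const float32x4_t vb0123c2 = vld1q_f32(w + 16);
      const float32x4_t vb0123c3 = vld1q_f32(w + 24);
      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
      return vacc3x0123;
    }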
D | 4x8-minmax-sse2-dup.c | all in xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup():
     66  __m128 vacc3x0123 = vacc0x0123;  (local)
     93  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0));
    110  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1));
    127  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2));
    144  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3));
    171  vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
    185  vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
    195  vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    202  _mm_storeu_ps(c3, vacc3x0123);
    223  _mm_storeu_ps(c3, vacc3x0123);
    [all …]
D | 4x8-minmax-wasmsimd-arm-splat.c | all in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat():
     68  v128_t vacc3x0123 = vacc0x0123;  (local)
     94  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c0, vb0123c0));
    110  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c1, vb0123c1));
    126  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2));
    142  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c3, vb0123c3));
    169  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3, vb0123));
    182  vacc3x0123 = wasm_f32x4_max(vacc3x0123, vmin);
    191  vacc3x0123 = wasm_f32x4_min(vacc3x0123, vmax);
    198  wasm_v128_store(c3, vacc3x0123);
    219  wasm_v128_store(c3, vacc3x0123);
    [all …]
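The WAsm SIMD counterpart of the SSE remainder path appears at the end of the loadsplat and ppmm entries above: a 64-bit lane extract stored through a double pointer, the 2, 3, 2, 3 shuffle to move the upper half down, then a single-lane extract. A standalone sketch; store_tail_wasmsimd is illustrative, and memcpy is used here in place of the kernels' direct type-punned double store:

    #include <stddef.h>
    #include <string.h>
    #include <wasm_simd128.h>

    static void store_tail_wasmsimd(float* c3, v128_t vacc3x0123, size_t nc) {
      if (nc & 2) {
        const double vlo = wasm_f64x2_extract_lane(vacc3x0123, 0);
        memcpy(c3, &vlo, sizeof(vlo));  /* columns 0..1 */
        vacc3x0123 = wasm_v32x4_shuffle(vacc3x0123, vacc3x0123, 2, 3, 2, 3);
        c3 += 2;
      }
      if (nc & 1) {
        *c3 = wasm_f32x4_extract_lane(vacc3x0123, 0);  /* last column */
      }
    }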