/external/XNNPACK/src/f16-gemm/gen/ |
D | 6x8-neonfp16arith-ld64.c | 77 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() local 96 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 110 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 120 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 134 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 144 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 158 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 168 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 182 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 203 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() [all …]
|
D | 8x8-neonfp16arith-ld64.c | 89 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() local 112 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 130 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 142 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 160 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 172 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 190 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 202 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 220 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 245 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 5x16s4-fma3-broadcast.c | 76 __m256 vacc4x01234567 = _mm256_load_ps(acc + 64); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() local 101 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 121 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 141 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 161 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 193 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 209 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 221 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 229 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 254 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 5x16-avx-broadcast.c | 76 __m256 vacc4x01234567 = _mm256_load_ps(acc + 64); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() local 101 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() 116 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() 128 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() 136 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() 161 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() 167 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() 179 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast() 191 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_ukernel_5x16__avx_broadcast()
|
D | 5x16-fma3-broadcast.c | 76 __m256 vacc4x01234567 = _mm256_load_ps(acc + 64); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() local 101 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() 116 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() 128 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() 136 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() 161 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() 167 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() 179 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast() 191 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_ukernel_5x16__fma3_broadcast()
|
D | 5x8-avx-broadcast.c | 72 __m256 vacc4x01234567 = _mm256_load_ps(acc + 32); in xnn_f32_gemminc_ukernel_5x8__avx_broadcast() local 95 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemminc_ukernel_5x8__avx_broadcast() 105 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_ukernel_5x8__avx_broadcast() 112 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_ukernel_5x8__avx_broadcast() 115 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x8__avx_broadcast() 134 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_ukernel_5x8__avx_broadcast() 146 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_ukernel_5x8__avx_broadcast()
|
D | 5x8-fma3-broadcast.c | 72 __m256 vacc4x01234567 = _mm256_load_ps(acc + 32); in xnn_f32_gemminc_ukernel_5x8__fma3_broadcast() local 95 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x8__fma3_broadcast() 105 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_ukernel_5x8__fma3_broadcast() 112 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_ukernel_5x8__fma3_broadcast() 115 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x8__fma3_broadcast() 134 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_ukernel_5x8__fma3_broadcast() 146 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_ukernel_5x8__fma3_broadcast()
|
D | 6x8-fma3-broadcast.c | 78 __m256 vacc4x01234567 = _mm256_load_ps(acc + 32); in xnn_f32_gemminc_ukernel_6x8__fma3_broadcast() local 104 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemminc_ukernel_6x8__fma3_broadcast() 115 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_ukernel_6x8__fma3_broadcast() 123 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_ukernel_6x8__fma3_broadcast() 129 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_6x8__fma3_broadcast() 150 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_ukernel_6x8__fma3_broadcast() 164 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_ukernel_6x8__fma3_broadcast()
|
D | 6x8-avx-broadcast.c | 78 __m256 vacc4x01234567 = _mm256_load_ps(acc + 32); in xnn_f32_gemminc_ukernel_6x8__avx_broadcast() local 104 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemminc_ukernel_6x8__avx_broadcast() 115 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_ukernel_6x8__avx_broadcast() 123 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_ukernel_6x8__avx_broadcast() 129 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_ukernel_6x8__avx_broadcast() 150 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_ukernel_6x8__avx_broadcast() 164 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_ukernel_6x8__avx_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 5x16s4-fma3-broadcast.c | 74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() local 99 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 119 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 139 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 159 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 191 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 207 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 219 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 227 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 252 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 5x8-fma3-broadcast.c | 70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_ukernel_5x8__fma3_broadcast() local 93 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemm_ukernel_5x8__fma3_broadcast() 103 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_ukernel_5x8__fma3_broadcast() 110 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_ukernel_5x8__fma3_broadcast() 113 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x8__fma3_broadcast() 132 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_ukernel_5x8__fma3_broadcast() 144 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_ukernel_5x8__fma3_broadcast()
|
D | 5x16-avx-broadcast.c | 74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_ukernel_5x16__avx_broadcast() local 99 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemm_ukernel_5x16__avx_broadcast() 114 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_ukernel_5x16__avx_broadcast() 126 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_ukernel_5x16__avx_broadcast() 134 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16__avx_broadcast() 159 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16__avx_broadcast() 165 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemm_ukernel_5x16__avx_broadcast() 177 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_ukernel_5x16__avx_broadcast() 189 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_ukernel_5x16__avx_broadcast()
|
D | 5x16-fma3-broadcast.c | 74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() local 99 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() 114 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() 126 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() 134 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() 159 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() 165 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() 177 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_ukernel_5x16__fma3_broadcast() 189 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_ukernel_5x16__fma3_broadcast()
|
D | 5x8-avx-broadcast.c | 70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_ukernel_5x8__avx_broadcast() local 93 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemm_ukernel_5x8__avx_broadcast() 103 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_ukernel_5x8__avx_broadcast() 110 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_ukernel_5x8__avx_broadcast() 113 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_5x8__avx_broadcast() 132 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_ukernel_5x8__avx_broadcast() 144 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_ukernel_5x8__avx_broadcast()
|
D | 6x8-avx-broadcast.c | 76 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_ukernel_6x8__avx_broadcast() local 102 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemm_ukernel_6x8__avx_broadcast() 113 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_ukernel_6x8__avx_broadcast() 121 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_ukernel_6x8__avx_broadcast() 127 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_6x8__avx_broadcast() 148 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_ukernel_6x8__avx_broadcast() 162 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_ukernel_6x8__avx_broadcast()
|
D | 6x8-fma3-broadcast.c | 76 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_ukernel_6x8__fma3_broadcast() local 102 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemm_ukernel_6x8__fma3_broadcast() 113 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_ukernel_6x8__fma3_broadcast() 121 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_ukernel_6x8__fma3_broadcast() 127 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_ukernel_6x8__fma3_broadcast() 148 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_ukernel_6x8__fma3_broadcast() 162 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_ukernel_6x8__fma3_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x16s4-fma3-broadcast.c | 70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() local 124 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 144 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 164 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 184 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 216 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 234 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 246 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 254 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 274 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 5x16-fma3-broadcast.c | 70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() local 128 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() 140 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() 152 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() 160 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() 180 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() 186 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() 198 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_5x16__fma3_broadcast() 210 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_5x16__fma3_broadcast()
|
D | 5x16-avx-broadcast.c | 70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_5x16__avx_broadcast() local 128 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_igemm_ukernel_5x16__avx_broadcast() 140 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_5x16__avx_broadcast() 152 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_5x16__avx_broadcast() 160 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16__avx_broadcast() 180 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16__avx_broadcast() 186 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_igemm_ukernel_5x16__avx_broadcast() 198 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_5x16__avx_broadcast() 210 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_5x16__avx_broadcast()
|
D | 5x8-fma3-broadcast.c | 66 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_5x8__fma3_broadcast() local 118 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_ukernel_5x8__fma3_broadcast() 129 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_5x8__fma3_broadcast() 136 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_5x8__fma3_broadcast() 139 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x8__fma3_broadcast() 153 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_5x8__fma3_broadcast() 165 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_5x8__fma3_broadcast()
|
D | 5x8-avx-broadcast.c | 66 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_5x8__avx_broadcast() local 118 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_igemm_ukernel_5x8__avx_broadcast() 129 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_5x8__avx_broadcast() 136 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_5x8__avx_broadcast() 139 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_5x8__avx_broadcast() 153 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_5x8__avx_broadcast() 165 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_5x8__avx_broadcast()
|
D | 6x8-fma3-broadcast.c | 70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_6x8__fma3_broadcast() local 130 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_ukernel_6x8__fma3_broadcast() 142 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_6x8__fma3_broadcast() 150 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_6x8__fma3_broadcast() 156 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_6x8__fma3_broadcast() 171 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_6x8__fma3_broadcast() 185 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_6x8__fma3_broadcast()
|
D | 6x8-avx-broadcast.c | 70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_6x8__avx_broadcast() local 130 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_igemm_ukernel_6x8__avx_broadcast() 142 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_6x8__avx_broadcast() 150 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_6x8__avx_broadcast() 156 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_6x8__avx_broadcast() 171 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_6x8__avx_broadcast() 185 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_6x8__avx_broadcast()
|
D | 7x8-fma3-broadcast.c | 74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_7x8__fma3_broadcast() local 142 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_ukernel_7x8__fma3_broadcast() 155 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_7x8__fma3_broadcast() 164 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_7x8__fma3_broadcast() 173 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_7x8__fma3_broadcast() 189 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_7x8__fma3_broadcast() 205 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_7x8__fma3_broadcast()
|
D | 7x8-avx-broadcast.c | 74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_ukernel_7x8__avx_broadcast() local 142 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_igemm_ukernel_7x8__avx_broadcast() 155 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_ukernel_7x8__avx_broadcast() 164 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_ukernel_7x8__avx_broadcast() 173 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_ukernel_7x8__avx_broadcast() 189 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_ukernel_7x8__avx_broadcast() 205 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_ukernel_7x8__avx_broadcast()
|