/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 1x16s4-fma3-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() local 58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 66 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 74 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 82 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 98 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 110 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 114 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() 124 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast()
|
D | 1x16-fma3-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_ukernel_1x16__fma3_broadcast() local 58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16__fma3_broadcast() 65 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_ukernel_1x16__fma3_broadcast() 69 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_ukernel_1x16__fma3_broadcast() 73 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16__fma3_broadcast() 83 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_ukernel_1x16__fma3_broadcast()
|
D | 1x16-avx-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_ukernel_1x16__avx_broadcast() local 58 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemminc_ukernel_1x16__avx_broadcast() 65 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_ukernel_1x16__avx_broadcast() 69 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_ukernel_1x16__avx_broadcast() 73 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_1x16__avx_broadcast() 83 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_ukernel_1x16__avx_broadcast()
|
D | 3x16s4-fma3-broadcast.c | 57 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() local 80 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 94 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 108 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 122 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 146 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 158 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 166 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 178 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 194 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
|
D | 4x16s4-fma3-broadcast.c | 63 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() local 91 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 108 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 125 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 142 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 170 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 184 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 194 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 210 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 229 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 1x16s4-fma3-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() local 56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 64 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 72 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 80 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 96 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 104 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 108 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 112 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() 122 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast()
|
D | 3x16s4-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 78 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 92 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 120 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 144 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 156 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 164 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() [all …]
|
D | 1x16-fma3-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_1x16__fma3_broadcast() local 56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16__fma3_broadcast() 63 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_1x16__fma3_broadcast() 67 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_1x16__fma3_broadcast() 71 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16__fma3_broadcast() 81 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_1x16__fma3_broadcast()
|
D | 1x16-avx-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_1x16__avx_broadcast() local 56 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemm_ukernel_1x16__avx_broadcast() 63 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_1x16__avx_broadcast() 67 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_1x16__avx_broadcast() 71 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_1x16__avx_broadcast() 81 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_1x16__avx_broadcast()
|
D | 4x16s4-fma3-broadcast.c | 61 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() local 63 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 89 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 123 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 140 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 168 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 182 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() [all …]
|
D | 5x16s4-fma3-broadcast.c | 67 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() local 69 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 71 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 73 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 75 __m256 vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 100 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 120 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 140 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 160 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 192 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 3x16-avx-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_3x16__avx_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16__avx_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16__avx_broadcast() 78 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemm_ukernel_3x16__avx_broadcast() 89 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_3x16__avx_broadcast() 97 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_3x16__avx_broadcast() 109 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16__avx_broadcast() 125 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16__avx_broadcast()
|
D | 3x16-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_3x16__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16__fma3_broadcast() 78 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16__fma3_broadcast() 89 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_3x16__fma3_broadcast() 97 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_3x16__fma3_broadcast() 109 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_3x16__fma3_broadcast() 125 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_3x16__fma3_broadcast()
|
D | 4x16-fma3-broadcast.c | 61 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() local 63 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() 89 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() 102 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() 112 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() 128 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16__fma3_broadcast() 147 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__fma3_broadcast()
|
D | 4x16-avx-broadcast.c | 61 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_ukernel_4x16__avx_broadcast() local 63 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__avx_broadcast() 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__avx_broadcast() 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__avx_broadcast() 89 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemm_ukernel_4x16__avx_broadcast() 102 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_ukernel_4x16__avx_broadcast() 112 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_ukernel_4x16__avx_broadcast() 128 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_ukernel_4x16__avx_broadcast() 147 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_ukernel_4x16__avx_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 1x16s4-fma3-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() local 69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 77 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 85 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 93 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 109 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 119 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 123 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 127 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() 136 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast()
|
D | 3x16s4-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 97 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 111 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 125 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 139 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 163 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 177 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 185 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() [all …]
|
D | 1x16-fma3-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_1x16__fma3_broadcast() local 69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16__fma3_broadcast() 77 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_1x16__fma3_broadcast() 81 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_1x16__fma3_broadcast() 85 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16__fma3_broadcast() 94 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_1x16__fma3_broadcast()
|
D | 1x16-avx-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_1x16__avx_broadcast() local 69 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_igemm_ukernel_1x16__avx_broadcast() 77 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_1x16__avx_broadcast() 81 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_1x16__avx_broadcast() 85 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_1x16__avx_broadcast() 94 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_1x16__avx_broadcast()
|
D | 4x16s4-fma3-broadcast.c | 59 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() local 61 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 63 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 111 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 128 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 145 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 162 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 190 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 206 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() [all …]
|
D | 5x16s4-fma3-broadcast.c | 63 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() local 65 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 67 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 69 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 71 __m256 vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 125 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 145 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 165 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 185 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 217 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 3x16-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_3x16__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16__fma3_broadcast() 95 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16__fma3_broadcast() 109 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_3x16__fma3_broadcast() 117 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_3x16__fma3_broadcast() 129 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16__fma3_broadcast() 142 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16__fma3_broadcast()
|
D | 3x16-avx-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_3x16__avx_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16__avx_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16__avx_broadcast() 95 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_igemm_ukernel_3x16__avx_broadcast() 109 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_3x16__avx_broadcast() 117 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_3x16__avx_broadcast() 129 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_3x16__avx_broadcast() 142 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_3x16__avx_broadcast()
|
D | 4x16-fma3-broadcast.c | 59 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() local 61 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() 63 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() 108 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() 125 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() 135 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() 151 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16__fma3_broadcast() 166 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__fma3_broadcast()
|
D | 4x16-avx-broadcast.c | 59 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_ukernel_4x16__avx_broadcast() local 61 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__avx_broadcast() 63 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__avx_broadcast() 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__avx_broadcast() 108 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_igemm_ukernel_4x16__avx_broadcast() 125 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_ukernel_4x16__avx_broadcast() 135 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_ukernel_4x16__avx_broadcast() 151 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_ukernel_4x16__avx_broadcast() 166 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_ukernel_4x16__avx_broadcast()
|