/external/XNNPACK/src/f32-gemm/gen/ |
D | 4x8s4-psimd.c | in xnn_f32_gemm_ukernel_4x8s4__psimd():
  76  psimd_f32 va2 = psimd_load_f32(a2);  (local)
  87  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0);
  91  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0);
  97  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  102  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  111  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1);
  115  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1);
  121  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  126  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  135  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2);
  [all …]
|
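Taken together, these hits show the shape of the s4 ("shift by 4") inner loop: one 4-element strip of an A row is loaded into va2 once and reused against four shifted column blocks of the packed weights (the c0, c1, c2, c3 suffixes), with the lanes of va2 rotated left by one element between blocks. On the psimd path the rotation is __builtin_shufflevector under Clang and __builtin_shuffle under GCC. The sketch below is a minimal reconstruction of that pattern for a single row, assuming the psimd.h header these kernels build against; variable names mirror the generated code, and the pointer bookkeeping around the k-block is reduced to a comment.

    /* Minimal sketch, assuming <psimd.h> from XNNPACK's psimd dependency.
       Covers one 4-element k-block for one A row against one 8-wide block
       of packed weights; names follow the generated kernel. */
    #include <psimd.h>

    static void s4_kblock_row2(const float* a2, const float* w,
                               psimd_f32* vacc2x0123, psimd_f32* vacc2x4567) {
      psimd_f32 va2 = psimd_load_f32(a2);   /* 4 consecutive elements of A row 2 */
      for (int c = 0; c < 4; c++) {         /* shifted column blocks c0..c3 */
        const psimd_f32 vb0123 = psimd_load_f32(w);      /* packed B, columns 0-3 */
        const psimd_f32 vb4567 = psimd_load_f32(w + 4);  /* packed B, columns 4-7 */
        w += 8;
        *vacc2x0123 = psimd_qfma_f32(*vacc2x0123, va2, vb0123);  /* acc += va2 * vb */
        *vacc2x4567 = psimd_qfma_f32(*vacc2x4567, va2, vb4567);
    #if defined(__clang__)
        va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);     /* rotate lanes left by 1 */
    #else
        va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
    #endif
      }
      /* the real kernel then advances a2 by 4 and moves on to the next k-block */
    }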
D | 6x8s4-psimd.c | in xnn_f32_gemm_ukernel_6x8s4__psimd():
  92  psimd_f32 va2 = psimd_load_f32(a2);  (local)
  107  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0);
  113  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0);
  121  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  128  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  139  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1);
  145  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1);
  153  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  160  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  171  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2);
  [all …]
|
D | 4x8s4-sse.c | in xnn_f32_gemm_ukernel_4x8s4__sse():
  76  __m128 va2 = _mm_loadu_ps(a2);  (local)
  87  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0));
  91  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c0));
  96  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  104  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1));
  108  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c1));
  113  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  121  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2));
  125  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c2));
  130  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
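The 4x8s4-sse.c variant is the same loop expressed without fused multiply-add: each accumulation is an _mm_mul_ps followed by _mm_add_ps, and the lane rotation is _mm_shuffle_ps with _MM_SHUFFLE(0, 3, 2, 1), which moves lane 1 into lane 0, lane 2 into lane 1, lane 3 into lane 2, and wraps lane 0 into lane 3. A one-step sketch follows; the helper name and layout are chosen here for illustration, not taken from the kernel.

    /* One column-block step of the SSE path (requires SSE, <xmmintrin.h>). */
    #include <xmmintrin.h>

    static __m128 s4_step_sse(__m128 vacc, __m128* va2, const float* w) {
      const __m128 vb = _mm_loadu_ps(w);                            /* 4 columns of packed B */
      vacc = _mm_add_ps(vacc, _mm_mul_ps(*va2, vb));                /* vacc += va2 * vb */
      *va2 = _mm_shuffle_ps(*va2, *va2, _MM_SHUFFLE(0, 3, 2, 1));   /* rotate A lanes left by 1 */
      return vacc;
    }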
D | 4x8s4-neonfma.c | in xnn_f32_gemm_ukernel_4x8s4__neonfma():
  74  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  83  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
  87  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
  92  va2 = vextq_f32(va2, va2, 1);
  100  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
  104  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
  109  va2 = vextq_f32(va2, va2, 1);
  117  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
  121  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
  126  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
D | 4x8s4-neon.c | in xnn_f32_gemm_ukernel_4x8s4__neon():
  74  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  83  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0);
  87  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0);
  92  va2 = vextq_f32(va2, va2, 1);
  100  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1);
  104  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1);
  109  va2 = vextq_f32(va2, va2, 1);
  117  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
  121  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2);
  126  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
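The NEON pair above differs only in the multiply-accumulate intrinsic: 4x8s4-neon.c uses vmlaq_f32 (separate multiply and add) while 4x8s4-neonfma.c uses vfmaq_f32 (fused), and both rotate the A lanes with vextq_f32(va2, va2, 1). Below is a sketch of one step; the __aarch64__ guard is purely illustrative, since the real kernels are separate files selected at build or dispatch time.

    #include <arm_neon.h>

    static float32x4_t s4_step_neon(float32x4_t vacc, float32x4_t* va2, const float* w) {
      const float32x4_t vb = vld1q_f32(w);    /* 4 columns of packed B */
    #if defined(__aarch64__)
      vacc = vfmaq_f32(vacc, *va2, vb);       /* fused:   vacc += va2 * vb */
    #else
      vacc = vmlaq_f32(vacc, *va2, vb);       /* unfused: vacc += va2 * vb */
    #endif
      *va2 = vextq_f32(*va2, *va2, 1);        /* rotate A lanes left by 1 */
      return vacc;
    }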
D | 3x16s4-fma3-broadcast.c | in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast():
  68  __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);  (local)
  77  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
  80  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
  84  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  91  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
  94  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
  98  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  105  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
  108  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
  112  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
D | 4x16s4-fma3-broadcast.c | in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast():
  76  __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);  (local)
  87  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
  91  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
  96  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  104  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
  108  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
  113  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  121  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
  125  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
  130  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
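The fma3-broadcast variants widen the same idea to 256 bits: _mm256_broadcast_ps duplicates the 4-float strip of A into both 128-bit halves, each _mm256_fmadd_ps then covers 8 columns of B (vb01234567 / vb89ABCDEF), and _mm256_permute_ps with _MM_SHUFFLE(0, 3, 2, 1) rotates each half independently, which is the same per-strip rotation the narrower kernels perform. A hedged sketch, requiring AVX and FMA3, with an illustrative helper name:

    #include <immintrin.h>

    static __m256 s4_step_fma3(__m256 vacc, __m256* va2, const float* w) {
      const __m256 vb = _mm256_loadu_ps(w);                       /* 8 columns of packed B */
      vacc = _mm256_fmadd_ps(*va2, vb, vacc);                     /* vacc += va2 * vb (fused) */
      *va2 = _mm256_permute_ps(*va2, _MM_SHUFFLE(0, 3, 2, 1));    /* rotate each 128-bit half left by 1 lane */
      return vacc;
    }

    /* The strip itself is loaded once per k-block, e.g.
       __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);
       which duplicates the same 4 floats into both halves of the register. */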
D | 6x8s4-neon.c | in xnn_f32_gemm_ukernel_6x8s4__neon():
  90  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  101  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0);
  107  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0);
  114  va2 = vextq_f32(va2, va2, 1);
  124  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1);
  130  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1);
  137  va2 = vextq_f32(va2, va2, 1);
  147  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
  153  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2);
  160  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
D | 6x8s4-neonfma.c | in xnn_f32_gemm_ukernel_6x8s4__neonfma():
  90  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  101  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
  107  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
  114  va2 = vextq_f32(va2, va2, 1);
  124  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
  130  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
  137  va2 = vextq_f32(va2, va2, 1);
  147  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
  153  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
  160  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 4x8s4-psimd.c | in xnn_f32_igemm_ukernel_4x8s4__psimd():
  98  psimd_f32 va2 = psimd_load_f32(a2);  (local)
  109  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0);
  113  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0);
  119  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  124  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  133  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1);
  137  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1);
  143  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  148  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  157  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2);
  [all …]
|
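The f32-igemm copies of these kernels reuse the s4 inner loop unchanged; what differs is where a2 comes from. An indirect GEMM reads its A-row pointers from an indirection buffer (one pointer per output pixel and kernel tap) instead of striding a dense A matrix, which is also why the first hit sits deeper in each file. The helper below is a hypothetical sketch of that pointer resolution; the names a, zero, and a_offset are assumptions for illustration and not the actual xnn_f32_igemm_* interface.

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical sketch: resolve one A-row pointer from an indirection
       buffer, leaving a shared zero row alone and offsetting real rows. */
    static const float* igemm_resolve_row(const float* const* a, size_t row,
                                          const float* zero, size_t a_offset) {
      const float* a_row = a[row];
      if (a_row != zero) {
        a_row = (const float*) ((uintptr_t) a_row + a_offset);
      }
      return a_row;  /* the s4 loop then loads va2 from a_row exactly as above */
    }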
D | 6x8s4-psimd.c | in xnn_f32_igemm_ukernel_6x8s4__psimd():
  120  psimd_f32 va2 = psimd_load_f32(a2);  (local)
  135  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0);
  141  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0);
  149  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  156  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  167  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1);
  173  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1);
  181  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  188  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  199  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2);
  [all …]
|
D | 4x8s4-neon.c | in xnn_f32_igemm_ukernel_4x8s4__neon():
  95  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  104  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0);
  108  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0);
  113  va2 = vextq_f32(va2, va2, 1);
  121  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1);
  125  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1);
  130  va2 = vextq_f32(va2, va2, 1);
  138  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
  142  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2);
  147  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
D | 4x8s4-neonfma.c | in xnn_f32_igemm_ukernel_4x8s4__neonfma():
  95  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  104  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
  108  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
  113  va2 = vextq_f32(va2, va2, 1);
  121  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
  125  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
  130  va2 = vextq_f32(va2, va2, 1);
  138  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
  142  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
  147  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
D | 3x16s4-fma3-broadcast.c | in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast():
  87  __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);  (local)
  96  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
  99  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
  103  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  110  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
  113  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
  117  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  124  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
  127  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
  131  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
D | 4x8s4-sse.c | in xnn_f32_igemm_ukernel_4x8s4__sse():
  98  __m128 va2 = _mm_loadu_ps(a2);  (local)
  109  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0));
  113  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c0));
  118  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  126  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1));
  130  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c1));
  135  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  143  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2));
  147  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c2));
  152  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
D | 4x16s4-fma3-broadcast.c | in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast():
  98  __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);  (local)
  109  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
  113  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
  118  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  126  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
  130  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
  135  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  143  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
  147  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
  152  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 4x8s4-psimd.c | in xnn_f32_gemminc_ukernel_4x8s4__psimd():
  78  psimd_f32 va2 = psimd_load_f32(a2);  (local)
  89  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0);
  93  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0);
  99  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  104  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  113  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1);
  117  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1);
  123  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  128  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  137  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2);
  [all …]
|
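The gen-inc ("gemminc") copies again share the inner loop verbatim; the structural difference is at accumulator setup. On the assumption that these kernels resume from previously computed partial results, the accumulators are initialized from an acc buffer rather than from the bias row of the packed weights. The sketch below illustrates that setup only; the helper name and the acc layout are assumptions, and the psimd path is used to match the entry above.

    #include <psimd.h>

    /* Illustrative sketch of accumulator setup in the gen-inc variants:
       resume row 2's accumulators from a buffer of partial sums. */
    static void gemminc_init_row2(const float* acc,
                                  psimd_f32* vacc2x0123, psimd_f32* vacc2x4567) {
      *vacc2x0123 = psimd_load_f32(acc + 0);
      *vacc2x4567 = psimd_load_f32(acc + 4);
    }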
D | 6x8s4-psimd.c | in xnn_f32_gemminc_ukernel_6x8s4__psimd():
  94  psimd_f32 va2 = psimd_load_f32(a2);  (local)
  109  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0);
  115  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0);
  123  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  130  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  141  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1);
  147  vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1);
  155  va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0);
  162  va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 });
  173  vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2);
  [all …]
|
D | 4x8s4-neon.c | in xnn_f32_gemminc_ukernel_4x8s4__neon():
  76  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  85  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0);
  89  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0);
  94  va2 = vextq_f32(va2, va2, 1);
  102  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1);
  106  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1);
  111  va2 = vextq_f32(va2, va2, 1);
  119  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
  123  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2);
  128  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
D | 4x8s4-neonfma.c | in xnn_f32_gemminc_ukernel_4x8s4__neonfma():
  76  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  85  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
  89  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
  94  va2 = vextq_f32(va2, va2, 1);
  102  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
  106  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
  111  va2 = vextq_f32(va2, va2, 1);
  119  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
  123  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
  128  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
D | 4x8s4-sse.c | in xnn_f32_gemminc_ukernel_4x8s4__sse():
  78  __m128 va2 = _mm_loadu_ps(a2);  (local)
  89  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0));
  93  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c0));
  98  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  106  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1));
  110  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c1));
  115  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  123  vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2));
  127  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c2));
  132  va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
D | 3x16s4-fma3-broadcast.c | in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast():
  70  __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);  (local)
  79  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
  82  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
  86  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  93  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
  96  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
  100  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  107  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
  110  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
  114  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
D | 4x16s4-fma3-broadcast.c | in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast():
  78  __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);  (local)
  89  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
  93  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
  98  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  106  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
  110  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
  115  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  123  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
  127  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
  132  va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
  [all …]
|
D | 6x8s4-neonfma.c | in xnn_f32_gemminc_ukernel_6x8s4__neonfma():
  92  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  103  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
  109  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
  116  va2 = vextq_f32(va2, va2, 1);
  126  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
  132  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
  139  va2 = vextq_f32(va2, va2, 1);
  149  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
  155  vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
  162  va2 = vextq_f32(va2, va2, 1);
  [all …]
|
D | 6x8s4-neon.c | in xnn_f32_gemminc_ukernel_6x8s4__neon():
  92  float32x4_t va2 = vld1q_f32(a2); a2 += 4;  (local)
  103  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0);
  109  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0);
  116  va2 = vextq_f32(va2, va2, 1);
  126  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1);
  132  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1);
  139  va2 = vextq_f32(va2, va2, 1);
  149  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
  155  vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2);
  162  va2 = vextq_f32(va2, va2, 1);
  [all …]
|