Searched refs: va2 (Results 1 – 25 of 229), sorted by relevance

/external/XNNPACK/src/f32-gemm/gen/
4x8s4-psimd.c
76 psimd_f32 va2 = psimd_load_f32(a2); in xnn_f32_gemm_ukernel_4x8s4__psimd() local
87 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemm_ukernel_4x8s4__psimd()
91 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemm_ukernel_4x8s4__psimd()
97 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_4x8s4__psimd()
102 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemm_ukernel_4x8s4__psimd()
111 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemm_ukernel_4x8s4__psimd()
115 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemm_ukernel_4x8s4__psimd()
121 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_4x8s4__psimd()
126 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemm_ukernel_4x8s4__psimd()
135 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemm_ukernel_4x8s4__psimd()
[all …]
6x8s4-psimd.c
92 psimd_f32 va2 = psimd_load_f32(a2); in xnn_f32_gemm_ukernel_6x8s4__psimd() local
107 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
113 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
121 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
128 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemm_ukernel_6x8s4__psimd()
139 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemm_ukernel_6x8s4__psimd()
145 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemm_ukernel_6x8s4__psimd()
153 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
160 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemm_ukernel_6x8s4__psimd()
171 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemm_ukernel_6x8s4__psimd()
[all …]
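
The psimd hits above all show the same s4 ("shift by 4 elements") inner step: va2 feeds QFMA updates against one column block of packed weights, then has its lanes rotated left by one so the next column block pairs with the next row element. The generated source carries both the Clang and GCC shuffle spellings behind a preprocessor check, which is why each entry matches both __builtin_shufflevector and __builtin_shuffle. A minimal sketch of one step, using the psimd.h names from the hits (the helper itself is hypothetical):

    #include <psimd.h>

    /* One s4 step for one row: multiply-accumulate one column block,
     * then rotate the row vector's lanes left by one. */
    static psimd_f32 s4_step(psimd_f32 vacc, psimd_f32 *va, psimd_f32 vb) {
      vacc = psimd_qfma_f32(vacc, *va, vb);
    #if defined(__clang__)
      *va = __builtin_shufflevector(*va, *va, 1, 2, 3, 0);
    #else
      *va = __builtin_shuffle(*va, *va, (psimd_s32) { 1, 2, 3, 0 });
    #endif
      return vacc;
    }
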
4x8s4-sse.c
76 __m128 va2 = _mm_loadu_ps(a2); in xnn_f32_gemm_ukernel_4x8s4__sse() local
87 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0)); in xnn_f32_gemm_ukernel_4x8s4__sse()
91 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c0)); in xnn_f32_gemm_ukernel_4x8s4__sse()
96 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_4x8s4__sse()
104 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1)); in xnn_f32_gemm_ukernel_4x8s4__sse()
108 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c1)); in xnn_f32_gemm_ukernel_4x8s4__sse()
113 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_4x8s4__sse()
121 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2)); in xnn_f32_gemm_ukernel_4x8s4__sse()
125 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c2)); in xnn_f32_gemm_ukernel_4x8s4__sse()
130 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_4x8s4__sse()
[all …]
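
The SSE kernel expresses the same step without fused multiply-add: a separate _mm_mul_ps/_mm_add_ps pair, with the rotation written as _mm_shuffle_ps using the selector _MM_SHUFFLE(0, 3, 2, 1), i.e. lanes 1, 2, 3, 0. A sketch (hypothetical helper name):

    #include <xmmintrin.h>

    /* One SSE s4 step: multiply, add, then rotate lanes left by one. */
    static __m128 s4_step_sse(__m128 vacc, __m128 *va, __m128 vb) {
      vacc = _mm_add_ps(vacc, _mm_mul_ps(*va, vb));
      *va = _mm_shuffle_ps(*va, *va, _MM_SHUFFLE(0, 3, 2, 1));
      return vacc;
    }
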
4x8s4-neonfma.c
74 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemm_ukernel_4x8s4__neonfma() local
83 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
87 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
92 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
100 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
104 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
109 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
117 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
121 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
126 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_4x8s4__neonfma()
[all …]
4x8s4-neon.c
74 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemm_ukernel_4x8s4__neon() local
83 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemm_ukernel_4x8s4__neon()
87 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemm_ukernel_4x8s4__neon()
92 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_4x8s4__neon()
100 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemm_ukernel_4x8s4__neon()
104 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemm_ukernel_4x8s4__neon()
109 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_4x8s4__neon()
117 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemm_ukernel_4x8s4__neon()
121 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemm_ukernel_4x8s4__neon()
126 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_4x8s4__neon()
[all …]
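
The -neonfma and -neon files differ only in the accumulate intrinsic: vfmaq_f32 (a single fused multiply-add, requiring VFPv4/ARMv8) versus vmlaq_f32 (separate multiply and add). Both rotate the row with vextq_f32(va2, va2, 1), which extracts four consecutive lanes starting at index 1, i.e. a rotate left by one. One sketch covers both (hypothetical helper):

    #include <arm_neon.h>

    /* One NEON s4 step; the FMA branch matches the -neonfma kernels,
     * the fallback matches the -neon kernels. */
    static float32x4_t s4_step_neon(float32x4_t vacc, float32x4_t *va, float32x4_t vb) {
    #if defined(__ARM_FEATURE_FMA)
      vacc = vfmaq_f32(vacc, *va, vb);
    #else
      vacc = vmlaq_f32(vacc, *va, vb);
    #endif
      *va = vextq_f32(*va, *va, 1);  /* rotate lanes left by one */
      return vacc;
    }
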
3x16s4-fma3-broadcast.c
68 __m256 va2 = _mm256_broadcast_ps((const __m128*) a2); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() local
77 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
80 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
84 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
91 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
94 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
98 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
105 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
108 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
112 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
[all …]
4x16s4-fma3-broadcast.c
76 __m256 va2 = _mm256_broadcast_ps((const __m128*) a2); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() local
87 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
91 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
96 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
104 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
108 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
113 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
121 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
125 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
130 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
[all …]
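
The x16 FMA3 kernels widen the idea to AVX: _mm256_broadcast_ps copies the four row elements into both 128-bit halves of a __m256, each step issues two _mm256_fmadd_ps against the 8-wide column blocks (lanes 0–7 and 8–F of the 16-wide output row), and _mm256_permute_ps with the same (0, 3, 2, 1) selector rotates each half in place. Sketch (hypothetical helper):

    #include <immintrin.h>

    /* One FMA3 s4 step for a 16-wide output row. */
    static void s4_step_fma3(__m256 *vacc_lo, __m256 *vacc_hi,
                             __m256 *va, __m256 vb_lo, __m256 vb_hi) {
      *vacc_lo = _mm256_fmadd_ps(*va, vb_lo, *vacc_lo);
      *vacc_hi = _mm256_fmadd_ps(*va, vb_hi, *vacc_hi);
      *va = _mm256_permute_ps(*va, _MM_SHUFFLE(0, 3, 2, 1));  /* per-half rotate */
    }
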
6x8s4-neon.c
90 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemm_ukernel_6x8s4__neon() local
101 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemm_ukernel_6x8s4__neon()
107 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemm_ukernel_6x8s4__neon()
114 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_6x8s4__neon()
124 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemm_ukernel_6x8s4__neon()
130 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemm_ukernel_6x8s4__neon()
137 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_6x8s4__neon()
147 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemm_ukernel_6x8s4__neon()
153 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemm_ukernel_6x8s4__neon()
160 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_6x8s4__neon()
[all …]
6x8s4-neonfma.c
90 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemm_ukernel_6x8s4__neonfma() local
101 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
107 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
114 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
124 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
130 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
137 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
147 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
153 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
160 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
[all …]
/external/XNNPACK/src/f32-igemm/gen/
4x8s4-psimd.c
98 psimd_f32 va2 = psimd_load_f32(a2); in xnn_f32_igemm_ukernel_4x8s4__psimd() local
109 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_igemm_ukernel_4x8s4__psimd()
113 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_igemm_ukernel_4x8s4__psimd()
119 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_4x8s4__psimd()
124 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_igemm_ukernel_4x8s4__psimd()
133 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_igemm_ukernel_4x8s4__psimd()
137 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_igemm_ukernel_4x8s4__psimd()
143 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_4x8s4__psimd()
148 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_igemm_ukernel_4x8s4__psimd()
157 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_igemm_ukernel_4x8s4__psimd()
[all …]
6x8s4-psimd.c
120 psimd_f32 va2 = psimd_load_f32(a2); in xnn_f32_igemm_ukernel_6x8s4__psimd() local
135 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
141 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
149 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
156 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_igemm_ukernel_6x8s4__psimd()
167 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_igemm_ukernel_6x8s4__psimd()
173 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_igemm_ukernel_6x8s4__psimd()
181 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
188 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_igemm_ukernel_6x8s4__psimd()
199 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_igemm_ukernel_6x8s4__psimd()
[all …]
4x8s4-neon.c
95 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_igemm_ukernel_4x8s4__neon() local
104 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_igemm_ukernel_4x8s4__neon()
108 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_igemm_ukernel_4x8s4__neon()
113 va2 = vextq_f32(va2, va2, 1); in xnn_f32_igemm_ukernel_4x8s4__neon()
121 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_igemm_ukernel_4x8s4__neon()
125 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_igemm_ukernel_4x8s4__neon()
130 va2 = vextq_f32(va2, va2, 1); in xnn_f32_igemm_ukernel_4x8s4__neon()
138 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_igemm_ukernel_4x8s4__neon()
142 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_igemm_ukernel_4x8s4__neon()
147 va2 = vextq_f32(va2, va2, 1); in xnn_f32_igemm_ukernel_4x8s4__neon()
[all …]
4x8s4-neonfma.c
95 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_igemm_ukernel_4x8s4__neonfma() local
104 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
108 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
113 va2 = vextq_f32(va2, va2, 1); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
121 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
125 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
130 va2 = vextq_f32(va2, va2, 1); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
138 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
142 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
147 va2 = vextq_f32(va2, va2, 1); in xnn_f32_igemm_ukernel_4x8s4__neonfma()
[all …]
3x16s4-fma3-broadcast.c
87 __m256 va2 = _mm256_broadcast_ps((const __m128*) a2); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() local
96 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
99 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
103 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
110 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
113 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
117 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
124 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
127 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
131 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
[all …]
4x8s4-sse.c
98 __m128 va2 = _mm_loadu_ps(a2); in xnn_f32_igemm_ukernel_4x8s4__sse() local
109 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0)); in xnn_f32_igemm_ukernel_4x8s4__sse()
113 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c0)); in xnn_f32_igemm_ukernel_4x8s4__sse()
118 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_4x8s4__sse()
126 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1)); in xnn_f32_igemm_ukernel_4x8s4__sse()
130 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c1)); in xnn_f32_igemm_ukernel_4x8s4__sse()
135 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_4x8s4__sse()
143 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2)); in xnn_f32_igemm_ukernel_4x8s4__sse()
147 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c2)); in xnn_f32_igemm_ukernel_4x8s4__sse()
152 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_4x8s4__sse()
[all …]
4x16s4-fma3-broadcast.c
98 __m256 va2 = _mm256_broadcast_ps((const __m128*) a2); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() local
109 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
113 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
118 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
126 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
130 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
135 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
143 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
147 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
152 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
[all …]
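
The f32-igemm hits are the same inner loops at shifted line numbers; the difference lies outside the matched lines, in how the row pointer a2 is obtained. The indirect GEMM used for convolution reads row pointers from an indirection buffer rather than striding through a dense matrix. A hedged sketch of that setup, following the usual XNNPACK igemm pattern (names illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Resolve one row pointer: `a` is the indirection buffer, `zero` a
     * shared all-zero row used for padding, `a_offset` a byte offset
     * applied to real (non-padding) rows. */
    static const float* resolve_row(const float **a, size_t i,
                                    const float *zero, size_t a_offset) {
      const float *ai = a[i];
      if (ai != zero) {
        ai = (const float*) ((uintptr_t) ai + a_offset);
      }
      return ai;  /* the same s4 loop then runs over this pointer */
    }
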
/external/XNNPACK/src/f32-gemm/gen-inc/
4x8s4-psimd.c
78 psimd_f32 va2 = psimd_load_f32(a2); in xnn_f32_gemminc_ukernel_4x8s4__psimd() local
89 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
93 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
99 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
104 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
113 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
117 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
123 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
128 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
137 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemminc_ukernel_4x8s4__psimd()
[all …]
6x8s4-psimd.c
94 psimd_f32 va2 = psimd_load_f32(a2); in xnn_f32_gemminc_ukernel_6x8s4__psimd() local
109 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
115 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
123 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
130 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
141 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
147 vacc2x4567 = psimd_qfma_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
155 va2 = __builtin_shufflevector(va2, va2, 1, 2, 3, 0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
162 va2 = __builtin_shuffle(va2, va2, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
173 vacc2x0123 = psimd_qfma_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
[all …]
4x8s4-neon.c
76 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemminc_ukernel_4x8s4__neon() local
85 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemminc_ukernel_4x8s4__neon()
89 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemminc_ukernel_4x8s4__neon()
94 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_4x8s4__neon()
102 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemminc_ukernel_4x8s4__neon()
106 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemminc_ukernel_4x8s4__neon()
111 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_4x8s4__neon()
119 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemminc_ukernel_4x8s4__neon()
123 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemminc_ukernel_4x8s4__neon()
128 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_4x8s4__neon()
[all …]
4x8s4-neonfma.c
76 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemminc_ukernel_4x8s4__neonfma() local
85 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
89 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
94 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
102 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
106 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
111 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
119 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
123 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
128 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_4x8s4__neonfma()
[all …]
4x8s4-sse.c
78 __m128 va2 = _mm_loadu_ps(a2); in xnn_f32_gemminc_ukernel_4x8s4__sse() local
89 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c0)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
93 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c0)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
98 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
106 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c1)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
110 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c1)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
115 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
123 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123c2)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
127 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567c2)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
132 va2 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_4x8s4__sse()
[all …]
3x16s4-fma3-broadcast.c
70 __m256 va2 = _mm256_broadcast_ps((const __m128*) a2); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() local
79 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
82 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
86 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
93 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
96 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
100 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
107 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
110 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
114 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
[all …]
4x16s4-fma3-broadcast.c
78 __m256 va2 = _mm256_broadcast_ps((const __m128*) a2); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() local
89 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
93 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
98 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
106 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
110 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
115 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
123 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
127 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
132 va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
[all …]
6x8s4-neonfma.c
92 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemminc_ukernel_6x8s4__neonfma() local
103 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
109 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
116 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
126 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
132 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
139 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
149 vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
155 vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
162 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
[all …]
6x8s4-neon.c
92 float32x4_t va2 = vld1q_f32(a2); a2 += 4; in xnn_f32_gemminc_ukernel_6x8s4__neon() local
103 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0); in xnn_f32_gemminc_ukernel_6x8s4__neon()
109 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0); in xnn_f32_gemminc_ukernel_6x8s4__neon()
116 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
126 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
132 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
139 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
149 vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2); in xnn_f32_gemminc_ukernel_6x8s4__neon()
155 vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2); in xnn_f32_gemminc_ukernel_6x8s4__neon()
162 va2 = vextq_f32(va2, va2, 1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
[all …]
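
Likewise, the gen-inc/ (gemminc) hits match the gen/ loop bodies line for line; the variants appear to differ only in accumulator initialization, which lets a long K dimension be split across several passes. A sketch for the NEON case, assuming the usual buffer names (w for packed weights, acc for partial sums):

    /* gemm: accumulators start from the packed bias at the head of w. */
    /*   float32x4_t vacc2x0123 = vld1q_f32(w); w += 4;                 */
    /* gemminc: accumulators resume from partial sums of a prior pass. */
    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
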
