
Searched refs: va5 (Results 1 – 25 of 72), sorted by relevance


/external/XNNPACK/src/f32-gemm/gen-inc/
6x8s4-psimd.c
100 psimd_f32 va5 = psimd_load_f32(a5); in xnn_f32_gemminc_ukernel_6x8s4__psimd() local
112 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
118 vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
126 va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
133 va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
144 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
150 vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
158 va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
165 va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
176 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_ukernel_6x8s4__psimd()
[all …]
6x8s4-neonfma.c
95 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_ukernel_6x8s4__neonfma() local
106 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
112 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
119 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
129 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
135 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
142 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
152 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
158 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
165 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_6x8s4__neonfma()
[all …]
6x8s4-neon.c
95 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_ukernel_6x8s4__neon() local
106 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_ukernel_6x8s4__neon()
112 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_ukernel_6x8s4__neon()
119 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
129 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
135 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
142 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
152 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_ukernel_6x8s4__neon()
158 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_ukernel_6x8s4__neon()
165 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_6x8s4__neon()
[all …]
8x8s4-neon.c
111 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_ukernel_8x8s4__neon() local
124 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_ukernel_8x8s4__neon()
132 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_ukernel_8x8s4__neon()
141 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_8x8s4__neon()
153 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_ukernel_8x8s4__neon()
161 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_ukernel_8x8s4__neon()
170 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_8x8s4__neon()
182 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_ukernel_8x8s4__neon()
190 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_ukernel_8x8s4__neon()
199 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_8x8s4__neon()
[all …]
8x8s4-neonfma.c
111 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_ukernel_8x8s4__neonfma() local
124 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
132 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
141 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
153 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
161 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
170 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
182 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
190 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
199 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_ukernel_8x8s4__neonfma()
[all …]
6x8-neonfma-lane-ld128.c
95 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() local
106 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
112 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
122 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
128 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
138 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
144 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
154 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
160 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128()
169 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() local
[all …]
6x8-neon-lane-ld128.c
95 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() local
106 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
112 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
122 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
128 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
138 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
144 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
154 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
160 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128()
169 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() local
[all …]
6x8-neon-lane-ld64.c
95 const float32x2_t va5 = vld1_f32(a5); a5 += 2; in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64() local
105 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64()
111 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64()
120 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64()
126 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64()
134 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64() local
144 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64()
150 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64()
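All of the s4 kernels above share one inner-loop idiom: a vector of four A elements per row (va5 is row 5) is multiply-accumulated against the current packed B vectors, then rotated one lane (vextq_f32 on NEON, __builtin_shufflevector or __builtin_shuffle on psimd) so the next shift position's B vectors meet the next A element in every lane. A minimal single-row sketch of that idiom, assuming AArch64 NEON and B pre-packed to match the four shift positions; names are illustrative, not XNNPACK's:

#include <arm_neon.h>
#include <stddef.h>

void f32_gemm_1x8s4_sketch(size_t kc, const float* a, const float* w, float* c)
{
  float32x4_t vacc0123 = vld1q_f32(w); w += 4;  /* packed init (bias or acc) */
  float32x4_t vacc4567 = vld1q_f32(w); w += 4;
  for (size_t k = kc; k >= 4; k -= 4) {
    float32x4_t va = vld1q_f32(a); a += 4;      /* four A elements at once */
    for (int s = 0; s < 4; s++) {               /* four shift positions c0..c3 */
      const float32x4_t vb0123 = vld1q_f32(w); w += 4;  /* vb0123c{s} */
      const float32x4_t vb4567 = vld1q_f32(w); w += 4;  /* vb4567c{s} */
      vacc0123 = vfmaq_f32(vacc0123, va, vb0123);
      vacc4567 = vfmaq_f32(vacc4567, va, vb4567);
      va = vextq_f32(va, va, 1);                /* rotate lanes to 1,2,3,0 */
    }
  }
  vst1q_f32(c, vacc0123);
  vst1q_f32(c + 4, vacc4567);
}

After four rotations va is back in its original order and the next four A elements are loaded; the real 6x8s4 and 8x8s4 kernels keep six or eight such row vectors (va0 through va5 or va7) and their accumulators in flight at once.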
/external/XNNPACK/src/f32-gemm/gen/
6x8s4-psimd.c
98 psimd_f32 va5 = psimd_load_f32(a5); in xnn_f32_gemm_ukernel_6x8s4__psimd() local
110 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
116 vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
124 va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
131 va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemm_ukernel_6x8s4__psimd()
142 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_ukernel_6x8s4__psimd()
148 vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_ukernel_6x8s4__psimd()
156 va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__psimd()
163 va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_gemm_ukernel_6x8s4__psimd()
174 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_ukernel_6x8s4__psimd()
[all …]
6x8s4-neon.c
93 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_ukernel_6x8s4__neon() local
104 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_ukernel_6x8s4__neon()
110 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_ukernel_6x8s4__neon()
117 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_6x8s4__neon()
127 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_ukernel_6x8s4__neon()
133 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_ukernel_6x8s4__neon()
140 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_6x8s4__neon()
150 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_ukernel_6x8s4__neon()
156 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_ukernel_6x8s4__neon()
163 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_6x8s4__neon()
[all …]
6x8s4-neonfma.c
93 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_ukernel_6x8s4__neonfma() local
104 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
110 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
117 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
127 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
133 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
140 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
150 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
156 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
163 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_6x8s4__neonfma()
[all …]
8x8s4-neonfma.c
109 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_ukernel_8x8s4__neonfma() local
122 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
130 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
139 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
151 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
159 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
168 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
180 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
188 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
197 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_8x8s4__neonfma()
[all …]
8x8s4-neon.c
109 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_ukernel_8x8s4__neon() local
122 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_ukernel_8x8s4__neon()
130 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_ukernel_8x8s4__neon()
139 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_8x8s4__neon()
151 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_ukernel_8x8s4__neon()
159 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_ukernel_8x8s4__neon()
168 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_8x8s4__neon()
180 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_ukernel_8x8s4__neon()
188 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_ukernel_8x8s4__neon()
197 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_ukernel_8x8s4__neon()
[all …]
6x8-neonfma-lane-ld128.c
93 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() local
104 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
110 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
120 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
126 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
136 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
142 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
152 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
158 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128()
167 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() local
[all …]
6x8-neon-lane-ld128.c
93 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() local
104 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
110 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
120 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
126 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
136 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
142 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
152 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
158 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128()
167 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() local
[all …]
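The non-s4 "lane ld128" kernels take the other approach visible above: va5 is loaded once per four K steps and consumed lane by lane with vfmaq_lane_f32 (vmlaq_lane_f32 on plain NEON), broadcasting through vget_low_f32/vget_high_f32 instead of rotating. A sketch of one such step for a single accumulator, assuming AArch64 NEON with the four per-step B vectors packed consecutively; illustrative names only:

#include <arm_neon.h>

float32x4_t f32_dot4_ld128_sketch(float32x4_t vacc, const float* a, const float* w)
{
  const float32x4_t va = vld1q_f32(a);  /* a[0..3], loaded once */
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(w +  0), vget_low_f32(va),  0);  /* + a[0]*b(c0) */
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(w +  4), vget_low_f32(va),  1);  /* + a[1]*b(c1) */
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(w +  8), vget_high_f32(va), 0);  /* + a[2]*b(c2) */
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(w + 12), vget_high_f32(va), 1);  /* + a[3]*b(c3) */
  return vacc;
}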
/external/XNNPACK/src/f32-igemm/gen/
6x8s4-psimd.c
126 psimd_f32 va5 = psimd_load_f32(a5); in xnn_f32_igemm_ukernel_6x8s4__psimd() local
138 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
144 vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
152 va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
159 va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_igemm_ukernel_6x8s4__psimd()
170 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_ukernel_6x8s4__psimd()
176 vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_ukernel_6x8s4__psimd()
184 va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_6x8s4__psimd()
191 va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 }); in xnn_f32_igemm_ukernel_6x8s4__psimd()
202 vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_ukernel_6x8s4__psimd()
[all …]
6x8s4-neonfma.c
120 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_ukernel_6x8s4__neonfma() local
131 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
137 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
144 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
154 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
160 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
167 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
177 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
183 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
190 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_6x8s4__neonfma()
[all …]
6x8s4-neon.c
120 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_ukernel_6x8s4__neon() local
131 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_ukernel_6x8s4__neon()
137 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_ukernel_6x8s4__neon()
144 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_6x8s4__neon()
154 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_ukernel_6x8s4__neon()
160 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_ukernel_6x8s4__neon()
167 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_6x8s4__neon()
177 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_ukernel_6x8s4__neon()
183 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_ukernel_6x8s4__neon()
190 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_6x8s4__neon()
[all …]
8x8s4-neon.c
142 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_ukernel_8x8s4__neon() local
155 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_ukernel_8x8s4__neon()
163 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_ukernel_8x8s4__neon()
172 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_8x8s4__neon()
184 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_ukernel_8x8s4__neon()
192 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_ukernel_8x8s4__neon()
201 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_8x8s4__neon()
213 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_ukernel_8x8s4__neon()
221 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_ukernel_8x8s4__neon()
230 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_8x8s4__neon()
[all …]
8x8s4-neonfma.c
142 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_ukernel_8x8s4__neonfma() local
155 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
163 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
172 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
184 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
192 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
201 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
213 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
221 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
230 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_ukernel_8x8s4__neonfma()
[all …]
6x8-neonfma-lane-ld128.c
121 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() local
132 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
138 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
148 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
154 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
164 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
170 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
180 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
186 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128()
195 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() local
[all …]
6x8-neon-lane-ld128.c
121 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() local
132 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
138 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
148 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
154 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
164 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
170 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
180 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
186 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128()
195 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() local
[all …]
6x8-neon-lane-ld64.c
121 const float32x2_t va5 = vld1_f32(a5); a5 += 2; in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64() local
131 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64()
137 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64()
146 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64()
152 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64()
161 const float32x4_t va5 = vld1q_dup_f32(a5); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64() local
171 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64()
177 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64()
6x8-neonfma-lane-ld64.c
121 const float32x2_t va5 = vld1_f32(a5); a5 += 2; in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64() local
131 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64()
137 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64()
146 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64()
152 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64()
161 const float32x4_t va5 = vld1q_dup_f32(a5); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64() local
171 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64()
177 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64()
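The igemm variants differ from their gemm counterparts mainly in that the A row pointers (a5 and friends) come from an indirection buffer rather than a fixed stride, and both families end in the same remainder path visible above: once fewer than a full vector of K remains, each A scalar is broadcast with vld1q_dup_f32 and folded in one element at a time. A sketch of that tail loop, assuming k counts the remaining elements; illustrative names only:

#include <arm_neon.h>
#include <stddef.h>

void f32_gemm_tail_sketch(size_t k, const float* a, const float* w,
                          float32x4_t* vacc0123, float32x4_t* vacc4567)
{
  while (k != 0) {
    const float32x4_t va = vld1q_dup_f32(a); a += 1;   /* broadcast a[0] to all lanes */
    const float32x4_t vb0123 = vld1q_f32(w); w += 4;
    const float32x4_t vb4567 = vld1q_f32(w); w += 4;
    *vacc0123 = vfmaq_f32(*vacc0123, va, vb0123);
    *vacc4567 = vfmaq_f32(*vacc4567, va, vb4567);
    k -= 1;
  }
}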
/external/XNNPACK/src/f16-gemm/gen/
6x8-neonfp16arith-ld64.c
87 const float16x4_t va5 = vld1_f16(a5); a5 += 4; in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() local
97 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
104 const float16x8_t va5c0 = vdupq_lane_f16(va5, 0); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
121 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
128 const float16x8_t va5c1 = vdupq_lane_f16(va5, 1); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
145 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
152 const float16x8_t va5c2 = vdupq_lane_f16(va5, 2); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
169 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
176 const float16x8_t va5c3 = vdupq_lane_f16(va5, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64()
195 const float16x8_t va5 = vld1q_dup_f16(a5); a5 += 1; in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() local
[all …]
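The f16 kernel is the same lane-by-lane scheme in half precision: four A values per vld1_f16, consumed with vfmaq_lane_f16 where the compiler provides the lane form, otherwise broadcast explicitly with vdupq_lane_f16 as the snippet's second form (va5c0 through va5c3) shows. A sketch of one step, assuming ARMv8.2-A FP16 vector arithmetic and consecutively packed B rows; illustrative names only:

#include <arm_neon.h>

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
float16x8_t f16_dot4_ld64_sketch(float16x8_t vacc, const float16_t* a, const float16_t* w)
{
  const float16x4_t va = vld1_f16(a);  /* a[0..3] in fp16 */
  vacc = vfmaq_lane_f16(vacc, vld1q_f16(w +  0), va, 0);  /* + a[0]*b(c0) */
  vacc = vfmaq_lane_f16(vacc, vld1q_f16(w +  8), va, 1);  /* + a[1]*b(c1) */
  vacc = vfmaq_lane_f16(vacc, vld1q_f16(w + 16), va, 2);  /* + a[2]*b(c2) */
  vacc = vfmaq_lane_f16(vacc, vld1q_f16(w + 24), va, 3);  /* + a[3]*b(c3) */
  return vacc;
}
#endif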
