Home
last modified time | relevance | path

Searched refs: vacc4x01234567 (Results 1 – 25 of 52) sorted by relevance

123

/external/XNNPACK/src/f16-gemm/gen/
D6x8-minmax-neonfp16arith-ld64.c79 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64() local
98 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
112 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
122 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
136 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
146 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
160 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
170 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
184 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
205 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
[all …]
D8x8-minmax-neonfp16arith-ld64.c91 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64() local
114 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
132 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
144 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
162 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
174 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
192 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
204 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
222 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
247 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
[all …]
D6x16-minmax-neonfp16arith-ld64.c83 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local
105 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
125 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
142 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
162 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
179 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
199 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
216 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
236 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
264 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
[all …]
/external/XNNPACK/src/f16-igemm/gen/
D6x8-minmax-neonfp16arith-ld64.c71 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64() local
124 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
138 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
148 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
162 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
172 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
186 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
196 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
210 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
229 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
[all …]
D8x8-minmax-neonfp16arith-ld64.c79 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64() local
146 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
164 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
176 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
194 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
206 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
224 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
236 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
254 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
277 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
[all …]
D6x16-minmax-neonfp16arith-ld64.c75 float16x8_t vacc4x01234567 = vacc0x01234567; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local
131 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
151 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
168 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
188 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
205 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
225 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
242 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
262 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
288 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
[all …]
/external/XNNPACK/src/f16-gemm/gen-inc/
D6x8inc-minmax-neonfp16arith-ld64.c81 float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t)); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64() local
100 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
114 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
124 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
138 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
148 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
162 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
172 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
186 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
207 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
[all …]
D8x8inc-minmax-neonfp16arith-ld64.c93 float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t)); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64() local
116 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
134 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
146 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
164 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
176 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
194 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
206 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
224 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
249 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
[all …]
D6x16inc-minmax-neonfp16arith-ld64.c85 float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t)); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() local
107 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
127 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
144 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
164 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
181 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
201 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
218 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
238 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
266 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
[all …]
D8x16inc-minmax-neonfp16arith-ld64.c97 float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t)); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() local
125 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
151 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
172 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
198 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
219 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
245 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
266 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
292 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
326 vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
[all …]
/external/XNNPACK/src/f32-gemm/gen/
D5x16s4-minmax-fma3-broadcast.c74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local
99 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
119 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
139 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
159 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
191 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
207 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
219 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
227 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
252 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
[all …]
D5x8-minmax-fma3-broadcast.c70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast() local
93 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast()
103 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast()
110 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast()
113 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast()
132 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast()
144 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast()
D5x8-minmax-avx-broadcast.c70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast() local
93 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast()
103 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast()
110 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast()
113 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast()
132 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast()
144 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast()
D5x16-minmax-avx-broadcast.c74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast() local
99 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
114 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
126 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
134 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
159 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
165 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
177 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
189 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast()
D5x16-minmax-fma3-broadcast.c74 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast() local
99 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
114 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
126 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
134 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
159 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
165 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
177 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
189 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast()
/external/XNNPACK/src/f32-igemm/gen/
D5x16s4-minmax-fma3-broadcast.c70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local
124 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
144 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
164 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
184 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
216 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
234 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
246 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
254 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
274 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
[all …]
D5x16-minmax-avx-broadcast.c70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast() local
128 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
140 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
152 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
160 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
180 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
186 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
198 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
210 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
D5x16-minmax-fma3-broadcast.c70 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast() local
128 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
140 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
152 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
160 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
180 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
186 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
198 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
210 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
D5x8-minmax-fma3-broadcast.c66 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast() local
118 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast()
129 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast()
136 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast()
139 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast()
153 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast()
165 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast()
D5x8-minmax-avx-broadcast.c66 __m256 vacc4x01234567 = vacc0x01234567; in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast() local
118 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast()
129 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast()
136 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast()
139 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast()
153 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast()
165 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast()
/external/XNNPACK/src/f32-gemm/gen-inc/
D5x16s4inc-minmax-fma3-broadcast.c76 __m256 vacc4x01234567 = _mm256_load_ps(acc + 64); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() local
101 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
121 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
141 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
161 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
193 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
209 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
221 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
229 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
254 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
[all …]
D5x8inc-minmax-fma3-broadcast.c72 __m256 vacc4x01234567 = _mm256_load_ps(acc + 32); in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast() local
95 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast()
105 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast()
112 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast()
115 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast()
134 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast()
146 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast()
D5x8inc-minmax-avx-broadcast.c72 __m256 vacc4x01234567 = _mm256_load_ps(acc + 32); in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast() local
95 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast()
105 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast()
112 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast()
115 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast()
134 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast()
146 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast()
D5x16inc-minmax-avx-broadcast.c76 __m256 vacc4x01234567 = _mm256_load_ps(acc + 64); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast() local
101 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567)); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
116 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
128 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
136 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
161 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
167 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
179 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
191 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
D5x16inc-minmax-fma3-broadcast.c76 __m256 vacc4x01234567 = _mm256_load_ps(acc + 64); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast() local
101 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
116 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
128 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
136 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
161 _mm256_storeu_ps(c4, vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
167 vacc4x01234567 = vacc4x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
179 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
191 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()

123