Home
last modified time | relevance | path

Searched refs:va2x1 (Results 1 – 25 of 68) sorted by relevance

123

/external/XNNPACK/src/qs8-igemm/gen/
D3x8c2s4-minmax-rndnu-neon-mlal.c85 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal() local
102 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
112 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
121 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
128 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
138 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
147 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
154 vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
164 vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
173 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
[all …]
D3x16c2s4-minmax-rndnu-neon-mlal.c91 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local
116 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
126 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
136 vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
146 vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
155 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
162 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
172 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
182 vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
192 vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
[all …]
D4x8c2s4-minmax-rndnu-neon-mlal.c95 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal() local
115 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
128 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
139 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
149 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
162 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
173 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
183 vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
196 vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
207 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
[all …]
D4x16c2s4-minmax-rndnu-neon-mlal.c103 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local
131 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
144 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
157 vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
170 vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
181 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
191 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
204 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
217 vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
230 vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
[all …]
D3x16c4s2-minmax-rndnu-neon-mlal.c103 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal() local
128 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
138 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
148 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
158 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
168 vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
178 vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
188 vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
198 vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
207 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
[all …]
D3x8c4s2-minmax-rndnu-neon-mlal.c91 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal() local
108 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
118 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
128 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
138 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
147 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
154 vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
164 vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
174 vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
184 vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
D4x8c4s2-minmax-rndnu-neon-mlal.c103 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal() local
123 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
136 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
149 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
162 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
173 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
183 vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
196 vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
209 vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
222 vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
D4x16c4s2-minmax-rndnu-neon-mlal.c119 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal() local
147 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
160 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
173 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
186 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
199 vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
212 vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
225 vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
238 vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
249 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
[all …]
D3x16c8-minmax-rndnu-neon-mlal.c128 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() local
153 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
163 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
173 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
183 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
193 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
203 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
213 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
223 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
233 vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
[all …]
D3x8c8-minmax-rndnu-neon-mlal.c104 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal() local
121 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
131 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
141 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
151 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
161 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
171 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
181 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
191 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
D4x16c8-minmax-rndnu-neon-mlal.c152 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() local
180 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
193 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
206 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
219 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
232 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
245 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
258 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
271 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
284 vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
[all …]
D4x8c8-minmax-rndnu-neon-mlal.c120 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal() local
140 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
153 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
166 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
179 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
192 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
205 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
218 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
231 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
/external/XNNPACK/src/qs8-gemm/gen/
D3x8c2s4-minmax-rndnu-neon-mlal.c70 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal() local
87 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
97 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
106 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
113 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
123 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
132 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
139 vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
149 vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
158 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal()
[all …]
D3x16c2s4-minmax-rndnu-neon-mlal.c76 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local
101 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
111 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
121 vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
131 vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
140 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
147 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
157 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
167 vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
177 vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
[all …]
D4x8c2s4-minmax-rndnu-neon-mlal.c78 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal() local
98 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
111 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
122 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
132 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
145 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
156 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
166 vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
179 vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
190 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal()
[all …]
D4x16c2s4-minmax-rndnu-neon-mlal.c86 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local
114 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
127 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
140 vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
153 vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
164 va2x1 = vext_s8(va2x1, va2x1, 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
174 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
187 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
200 vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
213 vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
[all …]
D3x16c4s2-minmax-rndnu-neon-mlal.c88 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal() local
113 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
123 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
133 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
143 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
153 vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
163 vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
173 vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
183 vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
192 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal()
[all …]
D3x8c4s2-minmax-rndnu-neon-mlal.c76 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal() local
93 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
103 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
113 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
123 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
132 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
139 vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
149 vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
159 vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
169 vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal()
D4x8c4s2-minmax-rndnu-neon-mlal.c86 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal() local
106 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
119 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
132 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
145 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
156 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
166 vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
179 vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
192 vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
205 vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal()
D4x16c4s2-minmax-rndnu-neon-mlal.c102 int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal() local
130 vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
143 vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
156 vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
169 vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
182 vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
195 vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
208 vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
221 vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
232 va2x1 = vext_s8(va2x1, va2x1, 4); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal()
[all …]
D3x16c8-minmax-rndnu-neon-mlal.c113 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() local
138 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
148 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
158 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
168 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
178 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
188 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
198 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
208 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
218 vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
[all …]
D3x8c8-minmax-rndnu-neon-mlal.c89 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal() local
106 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
116 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
126 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
136 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
146 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
156 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
166 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
176 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal()
D4x16c8-minmax-rndnu-neon-mlal.c135 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() local
163 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
176 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
189 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
202 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
215 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
228 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
241 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
254 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
267 vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
[all …]
D4x8c8-minmax-rndnu-neon-mlal.c103 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal() local
123 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
136 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
149 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
162 vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
175 vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
188 vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
201 vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
214 vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal()
/external/XNNPACK/src/bf16-gemm/gen/
D3x4c8-minmax-neonbf16-bfmlal.c136 … const bfloat16x8_t va2x1 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm1)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal() local
137 vacc2x1 = vbfmlalbq_f32(vacc2x1, va2x1, vb1); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal()
138 vacc2x1 = vbfmlaltq_f32(vacc2x1, va2x1, vb1); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal()

123