
Searched refs: va3x0 (Results 1 – 25 of 44), sorted by relevance


/external/XNNPACK/src/qs8-igemm/gen/
4x8c2s4-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal; c2s4 shuffle pattern, sketched below)
   96:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  111:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
  124:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  140:  va3x0 = vext_s8(va3x0, va3x0, 2);
  145:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  158:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  174:  va3x0 = vext_s8(va3x0, va3x0, 2);
  179:  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
  192:  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
  208:  va3x0 = vext_s8(va3x0, va3x0, 2);
  [all …]
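
The hits above trace the 2-lane shuffle (c2s4) inner loop: eight int8 values per A row are loaded once, multiplied against packed B columns with vmull_s8, then rotated two bytes with vext_s8 so the next lane group lines up. Below is a minimal loop-rolled sketch of one k-block for one row; the generated kernels unroll this fully, and the mlal variants additionally fold a second k-block in with vmlal_s8 before accumulating. All pointer, helper, and accumulator names here are illustrative, not XNNPACK's own.

    #include <arm_neon.h>

    // One 8-element k-block of the c2s4 pattern for A row 3 against
    // 8 packed B columns. After vpadalq_s16, each int32 accumulator
    // lane holds the running partial sum for one column.
    static void qs8_c2s4_kblock(const int8_t** a3, const int8_t** w,
                                int32x4_t* vacc3x0123, int32x4_t* vacc3x4567) {
      int8x8_t va3x0 = vld1_s8(*a3); *a3 += 8;          // k 0..7 of row 3
      for (int c = 0; c < 4; c++) {                     // 4 lane groups of 2
        const int8x8_t vb0123 = vld1_s8(*w); *w += 8;   // cols 0..3, 2 k each (shuffle-packed)
        const int8x8_t vb4567 = vld1_s8(*w); *w += 8;   // cols 4..7, 2 k each
        const int16x8_t vprod3x0123 = vmull_s8(vb0123, va3x0);
        const int16x8_t vprod3x4567 = vmull_s8(vb4567, va3x0);
        *vacc3x0123 = vpadalq_s16(*vacc3x0123, vprod3x0123);  // 8x int16 -> 4x int32
        *vacc3x4567 = vpadalq_s16(*vacc3x4567, vprod3x4567);
        va3x0 = vext_s8(va3x0, va3x0, 2);               // rotate to next 2-lane group
      }
    }
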
4x16c2s4-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal)
  104:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  127:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
  140:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  153:  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
  166:  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
  182:  va3x0 = vext_s8(va3x0, va3x0, 2);
  187:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  200:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  213:  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
  226:  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
  [all …]
4x8c2s4-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull)
   93:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  107:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
  115:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  123:  va3x0 = vext_s8(va3x0, va3x0, 2);
  127:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  135:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  143:  va3x0 = vext_s8(va3x0, va3x0, 2);
  147:  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
  155:  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
  163:  va3x0 = vext_s8(va3x0, va3x0, 2);
  [all …]
4x16c2s4-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull)
  101:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  123:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
  131:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  139:  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
  147:  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
  155:  va3x0 = vext_s8(va3x0, va3x0, 2);
  159:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  167:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  175:  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
  183:  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
  [all …]
4x8c4s2-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal; c4s2 shuffle pattern, sketched below)
  104:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  119:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  132:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  145:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  158:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  174:  va3x0 = vext_s8(va3x0, va3x0, 4);
  179:  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
  192:  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
  205:  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
  218:  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
  [all …]
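
The c4s2 files vary only the shuffle geometry: each packed B vector covers a pair of columns with four k values per column, so there are two 4-lane groups with a single four-byte vext_s8 rotation between them. A sketch under the same illustrative naming as above (the real kernels are unrolled and, in the mlal variants, chain vmlal_s8 for a second k-block):

    #include <arm_neon.h>

    // One 8-element k-block of the c4s2 pattern for A row 3. Each vb
    // covers a column pair; after vpadalq_s16, lanes 0..1 of each
    // accumulator belong to the even column and lanes 2..3 to the odd.
    static void qs8_c4s2_kblock(const int8_t** a3, const int8_t** w,
                                int32x4_t vacc3[4]) {
      int8x8_t va3x0 = vld1_s8(*a3); *a3 += 8;
      for (int c = 0; c < 2; c++) {                 // 2 lane groups of 4
        for (int p = 0; p < 4; p++) {               // column pairs 01, 23, 45, 67
          const int8x8_t vb = vld1_s8(*w); *w += 8; // pair p, 4 k per column
          vacc3[p] = vpadalq_s16(vacc3[p], vmull_s8(vb, va3x0));
        }
        va3x0 = vext_s8(va3x0, va3x0, 4);           // rotate to the other 4-lane group
      }
    }
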
4x16c4s2-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal)
  120:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  143:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  156:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  169:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  182:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  195:  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
  208:  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
  221:  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
  234:  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
  250:  va3x0 = vext_s8(va3x0, va3x0, 4);
  [all …]
4x8c4s2-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull)
  101:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  115:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  123:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  131:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  139:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  147:  va3x0 = vext_s8(va3x0, va3x0, 4);
  151:  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
  159:  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
  167:  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
  175:  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
4x16c4s2-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull)
  117:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  139:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  147:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  155:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  163:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  171:  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
  179:  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
  187:  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
  195:  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
  203:  va3x0 = vext_s8(va3x0, va3x0, 4);
  [all …]
4x16c8-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal)
  153:  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  177:  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
  190:  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
  203:  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
  216:  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
  229:  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
  242:  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
  255:  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
  268:  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
  281:  int16x8_t vprod3x8 = vmull_s8(vb8x0, va3x0);
  [all …]
4x8c8-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal; c8 MLAL pattern, sketched below)
  121:  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  137:  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
  150:  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
  163:  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
  176:  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
  189:  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
  202:  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
  215:  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
  228:  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
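
The c8 kernels drop the shuffle entirely: each (row, column) pair owns an accumulator, two 8-byte k slices are loaded per row, and the mlal variant chains vmull_s8 with vmlal_s8 so only one widening vpadalq_s16 is paid per 16 k values. A sketch for row 3 against column 0, with illustrative names and the final reduction to scalars left out:

    #include <arm_neon.h>

    // 16 k values of row 3 against column 0 in the c8 MLAL pattern.
    static int32x4_t qs8_c8_mlal_step(int32x4_t vacc3x0,
                                      const int8_t* a3, const int8_t* w) {
      const int8x8_t va3x0 = vld1_s8(a3);          // row 3, k 0..7
      const int8x8_t va3x1 = vld1_s8(a3 + 8);      // row 3, k 8..15
      const int8x8_t vb0x0 = vld1_s8(w);           // col 0, k 0..7
      const int8x8_t vb0x1 = vld1_s8(w + 8);       // col 0, k 8..15
      int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
      vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1); // fold in the second slice
      return vpadalq_s16(vacc3x0, vprod3x0);       // one widening add per 16 k
    }
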
4x8c2-minmax-rndnu-neon-mlal-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup; dup broadcast pattern, sketched below)
   96:  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  114:  const int8x8_t va3c0x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0));
  149:  const int8x8_t va3c1x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1));
  184:  const int8x8_t va3c2x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2));
  219:  const int8x8_t va3c3x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3));
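
The c2 dup variant never rotates A: each 2-byte lane group is broadcast across the whole vector via an int16 reinterpretation and vdup_lane_s16, so every vmull_s8 sees one group in all positions. A sketch wrapping the four broadcasts from the hits above in a hypothetical helper:

    #include <arm_neon.h>

    // Broadcast each 2-byte (int16) lane of va3x0 across the vector.
    // The lane index to vdup_lane_s16 must be a compile-time constant,
    // hence four explicit statements rather than a loop.
    static void qs8_c2_dup_lanes(const int8x8_t va3x0, int8x8_t out[4]) {
      out[0] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0));
      out[1] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1));
      out[2] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2));
      out[3] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3));
    }
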
4x8c2-minmax-rndnu-neon-mlal-ld4r.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r; ld4r broadcast pattern, sketched below)
   96:  const int16x4x4_t va3x0 = vld4_dup_s16((const void*)a3); a3 += 8;    (local definition)
  114:  const int8x8_t va3c0x0 = vreinterpret_s8_s16(va3x0.val[0]);
  149:  const int8x8_t va3c1x0 = vreinterpret_s8_s16(va3x0.val[1]);
  184:  const int8x8_t va3c2x0 = vreinterpret_s8_s16(va3x0.val[2]);
  219:  const int8x8_t va3c3x0 = vreinterpret_s8_s16(va3x0.val[3]);
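
The ld4r variant reaches the same four broadcast vectors with a single structure load: vld4_dup_s16 reads four 16-bit lanes and replicates each across its own register, so val[0]..val[3] match the dup variant's outputs without any register shuffling. Again a hypothetical helper, not the kernel itself:

    #include <arm_neon.h>

    // One vld4_dup_s16 replaces four vdup_lane_s16 broadcasts: val[i]
    // holds 2-byte lane group i of the row, replicated across the vector.
    static void qs8_c2_ld4r_lanes(const int8_t* a3, int8x8_t out[4]) {
      const int16x4x4_t va3x0 = vld4_dup_s16((const int16_t*) a3);
      out[0] = vreinterpret_s8_s16(va3x0.val[0]);
      out[1] = vreinterpret_s8_s16(va3x0.val[1]);
      out[2] = vreinterpret_s8_s16(va3x0.val[2]);
      out[3] = vreinterpret_s8_s16(va3x0.val[3]);
    }
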
/external/XNNPACK/src/qs8-gemm/gen/
4x8c2s4-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal)
   79:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
   94:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
  107:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  123:  va3x0 = vext_s8(va3x0, va3x0, 2);
  128:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  141:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  157:  va3x0 = vext_s8(va3x0, va3x0, 2);
  162:  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
  175:  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
  191:  va3x0 = vext_s8(va3x0, va3x0, 2);
  [all …]
4x16c2s4-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal)
   87:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  110:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
  123:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  136:  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
  149:  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
  165:  va3x0 = vext_s8(va3x0, va3x0, 2);
  170:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  183:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  196:  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
  209:  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
  [all …]
4x8c2s4-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull)
   76:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
   90:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
   98:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  106:  va3x0 = vext_s8(va3x0, va3x0, 2);
  110:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  118:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  126:  va3x0 = vext_s8(va3x0, va3x0, 2);
  130:  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
  138:  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
  146:  va3x0 = vext_s8(va3x0, va3x0, 2);
  [all …]
4x16c2s4-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull)
   84:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  106:  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
  114:  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
  122:  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
  130:  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
  138:  va3x0 = vext_s8(va3x0, va3x0, 2);
  142:  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
  150:  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
  158:  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
  166:  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
  [all …]
4x8c4s2-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal)
   87:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  102:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  115:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  128:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  141:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  157:  va3x0 = vext_s8(va3x0, va3x0, 4);
  162:  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
  175:  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
  188:  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
  201:  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
  [all …]
4x16c4s2-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal)
  103:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  126:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  139:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  152:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  165:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  178:  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
  191:  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
  204:  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
  217:  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
  233:  va3x0 = vext_s8(va3x0, va3x0, 4);
  [all …]
4x8c4s2-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull)
   84:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
   98:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  106:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  114:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  122:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  130:  va3x0 = vext_s8(va3x0, va3x0, 4);
  134:  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
  142:  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
  150:  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
  158:  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
4x16c4s2-minmax-rndnu-neon-mull.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull)
  100:  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  122:  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
  130:  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
  138:  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
  146:  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
  154:  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
  162:  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
  170:  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
  178:  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
  186:  va3x0 = vext_s8(va3x0, va3x0, 4);
  [all …]
4x16c8-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal)
  136:  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  160:  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
  173:  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
  186:  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
  199:  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
  212:  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
  225:  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
  238:  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
  251:  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
  264:  int16x8_t vprod3x8 = vmull_s8(vb8x0, va3x0);
  [all …]
4x8c8-minmax-rndnu-neon-mlal.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal)
  104:  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
  120:  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
  133:  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
  146:  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
  159:  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
  172:  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
  185:  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
  198:  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
  211:  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
4x8c2-minmax-rndnu-neon-mlal-ld4r.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r)
   79:  const int16x4x4_t va3x0 = vld4_dup_s16((const void*)a3); a3 += 8;    (local definition)
   96:  const int8x8_t va3c0x0 = vreinterpret_s8_s16(va3x0.val[0]);
  131:  const int8x8_t va3c1x0 = vreinterpret_s8_s16(va3x0.val[1]);
  166:  const int8x8_t va3c2x0 = vreinterpret_s8_s16(va3x0.val[2]);
  201:  const int8x8_t va3c3x0 = vreinterpret_s8_s16(va3x0.val[3]);
4x8c2-minmax-rndnu-neon-mlal-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup)
   79:  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;    (local definition)
   96:  const int8x8_t va3c0x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0));
  131:  const int8x8_t va3c1x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1));
  166:  const int8x8_t va3c2x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2));
  201:  const int8x8_t va3c3x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3));
/external/XNNPACK/src/bf16-gemm/gen/
4x4c8-minmax-neonbf16-bfmlal.c  (all hits in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal; masked BFMLAL tail, sketched below)
  150:  … const bfloat16x8_t va3x0 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va3), vm0));    (local definition)
  151:  vacc3x0 = vbfmlalbq_f32(vacc3x0, va3x0, vb0);
  152:  vacc3x0 = vbfmlaltq_f32(vacc3x0, va3x0, vb0);
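
This lone bf16 hit is the k-remainder path of the bfloat16 kernel: a mask (vm0, built earlier in the kernel) flags the tail lanes, vbicq_u16 clears those lanes of A so padding cannot inject NaNs into the product, and the bottom/top BFMLAL intrinsics widen-accumulate the even and odd bfloat16 lanes into float32. A sketch assuming a precomputed mask; compiling it requires a target with the bf16 extension (e.g. armv8.2-a+bf16):

    #include <arm_neon.h>

    // Masked bfloat16 multiply-accumulate for row 3, column group 0.
    // vm0 holds all-ones in the k-remainder lanes that must not contribute.
    static float32x4_t bf16_masked_bfmlal(float32x4_t vacc3x0, bfloat16x8_t va3,
                                          bfloat16x8_t vb0, uint16x8_t vm0) {
      const bfloat16x8_t va3x0 =
          vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va3), vm0));
      vacc3x0 = vbfmlalbq_f32(vacc3x0, va3x0, vb0);  // even (bottom) lanes
      vacc3x0 = vbfmlaltq_f32(vacc3x0, va3x0, vb0);  // odd (top) lanes
      return vacc3x0;
    }
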
