• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/gemm.h>
16 
17 
xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup(
19     size_t mr,
20     size_t nc,
21     size_t kc,
22     const int8_t* restrict a,
23     size_t a_stride,
24     const void* restrict w,
25     int8_t* restrict c,
26     size_t cm_stride,
27     size_t cn_stride,
28     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
29 {
30   assert(mr != 0);
31   assert(mr <= 4);
32   assert(nc != 0);
33   assert(kc != 0);
34   assert(kc % sizeof(int8_t) == 0);
35   assert(a != NULL);
36   assert(w != NULL);
37   assert(c != NULL);
38 
39   const int8_t* a0 = a;
40   int8_t* c0 = c;
41   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
42   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
43   if XNN_UNPREDICTABLE(mr < 2) {
44     a1 = a0;
45     c1 = c0;
46   }
47   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
48   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
49   if XNN_UNPREDICTABLE(mr <= 2) {
50     a2 = a1;
51     c2 = c1;
52   }
53   const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
54   int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
55   if XNN_UNPREDICTABLE(mr != 4) {
56     a3 = a2;
57     c3 = c2;
58   }
59 
60   do {
61     int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
62     int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
63     int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
64     int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
65     int32x4_t vacc1x0123 = vacc0x0123;
66     int32x4_t vacc1x4567 = vacc0x4567;
67     int32x4_t vacc1x89AB = vacc0x89AB;
68     int32x4_t vacc1xCDEF = vacc0xCDEF;
69     int32x4_t vacc2x0123 = vacc0x0123;
70     int32x4_t vacc2x4567 = vacc0x4567;
71     int32x4_t vacc2x89AB = vacc0x89AB;
72     int32x4_t vacc2xCDEF = vacc0xCDEF;
73     int32x4_t vacc3x0123 = vacc0x0123;
74     int32x4_t vacc3x4567 = vacc0x4567;
75     int32x4_t vacc3x89AB = vacc0x89AB;
76     int32x4_t vacc3xCDEF = vacc0xCDEF;
77 
78     size_t k = kc;
79     while (k >= 8 * sizeof(int8_t)) {
80       const int8x8_t va0 = vld1_s8(a0); a0 += 8;
81       const int8x8_t va1 = vld1_s8(a1); a1 += 8;
82       const int8x8_t va2 = vld1_s8(a2); a2 += 8;
83       const int8x8_t va3 = vld1_s8(a3); a3 += 8;
84 
85       const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
86 
87       const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
88       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
89       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
90       const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
91       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
92       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
93       const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
94       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
95       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
96       const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
97       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
98       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
99       const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
100 
101       const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
102       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
103       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
104       const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
105       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
106       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
107       const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
108       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
109       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
110       const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
111       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
112       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
113       const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
114 
115       const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
116       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
117       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
118       const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
119       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
120       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
121       const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
122       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
123       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
124       const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
125       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
126       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
127       const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
128 
129       const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
130       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
131       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
132       const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
133       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
134       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
135       const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
136       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
137       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
138       const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
139       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
140       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
141       const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
142 
143       const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
144       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
145       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
146       const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
147       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
148       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
149       const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
150       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
151       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
152       const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
153       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
154       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
155       const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
156 
157       const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
158       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
159       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
160       const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
161       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
162       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
163       const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
164       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
165       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
166       const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
167       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
168       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
169       const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
170 
171       const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
172       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
173       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
174       const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
175       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
176       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
177       const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
178       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
179       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
180       const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
181       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
182       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
183       const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
184 
185       const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
186       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
187       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
188       const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
189       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
190       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
191       const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
192       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
193       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
194       const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
195       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
196       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
197       const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
198 
199       const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
200       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
201       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
202       const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
203       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
204       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
205       const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
206       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
207       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
208       const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
209       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
210       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
211       const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
212 
213       const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
214       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
215       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
216       const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
217       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
218       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
219       const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
220       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
221       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
222       const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
223       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
224       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
225       const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
226 
227       const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
228       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
229       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
230       const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
231       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
232       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
233       const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
234       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
235       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
236       const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
237       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
238       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
239       const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
240 
241       const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
242       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
243       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
244       const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
245       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
246       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
247       const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
248       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
249       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
250       const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
251       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
252       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
253       const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
254 
255       const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
256       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
257       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
258       const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
259       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
260       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
261       const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
262       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
263       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
264       const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
265       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
266       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
267       const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
268 
269       const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
270       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
271       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
272       const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
273       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
274       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
275       const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
276       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
277       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
278       const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
279       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
280       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
281       const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
282 
283       const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
284       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
285       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
286       const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
287       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
288       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
289       const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
290       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
291       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
292       const int16x8_t vprod3x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va3, 7));
293       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c7));
294       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
295       const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
296 
297       const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
298       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
299       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
300       const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
301       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
302       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
303       const int16x8_t vprod2x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va2, 7));
304       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
305       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7));
306       const int16x8_t vprod3x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va3, 7));
307       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7));
308       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7));
309 
310       k -= 8 * sizeof(int8_t);
311     }
312     if XNN_UNLIKELY(k != 0) {
313       const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
314       const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
315       const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
316       const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
317 
318       const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
319       const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
320 
321       const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
322       vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
323       vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
324       const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
325       vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
326       vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
327       const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
328       vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
329       vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
330       const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
331       vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
332       vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
333       const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
334       vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
335       vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
336       const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
337       vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
338       vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
339       const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
340       vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
341       vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
342       const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
343       vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
344       vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
345 
346       if (k >= 2 * sizeof(int8_t)) {
347         const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
348         const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
349 
350         const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
351         vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
352         vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
353         const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
354         vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
355         vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
356         const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
357         vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
358         vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
359         const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
360         vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
361         vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
362         const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
363         vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
364         vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
365         const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
366         vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
367         vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
368         const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
369         vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
370         vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
371         const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
372         vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
373         vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
374 
375         if (k > 2 * sizeof(int8_t)) {
376           const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
377           const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
378 
379           const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
380           vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
381           vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
382           const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
383           vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
384           vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
385           const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
386           vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
387           vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
388           const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
389           vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
390           vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
391           const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
392           vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
393           vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
394           const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
395           vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
396           vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
397           const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
398           vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
399           vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
400           const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
401           vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
402           vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
403 
404           if (k >= 4 * sizeof(int8_t)) {
405             const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
406             const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
407 
408             const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
409             vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
410             vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
411             const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
412             vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
413             vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
414             const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
415             vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
416             vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
417             const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
418             vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
419             vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
420             const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
421             vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
422             vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
423             const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
424             vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
425             vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
426             const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
427             vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
428             vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
429             const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
430             vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
431             vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
432 
433             if (k > 4 * sizeof(int8_t)) {
434               const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
435               const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
436 
437               const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
438               vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
439               vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
440               const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
441               vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
442               vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
443               const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
444               vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
445               vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
446               const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
447               vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
448               vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
449               const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
450               vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
451               vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
452               const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
453               vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
454               vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
455               const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
456               vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
457               vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
458               const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
459               vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
460               vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
461 
462               if (k >= 6 * sizeof(int8_t)) {
463                 const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
464                 const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
465 
466                 const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
467                 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
468                 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
469                 const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
470                 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
471                 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
472                 const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
473                 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
474                 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
475                 const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
476                 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
477                 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
478                 const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
479                 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
480                 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
481                 const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
482                 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
483                 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
484                 const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
485                 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
486                 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
487                 const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
488                 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
489                 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
490 
491                 if (k > 6 * sizeof(int8_t)) {
492                   const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
493                   const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
494 
495                   const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
496                   vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
497                   vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
498                   const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
499                   vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
500                   vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
501                   const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
502                   vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
503                   vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
504                   const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
505                   vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
506                   vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
507                   const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
508                   vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
509                   vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
510                   const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
511                   vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
512                   vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
513                   const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
514                   vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
515                   vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
516                   const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
517                   vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
518                   vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
519                 }
520               }
521             }
522           }
523         }
524       }
525     }
526 
527     // Post-accumulation work
528     const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
529     const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
530     const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
531 
532     vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
533     vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
534     vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
535     vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
536     vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
537     vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
538     vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift);
539     vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift);
540     vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift);
541     vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift);
542     vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift);
543     vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift);
544     vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift);
545     vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift);
546     vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift);
547     vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift);
548 
549     vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
550     vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
551     vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
552     vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
553     vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
554     vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
555     vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier);
556     vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier);
557     vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
558     vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);
559     vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier);
560     vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier);
561     vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier);
562     vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier);
563     vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier);
564     vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier);
565 
566     vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
567     vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
568     vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
569     vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
570     vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
571     vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
572     vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift);
573     vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift);
574     vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
575     vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);
576     vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift);
577     vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift);
578     vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift);
579     vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift);
580     vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift);
581     vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift);
582 
583     const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
584 #if XNN_ARCH_ARM64
585     const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
586     const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
587     const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
588     const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
589     const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
590     const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
591     const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
592     const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
593 
594     int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
595     int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
596     int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
597     int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
598 #else
599     const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
600     const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
601     const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
602     const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
603     const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
604     const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
605     const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
606     const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
607 
608     int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
609     int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
610     int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
611     int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
612 #endif
613     const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
614     const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
615 
616     vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
617     vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
618     vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
619     vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
620 
621     vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
622     vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
623     vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
624     vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
625 
626     if (nc >= 16) {
627       // Main case where there the 16 columns fit in the destination.
628       vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
629       vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
630       vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
631       vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
632 
633       // Advance to the next 16 columns.
634       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
635       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
636       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
637       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
638 
639       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
640       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
641       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
642       a3 = (const int8_t*) ((uintptr_t) a3 - kc);
643 
644       nc -= 16;
645     } else {
646       // Final case where not all of the 16 columns fit in the destination.
647       int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
648       int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
649       if (nc & 8) {
650         vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
651         vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
652         vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
653         vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
654         vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
655         vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
656       }
657       if (nc & 4) {
658         vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
659         vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
660         vst1q_lane_u32((void*) c2, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
661         vst1q_lane_u32((void*) c3, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
662         vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
663         vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
664       }
665       if (nc & 2) {
666         vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
667         vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
668         vst1q_lane_u16((void*) c2, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
669         vst1q_lane_u16((void*) c3, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
670         vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
671         vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
672       }
673       if (nc & 1) {
674         vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
675         vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
676         vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
677         vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
678       }
679 
680       nc = 0;
681     }
682   } while (nc != 0);
683 }
684