1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gemm/neon-mull-addw-dup.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/common.h>
15 #include <xnnpack/gemm.h>
16
17
xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup(
19 size_t mr,
20 size_t nc,
21 size_t kc,
22 const int8_t* restrict a,
23 size_t a_stride,
24 const void* restrict w,
25 int8_t* restrict c,
26 size_t cm_stride,
27 size_t cn_stride,
28 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
29 {
30 assert(mr != 0);
31 assert(mr <= 4);
32 assert(nc != 0);
33 assert(kc != 0);
34 assert(kc % sizeof(int8_t) == 0);
35 assert(a != NULL);
36 assert(w != NULL);
37 assert(c != NULL);
38
39 const int8_t* a0 = a;
40 int8_t* c0 = c;
41 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
42 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
43 if XNN_UNPREDICTABLE(mr < 2) {
44 a1 = a0;
45 c1 = c0;
46 }
47 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
48 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
49 if XNN_UNPREDICTABLE(mr <= 2) {
50 a2 = a1;
51 c2 = c1;
52 }
53 const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
54 int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
55 if XNN_UNPREDICTABLE(mr != 4) {
56 a3 = a2;
57 c3 = c2;
58 }
59
60 do {
61 int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
62 int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
63 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
64 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
65 int32x4_t vacc1x0123 = vacc0x0123;
66 int32x4_t vacc1x4567 = vacc0x4567;
67 int32x4_t vacc1x89AB = vacc0x89AB;
68 int32x4_t vacc1xCDEF = vacc0xCDEF;
69 int32x4_t vacc2x0123 = vacc0x0123;
70 int32x4_t vacc2x4567 = vacc0x4567;
71 int32x4_t vacc2x89AB = vacc0x89AB;
72 int32x4_t vacc2xCDEF = vacc0xCDEF;
73 int32x4_t vacc3x0123 = vacc0x0123;
74 int32x4_t vacc3x4567 = vacc0x4567;
75 int32x4_t vacc3x89AB = vacc0x89AB;
76 int32x4_t vacc3xCDEF = vacc0xCDEF;
77
78 size_t k = kc;
79 while (k >= 8 * sizeof(int8_t)) {
80 const int8x8_t va0 = vld1_s8(a0); a0 += 8;
81 const int8x8_t va1 = vld1_s8(a1); a1 += 8;
82 const int8x8_t va2 = vld1_s8(a2); a2 += 8;
83 const int8x8_t va3 = vld1_s8(a3); a3 += 8;
84
85 const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
86
87 const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
88 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
89 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
90 const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
91 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
92 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
93 const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
94 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
95 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
96 const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
97 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
98 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
99 const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
100
101 const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
102 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
103 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
104 const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
105 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
106 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
107 const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
108 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
109 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
110 const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
111 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
112 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
113 const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
114
115 const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
116 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
117 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
118 const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
119 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
120 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
121 const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
122 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
123 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
124 const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
125 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
126 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
127 const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
128
129 const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
130 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
131 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
132 const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
133 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
134 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
135 const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
136 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
137 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
138 const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
139 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
140 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
141 const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
142
143 const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
144 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
145 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
146 const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
147 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
148 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
149 const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
150 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
151 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
152 const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
153 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
154 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
155 const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
156
157 const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
158 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
159 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
160 const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
161 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
162 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
163 const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
164 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
165 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
166 const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
167 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
168 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
169 const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
170
171 const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
172 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
173 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
174 const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
175 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
176 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
177 const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
178 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
179 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
180 const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
181 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
182 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
183 const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
184
185 const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
186 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
187 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
188 const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
189 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
190 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
191 const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
192 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
193 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
194 const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
195 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
196 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
197 const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
198
199 const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
200 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
201 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
202 const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
203 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
204 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
205 const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
206 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
207 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
208 const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
209 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
210 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
211 const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
212
213 const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
214 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
215 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
216 const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
217 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
218 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
219 const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
220 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
221 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
222 const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
223 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
224 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
225 const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
226
227 const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
228 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
229 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
230 const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
231 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
232 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
233 const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
234 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
235 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
236 const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
237 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
238 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
239 const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
240
241 const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
242 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
243 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
244 const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
245 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
246 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
247 const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
248 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
249 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
250 const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
251 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
252 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
253 const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
254
255 const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
256 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
257 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
258 const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
259 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
260 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
261 const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
262 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
263 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
264 const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
265 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
266 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
267 const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
268
269 const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
270 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
271 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
272 const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
273 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
274 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
275 const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
276 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
277 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
278 const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
279 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
280 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
281 const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
282
283 const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
284 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
285 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
286 const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
287 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
288 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
289 const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
290 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
291 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
292 const int16x8_t vprod3x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va3, 7));
293 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c7));
294 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
295 const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
296
297 const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
298 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
299 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
300 const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
301 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
302 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
303 const int16x8_t vprod2x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va2, 7));
304 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
305 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7));
306 const int16x8_t vprod3x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va3, 7));
307 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7));
308 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7));
309
310 k -= 8 * sizeof(int8_t);
311 }
312 if XNN_UNLIKELY(k != 0) {
313 const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
314 const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
315 const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
316 const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
317
318 const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
319 const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
320
321 const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
322 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
323 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
324 const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
325 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
326 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
327 const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
328 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
329 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
330 const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
331 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
332 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
333 const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
334 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
335 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
336 const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
337 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
338 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
339 const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
340 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
341 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
342 const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
343 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
344 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
345
346 if (k >= 2 * sizeof(int8_t)) {
347 const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
348 const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
349
350 const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
351 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
352 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
353 const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
354 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
355 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
356 const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
357 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
358 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
359 const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
360 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
361 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
362 const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
363 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
364 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
365 const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
366 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
367 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
368 const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
369 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
370 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
371 const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
372 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
373 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
374
375 if (k > 2 * sizeof(int8_t)) {
376 const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
377 const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
378
379 const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
380 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
381 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
382 const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
383 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
384 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
385 const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
386 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
387 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
388 const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
389 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
390 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
391 const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
392 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
393 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
394 const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
395 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
396 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
397 const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
398 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
399 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
400 const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
401 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
402 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
403
404 if (k >= 4 * sizeof(int8_t)) {
405 const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
406 const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
407
408 const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
409 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
410 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
411 const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
412 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
413 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
414 const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
415 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
416 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
417 const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
418 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
419 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
420 const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
421 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
422 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
423 const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
424 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
425 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
426 const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
427 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
428 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
429 const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
430 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
431 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
432
433 if (k > 4 * sizeof(int8_t)) {
434 const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
435 const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
436
437 const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
438 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
439 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
440 const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
441 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
442 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
443 const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
444 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
445 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
446 const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
447 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
448 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
449 const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
450 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
451 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
452 const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
453 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
454 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
455 const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
456 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
457 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
458 const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
459 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
460 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
461
462 if (k >= 6 * sizeof(int8_t)) {
463 const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
464 const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
465
466 const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
467 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
468 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
469 const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
470 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
471 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
472 const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
473 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
474 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
475 const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
476 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
477 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
478 const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
479 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
480 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
481 const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
482 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
483 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
484 const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
485 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
486 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
487 const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
488 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
489 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
490
491 if (k > 6 * sizeof(int8_t)) {
492 const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
493 const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
494
495 const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
496 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
497 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
498 const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
499 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
500 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
501 const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
502 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
503 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
504 const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
505 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
506 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
507 const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
508 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
509 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
510 const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
511 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
512 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
513 const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
514 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
515 vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
516 const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
517 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
518 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
519 }
520 }
521 }
522 }
523 }
524 }
525 }
526
527 // Post-accumulation work
528 const int32x4_t vright_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_pre_shift);
529 const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier);
530 const int32x4_t vright_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_post_shift);
531
532 vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
533 vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
534 vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
535 vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
536 vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
537 vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
538 vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift);
539 vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift);
540 vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift);
541 vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift);
542 vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift);
543 vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift);
544 vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift);
545 vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift);
546 vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift);
547 vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift);
548
549 vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
550 vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
551 vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
552 vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
553 vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
554 vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
555 vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier);
556 vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier);
557 vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
558 vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);
559 vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier);
560 vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier);
561 vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier);
562 vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier);
563 vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier);
564 vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier);
565
566 vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
567 vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
568 vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
569 vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
570 vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
571 vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
572 vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift);
573 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift);
574 vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
575 vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);
576 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift);
577 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift);
578 vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift);
579 vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift);
580 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift);
581 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift);
582
583 const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point);
584 #if XNN_ARCH_ARM64
585 const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
586 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
587 const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
588 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
589 const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
590 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
591 const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
592 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
593
594 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
595 int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
596 int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
597 int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
598 #else
599 const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
600 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
601 const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
602 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
603 const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
604 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
605 const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
606 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
607
608 int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
609 int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
610 int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
611 int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
612 #endif
613 const int8x16_t voutput_min = vld1q_dup_s8(¶ms->rndnu_neon.output_min);
614 const int8x16_t voutput_max = vld1q_dup_s8(¶ms->rndnu_neon.output_max);
615
616 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
617 vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
618 vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
619 vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
620
621 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
622 vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
623 vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
624 vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
625
626 if (nc >= 16) {
// Main case where all 16 columns fit in the destination.
628 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
629 vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
630 vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
631 vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
632
633 // Advance to the next 16 columns.
634 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
635 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
636 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
637 c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
638
639 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
640 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
641 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
642 a3 = (const int8_t*) ((uintptr_t) a3 - kc);
643
644 nc -= 16;
645 } else {
646 // Final case where not all of the 16 columns fit in the destination.
647 int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
648 int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
649 if (nc & 8) {
650 vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
651 vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
652 vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
653 vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
654 vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
655 vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
656 }
657 if (nc & 4) {
658 vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
659 vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
660 vst1q_lane_u32((void*) c2, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
661 vst1q_lane_u32((void*) c3, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
662 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
663 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
664 }
665 if (nc & 2) {
666 vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
667 vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
668 vst1q_lane_u16((void*) c2, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
669 vst1q_lane_u16((void*) c3, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
670 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
671 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
672 }
673 if (nc & 1) {
674 vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
675 vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
676 vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
677 vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
678 }
679
680 nc = 0;
681 }
682 } while (nc != 0);
683 }
684