// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/common.h>
#include <xnnpack/igemm.h>


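// Q8 (quantized uint8) indirect GEMM micro-kernel with an 8x8 output tile,
// implemented with NEON intrinsics. A summary inferred from the code below,
// per XNNPACK convention: `a` is an indirection buffer of input-row pointers
// (8 per outer iteration, `ks` bytes total), `w` packs the per-channel int32
// bias followed by uint8 weights, and `c` receives up to 8 rows (`mr`) by
// 8 columns (`nc`) of requantized uint8 output.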
void xnn_q8_igemm_ukernel_8x8__neon(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_q8_gemm_params params[restrict static 1])
{
  assert(mr != 0);
  assert(mr <= 8);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (8 * sizeof(void*)) == 0);

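  // Set up one output-row pointer per tile row. Rows beyond `mr` alias the
  // previous row, so they compute into already-valid memory and the stores
  // below can stay unconditional.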
  uint8_t* c0 = c;
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }
  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 4) {
    c3 = c2;
  }
  uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 4) {
    c4 = c3;
  }
  uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 6) {
    c5 = c4;
  }
  uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 6) {
    c6 = c5;
  }
  uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 8) {
    c7 = c6;
  }

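  // Broadcast the kernel (weight) zero point; weights are widened to int16
  // relative to this value before multiplication.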
  const uint8x8_t vb_zero_point = vld1_dup_u8((const uint8_t*) &params->neon.kernel_zero_point);

  do {
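    // Initialize all 16 accumulators (8 rows x two 4-column halves) from the
    // packed per-channel int32 bias at the start of `w`.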
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (void*) ((uintptr_t) w + sizeof(int32x4_t));
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (void*) ((uintptr_t) w + sizeof(int32x4_t));
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;
    int32x4_t vacc3x0123 = vacc0x0123;
    int32x4_t vacc3x4567 = vacc0x4567;
    int32x4_t vacc4x0123 = vacc0x0123;
    int32x4_t vacc4x4567 = vacc0x4567;
    int32x4_t vacc5x0123 = vacc0x0123;
    int32x4_t vacc5x4567 = vacc0x4567;
    int32x4_t vacc6x0123 = vacc0x0123;
    int32x4_t vacc6x4567 = vacc0x4567;
    int32x4_t vacc7x0123 = vacc0x0123;
    int32x4_t vacc7x4567 = vacc0x4567;

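    // Iterate over the indirection buffer: each pass consumes 8 input-row
    // pointers and accumulates one kc-long slice of the dot product.
    // Pointers equal to `zero` (the zero buffer) skip the `a_offset` bias.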
    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      const uint8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
      }
      const uint8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
      }
      const uint8_t* restrict a3 = a[3];
      if XNN_UNPREDICTABLE(a3 != zero) {
        a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
      }
      const uint8_t* restrict a4 = a[4];
      if XNN_UNPREDICTABLE(a4 != zero) {
        a4 = (const uint8_t*) ((uintptr_t) a4 + a_offset);
      }
      const uint8_t* restrict a5 = a[5];
      if XNN_UNPREDICTABLE(a5 != zero) {
        a5 = (const uint8_t*) ((uintptr_t) a5 + a_offset);
      }
      const uint8_t* restrict a6 = a[6];
      if XNN_UNPREDICTABLE(a6 != zero) {
        a6 = (const uint8_t*) ((uintptr_t) a6 + a_offset);
      }
      const uint8_t* restrict a7 = a[7];
      if XNN_UNPREDICTABLE(a7 != zero) {
        a7 = (const uint8_t*) ((uintptr_t) a7 + a_offset);
      }
      a += 8;

      size_t k = kc;
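      // Main loop: process the reduction dimension 8 elements at a time.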
      while (k >= 8 * sizeof(uint8_t)) {
        const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
        const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
        const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
        const uint8x8_t va3 = vld1_u8(a3); a3 += 8;
        const uint8x8_t va4 = vld1_u8(a4); a4 += 8;
        const uint8x8_t va5 = vld1_u8(a5); a5 += 8;
        const uint8x8_t va6 = vld1_u8(a6); a6 += 8;
        const uint8x8_t va7 = vld1_u8(a7); a7 += 8;
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
        const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
        const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
        const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
        const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4));
        const int16x8_t vxa5 = vreinterpretq_s16_u16(vmovl_u8(va5));
        const int16x8_t vxa6 = vreinterpretq_s16_u16(vmovl_u8(va6));
        const int16x8_t vxa7 = vreinterpretq_s16_u16(vmovl_u8(va7));

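        // Eight blocks follow, one per position k+0..k+7: each loads the next
        // 8 weights, widens them relative to the zero point, and multiply-
        // accumulates them against the matching activation lane of every row.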
        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
        }

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);
        }

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);
        }

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);
        }

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 0);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 0);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 0);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 0);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 0);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 0);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 0);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 0);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 0);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 0);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 0);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 0);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 0);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 0);
        }

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 1);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 1);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 1);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 1);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 1);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 1);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 1);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 1);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 1);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 1);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 1);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 1);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 1);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 1);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 1);
        }

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 2);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 2);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 2);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 2);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 2);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 2);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 2);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 2);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 2);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 2);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 2);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 2);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 2);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 2);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 2);
        }

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 3);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 3);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 3);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 3);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 3);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 3);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 3);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 3);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 3);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 3);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 3);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 3);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 3);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 3);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 3);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 3);
        }

        k -= 8 * sizeof(uint8_t);
      }
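      // Remainder: 1..7 leftover elements, handled by a nested chain of the
      // same per-lane blocks. The full 8-byte activation loads here rely on
      // readable padding past the end of each row (an XNNPACK convention).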
      if (k != 0) {
        const uint8x8_t va0 = vld1_u8(a0);
        const uint8x8_t va1 = vld1_u8(a1);
        const uint8x8_t va2 = vld1_u8(a2);
        const uint8x8_t va3 = vld1_u8(a3);
        const uint8x8_t va4 = vld1_u8(a4);
        const uint8x8_t va5 = vld1_u8(a5);
        const uint8x8_t va6 = vld1_u8(a6);
        const uint8x8_t va7 = vld1_u8(a7);
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
        const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
        const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
        const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
        const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4));
        const int16x8_t vxa5 = vreinterpretq_s16_u16(vmovl_u8(va5));
        const int16x8_t vxa6 = vreinterpretq_s16_u16(vmovl_u8(va6));
        const int16x8_t vxa7 = vreinterpretq_s16_u16(vmovl_u8(va7));

        {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
        }

        if (k >= 2 * sizeof(uint8_t)) {
          const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
          const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
          vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
          vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
          vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
          vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
          vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
          vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
          vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
          vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);

          if (k > 2 * sizeof(uint8_t)) {
            const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
            const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
            vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
            vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
            vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
            vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
            vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
            vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
            vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
            vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
            vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
            vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);

            if (k >= 4 * sizeof(uint8_t)) {
              const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
              const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
              vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
              vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
              vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
              vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
              vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
              vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
              vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
              vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
              vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
              vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);

              if (k > 4 * sizeof(uint8_t)) {
                const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
                const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 0);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 0);
                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0);
                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 0);
                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 0);
                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 0);
                vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 0);
                vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0);
                vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 0);
                vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 0);
                vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 0);
                vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 0);
                vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 0);
                vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 0);
                vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 0);
                vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 0);

                if (k >= 6 * sizeof(uint8_t)) {
                  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
                  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 1);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 1);
                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 1);
                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 1);
                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 1);
                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 1);
                  vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 1);
                  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1);
                  vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 1);
                  vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 1);
                  vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 1);
                  vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 1);
                  vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 1);
                  vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 1);
                  vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 1);
                  vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 1);

                  if (k > 6 * sizeof(uint8_t)) {
                    const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
                    const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));

                    vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 2);
                    vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 2);
                    vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 2);
                    vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 2);
                    vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 2);
                    vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 2);
                    vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 2);
                    vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2);
                    vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 2);
                    vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 2);
                    vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 2);
                    vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 2);
                    vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 2);
                    vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 2);
                    vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 2);
                    vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 2);
                  }
                }
              }
            }
          }
        }
      }
      p -= 8 * sizeof(void*);
    } while (p != 0);

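    // Requantize: saturating rounding doubling multiply returning the high
    // 32 bits of the product with the Q31 fixed-point multiplier.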
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
    vacc4x0123 = vqrdmulhq_s32(vacc4x0123, vmultiplier);
    vacc4x4567 = vqrdmulhq_s32(vacc4x4567, vmultiplier);
    vacc5x0123 = vqrdmulhq_s32(vacc5x0123, vmultiplier);
    vacc5x4567 = vqrdmulhq_s32(vacc5x4567, vmultiplier);
    vacc6x0123 = vqrdmulhq_s32(vacc6x0123, vmultiplier);
    vacc6x4567 = vqrdmulhq_s32(vacc6x4567, vmultiplier);
    vacc7x0123 = vqrdmulhq_s32(vacc7x0123, vmultiplier);
    vacc7x4567 = vqrdmulhq_s32(vacc7x4567, vmultiplier);

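    // Pre-bias negative lanes by their sign bit so the rounding right shift
    // below matches the reference requantization; vbic suppresses the fix-up
    // in lanes where the shift amount is zero.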
    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
    vacc4x0123 = vsraq_n_s32(vacc4x0123, vbicq_s32(vacc4x0123, vzero_shift_mask), 31);
    vacc4x4567 = vsraq_n_s32(vacc4x4567, vbicq_s32(vacc4x4567, vzero_shift_mask), 31);
    vacc5x0123 = vsraq_n_s32(vacc5x0123, vbicq_s32(vacc5x0123, vzero_shift_mask), 31);
    vacc5x4567 = vsraq_n_s32(vacc5x4567, vbicq_s32(vacc5x4567, vzero_shift_mask), 31);
    vacc6x0123 = vsraq_n_s32(vacc6x0123, vbicq_s32(vacc6x0123, vzero_shift_mask), 31);
    vacc6x4567 = vsraq_n_s32(vacc6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31);
    vacc7x0123 = vsraq_n_s32(vacc7x0123, vbicq_s32(vacc7x0123, vzero_shift_mask), 31);
    vacc7x4567 = vsraq_n_s32(vacc7x4567, vbicq_s32(vacc7x4567, vzero_shift_mask), 31);

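    // Rounding arithmetic right shift (vrshlq with a negative shift count).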
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
    vacc4x0123 = vrshlq_s32(vacc4x0123, vright_shift);
    vacc4x4567 = vrshlq_s32(vacc4x4567, vright_shift);
    vacc5x0123 = vrshlq_s32(vacc5x0123, vright_shift);
    vacc5x4567 = vrshlq_s32(vacc5x4567, vright_shift);
    vacc6x0123 = vrshlq_s32(vacc6x0123, vright_shift);
    vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift);
    vacc7x0123 = vrshlq_s32(vacc7x0123, vright_shift);
    vacc7x4567 = vrshlq_s32(vacc7x4567, vright_shift);

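    // Narrow int32 -> int16 with saturation and add the output zero point;
    // AArch64 uses the *_high forms to avoid extra vector moves.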
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
#if XNN_ARCH_ARM64
    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
    const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
    const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
    const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
    const int16x8_t vacc7x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);

    uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
    uint8x16_t vout2x01234567_3x01234567 = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
    uint8x16_t vout4x01234567_5x01234567 = vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x01234567);
    uint8x16_t vout6x01234567_7x01234567 = vqmovun_high_s16(vqmovun_s16(vacc6x01234567), vacc7x01234567);
#else
    const int16x8_t vacc0x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
    const int16x8_t vacc1x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
    const int16x8_t vacc2x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
    const int16x8_t vacc3x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
    const int16x8_t vacc4x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
    const int16x8_t vacc5x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);
    const int16x8_t vacc6x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_point);
    const int16x8_t vacc7x01234567 =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567)), voutput_zero_point);

    uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
    uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
    uint8x16_t vout4x01234567_5x01234567 = vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x01234567));
    uint8x16_t vout6x01234567_7x01234567 = vcombine_u8(vqmovun_s16(vacc6x01234567), vqmovun_s16(vacc7x01234567));
#endif
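    // Clamp to the output activation range.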
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);

    vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
    vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
    vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
    vout6x01234567_7x01234567 = vmaxq_u8(vout6x01234567_7x01234567, voutput_min);
    vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
    vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
    vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
    vout6x01234567_7x01234567 = vminq_u8(vout6x01234567_7x01234567, voutput_max);

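    // Full-width store: write all 8 columns per (possibly aliased) row, then
    // rewind `a` by `ks` bytes so the same indirection entries are reused for
    // the next 8-column tile.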
    if XNN_LIKELY(nc >= 8) {
      vst1_u8(c7, vget_high_u8(vout6x01234567_7x01234567)); c7 += cn_stride;
      vst1_u8(c6, vget_low_u8(vout6x01234567_7x01234567)); c6 += cn_stride;
      vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567)); c5 += cn_stride;
      vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567)); c4 += cn_stride;
      vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += cn_stride;
      vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += cn_stride;
      vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += cn_stride;
      vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += cn_stride;

      a = (const uint8_t**restrict) ((uintptr_t) a - ks);

      nc -= 8;
    } else {
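      // Partial store for the final nc < 8 columns: emit 4-, 2-, then 1-byte
      // pieces, rotating each vector with vext after every piece.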
      if (nc & 4) {
        vst1q_lane_u32(__builtin_assume_aligned(c7, 1), vreinterpretq_u32_u8(vout6x01234567_7x01234567), 2); c7 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c6, 1), vreinterpretq_u32_u8(vout6x01234567_7x01234567), 0); c6 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 2); c5 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 0); c4 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
        vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
        vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
        vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1q_lane_u16(__builtin_assume_aligned(c7, 1), vreinterpretq_u16_u8(vout6x01234567_7x01234567), 4); c7 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c6, 1), vreinterpretq_u16_u8(vout6x01234567_7x01234567), 0); c6 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 4); c5 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 0); c4 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
        vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
        vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
        vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1q_lane_u8(c7, vout6x01234567_7x01234567, 8);
        vst1q_lane_u8(c6, vout6x01234567_7x01234567, 0);
        vst1q_lane_u8(c5, vout4x01234567_5x01234567, 8);
        vst1q_lane_u8(c4, vout4x01234567_5x01234567, 0);
        vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
        vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
        vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
        vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}
657