• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <assert.h>
10 
11 #include <arm_neon.h>
12 
13 #include <xnnpack/common.h>
14 #include <xnnpack/igemm.h>
15 
16 
17 void xnn_q8_igemm_ukernel_8x8__neon(
18     size_t mr,
19     size_t nc,
20     size_t kc,
21     size_t ks,
22     const uint8_t** restrict a,
23     const void* restrict w,
24     uint8_t* restrict c,
25     size_t cm_stride,
26     size_t cn_stride,
27     size_t a_offset,
28     const uint8_t* zero,
29     const union xnn_q8_gemm_params params[restrict static 1])
30 {
31   assert(mr != 0);
32   assert(mr <= 8);
33   assert(nc != 0);
34   assert(kc != 0);
35   assert(ks != 0);
36   assert(ks % (8 * sizeof(void*)) == 0);
37 
38   uint8_t* c0 = c;
39   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
40   if XNN_UNPREDICTABLE(mr < 2) {
41     c1 = c0;
42   }
43   uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
44   if XNN_UNPREDICTABLE(mr <= 2) {
45     c2 = c1;
46   }
47   uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
48   if XNN_UNPREDICTABLE(mr < 4) {
49     c3 = c2;
50   }
51   uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride);
52   if XNN_UNPREDICTABLE(mr <= 4) {
53     c4 = c3;
54   }
55   uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride);
56   if XNN_UNPREDICTABLE(mr < 6) {
57     c5 = c4;
58   }
59   uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride);
60   if XNN_UNPREDICTABLE(mr <= 6) {
61     c6 = c5;
62   }
63   uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride);
64   if XNN_UNPREDICTABLE(mr != 8) {
65     c7 = c6;
66   }
67 
68   const uint8x8_t vb_zero_point = vld1_dup_u8((const uint8_t*) &params->neon.kernel_zero_point);
69 
70   do {
71     int32x4_t vacc0x0123 = vld1q_s32(w); w = (void*) ((uintptr_t) w + sizeof(int32x4_t));
72     int32x4_t vacc0x4567 = vld1q_s32(w); w = (void*) ((uintptr_t) w + sizeof(int32x4_t));
73     int32x4_t vacc1x0123 = vacc0x0123;
74     int32x4_t vacc1x4567 = vacc0x4567;
75     int32x4_t vacc2x0123 = vacc0x0123;
76     int32x4_t vacc2x4567 = vacc0x4567;
77     int32x4_t vacc3x0123 = vacc0x0123;
78     int32x4_t vacc3x4567 = vacc0x4567;
79     int32x4_t vacc4x0123 = vacc0x0123;
80     int32x4_t vacc4x4567 = vacc0x4567;
81     int32x4_t vacc5x0123 = vacc0x0123;
82     int32x4_t vacc5x4567 = vacc0x4567;
83     int32x4_t vacc6x0123 = vacc0x0123;
84     int32x4_t vacc6x4567 = vacc0x4567;
85     int32x4_t vacc7x0123 = vacc0x0123;
86     int32x4_t vacc7x4567 = vacc0x4567;
87 
88     size_t p = ks;
89     do {
90       const uint8_t* restrict a0 = a[0];
91       if XNN_UNPREDICTABLE(a0 != zero) {
92         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
93       }
94       const uint8_t* restrict a1 = a[1];
95       if XNN_UNPREDICTABLE(a1 != zero) {
96         a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
97       }
98       const uint8_t* restrict a2 = a[2];
99       if XNN_UNPREDICTABLE(a2 != zero) {
100         a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
101       }
102       const uint8_t* restrict a3 = a[3];
103       if XNN_UNPREDICTABLE(a3 != zero) {
104         a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
105       }
106       const uint8_t* restrict a4 = a[4];
107       if XNN_UNPREDICTABLE(a4 != zero) {
108         a4 = (const uint8_t*) ((uintptr_t) a4 + a_offset);
109       }
110       const uint8_t* restrict a5 = a[5];
111       if XNN_UNPREDICTABLE(a5 != zero) {
112         a5 = (const uint8_t*) ((uintptr_t) a5 + a_offset);
113       }
114       const uint8_t* restrict a6 = a[6];
115       if XNN_UNPREDICTABLE(a6 != zero) {
116         a6 = (const uint8_t*) ((uintptr_t) a6 + a_offset);
117       }
118       const uint8_t* restrict a7 = a[7];
119       if XNN_UNPREDICTABLE(a7 != zero) {
120         a7 = (const uint8_t*) ((uintptr_t) a7 + a_offset);
121       }
122       a += 8;
123 
124       size_t k = kc;
125       while (k >= 8 * sizeof(uint8_t)) {
126         const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
127         const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
128         const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
129         const uint8x8_t va3 = vld1_u8(a3); a3 += 8;
130         const uint8x8_t va4 = vld1_u8(a4); a4 += 8;
131         const uint8x8_t va5 = vld1_u8(a5); a5 += 8;
132         const uint8x8_t va6 = vld1_u8(a6); a6 += 8;
133         const uint8x8_t va7 = vld1_u8(a7); a7 += 8;
134         const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
135         const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
136         const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
137         const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
138         const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4));
139         const int16x8_t vxa5 = vreinterpretq_s16_u16(vmovl_u8(va5));
140         const int16x8_t vxa6 = vreinterpretq_s16_u16(vmovl_u8(va6));
141         const int16x8_t vxa7 = vreinterpretq_s16_u16(vmovl_u8(va7));
142 
143         {
144           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
145           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
146 
147           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
148           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
149           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
150           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
151           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
152           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
153           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
154           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
155           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
156           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
157           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
158           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
159           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
160           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
161           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
162           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
163         }
164 
165         {
166           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
167           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
168 
169           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
170           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
171           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
172           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
173           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
174           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
175           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
176           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
177           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
178           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
179           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
180           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
181           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
182           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
183           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
184           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);
185         }
186 
187         {
188           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
189           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
190 
191           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
192           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
193           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
194           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
195           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
196           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
197           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
198           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
199           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
200           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
201           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
202           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
203           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
204           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
205           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
206           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);
207         }
208 
209         {
210           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
211           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
212 
213           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
214           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
215           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
216           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
217           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
218           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
219           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
220           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
221           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
222           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
223           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
224           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
225           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
226           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
227           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
228           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);
229         }
230 
231         {
232           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
233           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
234 
235           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 0);
236           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 0);
237           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0);
238           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 0);
239           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 0);
240           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 0);
241           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 0);
242           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0);
243           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 0);
244           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 0);
245           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 0);
246           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 0);
247           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 0);
248           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 0);
249           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 0);
250           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 0);
251         }
252 
253         {
254           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
255           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
256 
257           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 1);
258           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 1);
259           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 1);
260           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 1);
261           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 1);
262           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 1);
263           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 1);
264           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1);
265           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 1);
266           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 1);
267           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 1);
268           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 1);
269           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 1);
270           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 1);
271           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 1);
272           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 1);
273         }
274 
275         {
276           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
277           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
278 
279           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 2);
280           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 2);
281           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 2);
282           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 2);
283           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 2);
284           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 2);
285           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 2);
286           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2);
287           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 2);
288           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 2);
289           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 2);
290           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 2);
291           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 2);
292           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 2);
293           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 2);
294           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 2);
295         }
296 
297         {
298           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
299           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
300 
301           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 3);
302           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 3);
303           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 3);
304           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 3);
305           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 3);
306           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 3);
307           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 3);
308           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 3);
309           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 3);
310           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 3);
311           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 3);
312           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 3);
313           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 3);
314           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 3);
315           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 3);
316           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 3);
317         }
318 
319         k -= 8 * sizeof(uint8_t);
320       }
321       if (k != 0) {
322         const uint8x8_t va0 = vld1_u8(a0);
323         const uint8x8_t va1 = vld1_u8(a1);
324         const uint8x8_t va2 = vld1_u8(a2);
325         const uint8x8_t va3 = vld1_u8(a3);
326         const uint8x8_t va4 = vld1_u8(a4);
327         const uint8x8_t va5 = vld1_u8(a5);
328         const uint8x8_t va6 = vld1_u8(a6);
329         const uint8x8_t va7 = vld1_u8(a7);
330         const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
331         const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
332         const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
333         const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
334         const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4));
335         const int16x8_t vxa5 = vreinterpretq_s16_u16(vmovl_u8(va5));
336         const int16x8_t vxa6 = vreinterpretq_s16_u16(vmovl_u8(va6));
337         const int16x8_t vxa7 = vreinterpretq_s16_u16(vmovl_u8(va7));
338 
339         {
340           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
341           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
342 
343           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
344           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
345           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
346           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
347           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
348           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
349           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
350           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
351           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
352           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
353           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
354           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
355           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
356           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
357           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
358           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
359         }
360 
361         if (k >= 2 * sizeof(uint8_t)) {
362           const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
363           const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
364 
365           vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
366           vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
367           vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
368           vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
369           vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
370           vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
371           vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
372           vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
373           vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
374           vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
375           vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
376           vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
377           vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
378           vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
379           vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
380           vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);
381 
382           if (k > 2 * sizeof(uint8_t)) {
383             const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
384             const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
385 
386             vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
387             vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
388             vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
389             vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
390             vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
391             vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
392             vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
393             vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
394             vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
395             vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
396             vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
397             vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
398             vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
399             vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
400             vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
401             vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);
402 
403             if (k >= 4 * sizeof(uint8_t)) {
404               const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
405               const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
406 
407               vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
408               vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
409               vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
410               vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
411               vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
412               vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
413               vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
414               vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
415               vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
416               vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
417               vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
418               vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
419               vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
420               vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
421               vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
422               vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);
423 
424               if (k > 4 * sizeof(uint8_t)) {
425                 const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
426                 const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
427 
428                 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 0);
429                 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 0);
430                 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0);
431                 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 0);
432                 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 0);
433                 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 0);
434                 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 0);
435                 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0);
436                 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 0);
437                 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 0);
438                 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 0);
439                 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 0);
440                 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 0);
441                 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 0);
442                 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 0);
443                 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 0);
444 
445                 if (k >= 6 * sizeof(uint8_t)) {
446                   const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
447                   const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
448 
449                   vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 1);
450                   vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 1);
451                   vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 1);
452                   vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 1);
453                   vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 1);
454                   vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 1);
455                   vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 1);
456                   vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1);
457                   vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 1);
458                   vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 1);
459                   vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 1);
460                   vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 1);
461                   vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 1);
462                   vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 1);
463                   vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 1);
464                   vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 1);
465 
466                   if (k > 6 * sizeof(uint8_t)) {
467                     const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));
468                     const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
469 
470                     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 2);
471                     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 2);
472                     vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 2);
473                     vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 2);
474                     vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 2);
475                     vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 2);
476                     vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 2);
477                     vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2);
478                     vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 2);
479                     vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 2);
480                     vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 2);
481                     vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 2);
482                     vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 2);
483                     vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 2);
484                     vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 2);
485                     vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 2);
486                   }
487                 }
488               }
489             }
490           }
491         }
492       }
493       p -= 8 * sizeof(void*);
494     } while (p != 0);
495 
496     const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
497     vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
498     vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
499     vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
500     vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
501     vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
502     vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
503     vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
504     vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
505     vacc4x0123 = vqrdmulhq_s32(vacc4x0123, vmultiplier);
506     vacc4x4567 = vqrdmulhq_s32(vacc4x4567, vmultiplier);
507     vacc5x0123 = vqrdmulhq_s32(vacc5x0123, vmultiplier);
508     vacc5x4567 = vqrdmulhq_s32(vacc5x4567, vmultiplier);
509     vacc6x0123 = vqrdmulhq_s32(vacc6x0123, vmultiplier);
510     vacc6x4567 = vqrdmulhq_s32(vacc6x4567, vmultiplier);
511     vacc7x0123 = vqrdmulhq_s32(vacc7x0123, vmultiplier);
512     vacc7x4567 = vqrdmulhq_s32(vacc7x4567, vmultiplier);
513 
514     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
515     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
516     vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
517     vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
518     vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
519     vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
520     vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
521     vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
522     vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
523     vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
524     vacc4x0123 = vsraq_n_s32(vacc4x0123, vbicq_s32(vacc4x0123, vzero_shift_mask), 31);
525     vacc4x4567 = vsraq_n_s32(vacc4x4567, vbicq_s32(vacc4x4567, vzero_shift_mask), 31);
526     vacc5x0123 = vsraq_n_s32(vacc5x0123, vbicq_s32(vacc5x0123, vzero_shift_mask), 31);
527     vacc5x4567 = vsraq_n_s32(vacc5x4567, vbicq_s32(vacc5x4567, vzero_shift_mask), 31);
528     vacc6x0123 = vsraq_n_s32(vacc6x0123, vbicq_s32(vacc6x0123, vzero_shift_mask), 31);
529     vacc6x4567 = vsraq_n_s32(vacc6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31);
530     vacc7x0123 = vsraq_n_s32(vacc7x0123, vbicq_s32(vacc7x0123, vzero_shift_mask), 31);
531     vacc7x4567 = vsraq_n_s32(vacc7x4567, vbicq_s32(vacc7x4567, vzero_shift_mask), 31);
532 
533     vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
534     vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
535     vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
536     vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
537     vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
538     vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
539     vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
540     vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
541     vacc4x0123 = vrshlq_s32(vacc4x0123, vright_shift);
542     vacc4x4567 = vrshlq_s32(vacc4x4567, vright_shift);
543     vacc5x0123 = vrshlq_s32(vacc5x0123, vright_shift);
544     vacc5x4567 = vrshlq_s32(vacc5x4567, vright_shift);
545     vacc6x0123 = vrshlq_s32(vacc6x0123, vright_shift);
546     vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift);
547     vacc7x0123 = vrshlq_s32(vacc7x0123, vright_shift);
548     vacc7x4567 = vrshlq_s32(vacc7x4567, vright_shift);
549 
550     const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
551   #if XNN_ARCH_ARM64
552     const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
553     const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
554     const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
555     const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
556     const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
557     const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
558     const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
559     const int16x8_t vacc7x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
560 
561     uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
562     uint8x16_t vout2x01234567_3x01234567 = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
563     uint8x16_t vout4x01234567_5x01234567 = vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x01234567);
564     uint8x16_t vout6x01234567_7x01234567 = vqmovun_high_s16(vqmovun_s16(vacc6x01234567), vacc7x01234567);
565   #else
566     const int16x8_t vacc0x01234567 =
567       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
568     const int16x8_t vacc1x01234567 =
569       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
570     const int16x8_t vacc2x01234567 =
571       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
572     const int16x8_t vacc3x01234567 =
573       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
574     const int16x8_t vacc4x01234567 =
575       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
576     const int16x8_t vacc5x01234567 =
577       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);
578     const int16x8_t vacc6x01234567 =
579       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_point);
580     const int16x8_t vacc7x01234567 =
581       vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567)), voutput_zero_point);
582 
583     uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
584     uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
585     uint8x16_t vout4x01234567_5x01234567 = vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x01234567));
586     uint8x16_t vout6x01234567_7x01234567 = vcombine_u8(vqmovun_s16(vacc6x01234567), vqmovun_s16(vacc7x01234567));
587   #endif
588     const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
589     const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);
590 
591     vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
592     vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
593     vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
594     vout6x01234567_7x01234567 = vmaxq_u8(vout6x01234567_7x01234567, voutput_min);
595     vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
596     vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
597     vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
598     vout6x01234567_7x01234567 = vminq_u8(vout6x01234567_7x01234567, voutput_max);
599 
600     if XNN_LIKELY(nc >= 8) {
601       vst1_u8(c7, vget_high_u8(vout6x01234567_7x01234567)); c7 += cn_stride;
602       vst1_u8(c6,  vget_low_u8(vout6x01234567_7x01234567)); c6 += cn_stride;
603       vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567)); c5 += cn_stride;
604       vst1_u8(c4,  vget_low_u8(vout4x01234567_5x01234567)); c4 += cn_stride;
605       vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += cn_stride;
606       vst1_u8(c2,  vget_low_u8(vout2x01234567_3x01234567)); c2 += cn_stride;
607       vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += cn_stride;
608       vst1_u8(c0,  vget_low_u8(vout0x01234567_1x01234567)); c0 += cn_stride;
609 
610       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
611 
612       nc -= 8;
613     } else {
614       if (nc & 4) {
615         vst1q_lane_u32(__builtin_assume_aligned(c7, 1), vreinterpretq_u32_u8(vout6x01234567_7x01234567), 2); c7 += 4;
616         vst1q_lane_u32(__builtin_assume_aligned(c6, 1), vreinterpretq_u32_u8(vout6x01234567_7x01234567), 0); c6 += 4;
617         vst1q_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 2); c5 += 4;
618         vst1q_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 0); c4 += 4;
619         vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
620         vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
621         vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
622         vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
623         vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
624         vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
625         vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
626         vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
627       }
628       if (nc & 2) {
629         vst1q_lane_u16(__builtin_assume_aligned(c7, 1), vreinterpretq_u16_u8(vout6x01234567_7x01234567), 4); c7 += 2;
630         vst1q_lane_u16(__builtin_assume_aligned(c6, 1), vreinterpretq_u16_u8(vout6x01234567_7x01234567), 0); c6 += 2;
631         vst1q_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 4); c5 += 2;
632         vst1q_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 0); c4 += 2;
633         vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
634         vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
635         vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
636         vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
637         vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
638         vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
639         vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
640         vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
641       }
642       if (nc & 1) {
643         vst1q_lane_u8(c7, vout6x01234567_7x01234567, 8);
644         vst1q_lane_u8(c6, vout6x01234567_7x01234567, 0);
645         vst1q_lane_u8(c5, vout4x01234567_5x01234567, 8);
646         vst1q_lane_u8(c4, vout4x01234567_5x01234567, 0);
647         vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
648         vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
649         vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
650         vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
651       }
652 
653       nc = 0;
654     }
655   } while (nc != 0);
656 }
657