1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <assert.h>
10
11 #include <arm_neon.h>
12
13 #include <xnnpack/common.h>
14 #include <xnnpack/gemm.h>
15
16
xnn_q8_gemm_ukernel_8x8__neon(size_t mr,size_t nc,size_t kc,const uint8_t * restrict a,size_t a_stride,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_q8_gemm_params params[restrict static1])17 void xnn_q8_gemm_ukernel_8x8__neon(
18 size_t mr,
19 size_t nc,
20 size_t kc,
21 const uint8_t* restrict a,
22 size_t a_stride,
23 const void* restrict w,
24 uint8_t* restrict c,
25 size_t cm_stride,
26 size_t cn_stride,
27 const union xnn_q8_gemm_params params[restrict static 1])
28 {
29 assert(mr != 0);
30 assert(mr <= 8);
31 assert(nc != 0);
32 assert(kc != 0);
33
34 const uint8_t* a0 = a;
35 uint8_t* c0 = c;
36 const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
37 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
38 if XNN_UNPREDICTABLE(mr < 2) {
39 a1 = a0;
40 c1 = c0;
41 }
42 const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
43 uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
44 if XNN_UNPREDICTABLE(mr <= 2) {
45 a2 = a1;
46 c2 = c1;
47 }
48 const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
49 uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
50 if XNN_UNPREDICTABLE(mr < 4) {
51 a3 = a2;
52 c3 = c2;
53 }
54 const uint8_t* a4 = (const uint8_t*) ((uintptr_t) a3 + a_stride);
55 uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride);
56 if XNN_UNPREDICTABLE(mr <= 4) {
57 a4 = a3;
58 c4 = c3;
59 }
60 const uint8_t* a5 = (const uint8_t*) ((uintptr_t) a4 + a_stride);
61 uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride);
62 if XNN_UNPREDICTABLE(mr < 6) {
63 a5 = a4;
64 c5 = c4;
65 }
66 const uint8_t* a6 = (const uint8_t*) ((uintptr_t) a5 + a_stride);
67 uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride);
68 if XNN_UNPREDICTABLE(mr <= 6) {
69 a6 = a5;
70 c6 = c5;
71 }
72 const uint8_t* a7 = (const uint8_t*) ((uintptr_t) a6 + a_stride);
73 uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride);
74 if XNN_UNPREDICTABLE(mr != 8) {
75 a7 = a6;
76 c7 = c6;
77 }
78
79 const uint8x8_t vb_zero_point = vld1_dup_u8((const uint8_t*) ¶ms->neon.kernel_zero_point);
80
81 do {
82 int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 16);
83 int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 16);
84 int32x4_t vacc1x0123 = vacc0x0123;
85 int32x4_t vacc1x4567 = vacc0x4567;
86 int32x4_t vacc2x0123 = vacc0x0123;
87 int32x4_t vacc2x4567 = vacc0x4567;
88 int32x4_t vacc3x0123 = vacc0x0123;
89 int32x4_t vacc3x4567 = vacc0x4567;
90 int32x4_t vacc4x0123 = vacc0x0123;
91 int32x4_t vacc4x4567 = vacc0x4567;
92 int32x4_t vacc5x0123 = vacc0x0123;
93 int32x4_t vacc5x4567 = vacc0x4567;
94 int32x4_t vacc6x0123 = vacc0x0123;
95 int32x4_t vacc6x4567 = vacc0x4567;
96 int32x4_t vacc7x0123 = vacc0x0123;
97 int32x4_t vacc7x4567 = vacc0x4567;
98
99 size_t k = kc;
100 while (k >= 8 * sizeof(uint8_t)) {
101 const uint8x8_t va0 = vld1_u8(a0);
102 const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); a0 += 8;
103 const uint8x8_t va1 = vld1_u8(a1);
104 const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); a1 += 8;
105 const uint8x8_t va2 = vld1_u8(a2);
106 const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); a2 += 8;
107 const uint8x8_t va3 = vld1_u8(a3);
108 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); a3 += 8;
109 const uint8x8_t va4 = vld1_u8(a4);
110 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4)); a4 += 8;
111 const uint8x8_t va5 = vld1_u8(a5);
112 const int16x8_t vxa5 = vreinterpretq_s16_u16(vmovl_u8(va5)); a5 += 8;
113 const uint8x8_t va6 = vld1_u8(a6);
114 const int16x8_t vxa6 = vreinterpretq_s16_u16(vmovl_u8(va6)); a6 += 8;
115 const uint8x8_t va7 = vld1_u8(a7);
116 const int16x8_t vxa7 = vreinterpretq_s16_u16(vmovl_u8(va7)); a7 += 8;
117
118 const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
119 const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
120
121 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
122 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
123 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
124 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
125 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
126 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
127 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
128 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
129 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
130 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
131 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
132 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
133 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
134 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
135 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
136 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
137
138 const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
139 const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
140
141 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
142 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
143 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
144 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
145 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
146 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
147 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
148 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
149 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
150 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
151 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
152 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
153 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
154 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
155 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
156 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
157
158 const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
159 const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
160
161 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
162 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
163 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
164 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
165 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
166 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
167 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
168 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
169 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
170 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
171 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
172 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
173 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
174 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
175 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
176 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
177
178 const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
179 const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
180
181 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
182 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
183 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
184 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
185 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
186 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
187 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
188 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
189 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
190 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
191 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
192 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
193 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
194 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
195 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
196 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
197
198 const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
199 const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
200
201 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
202 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
203 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
204 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
205 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
206 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
207 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
208 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
209 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0);
210 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa4), 0);
211 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa5), 0);
212 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa5), 0);
213 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa6), 0);
214 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa6), 0);
215 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa7), 0);
216 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa7), 0);
217
218 const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
219 const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
220
221 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
222 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
223 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
224 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
225 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
226 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
227 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
228 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
229 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa4), 1);
230 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa4), 1);
231 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa5), 1);
232 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa5), 1);
233 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa6), 1);
234 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa6), 1);
235 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa7), 1);
236 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa7), 1);
237
238 const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
239 const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
240
241 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
242 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
243 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
244 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
245 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
246 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
247 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
248 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
249 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa4), 2);
250 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa4), 2);
251 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa5), 2);
252 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa5), 2);
253 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa6), 2);
254 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa6), 2);
255 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa7), 2);
256 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa7), 2);
257
258 const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
259 const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point));
260
261 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
262 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
263 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
264 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
265 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
266 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
267 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
268 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
269 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa4), 3);
270 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa4), 3);
271 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa5), 3);
272 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa5), 3);
273 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa6), 3);
274 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa6), 3);
275 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa7), 3);
276 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa7), 3);
277
278 k -= 8 * sizeof(uint8_t);
279 }
280 if (k != 0) {
281 const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k);
282 const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
283 const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k);
284 const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
285 const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k);
286 const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
287 const uint8x8_t va3 = vld1_u8(a3); a3 = (const uint8_t*) ((uintptr_t) a3 + k);
288 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
289 const uint8x8_t va4 = vld1_u8(a4); a4 = (const uint8_t*) ((uintptr_t) a4 + k);
290 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4));
291 const uint8x8_t va5 = vld1_u8(a5); a5 = (const uint8_t*) ((uintptr_t) a5 + k);
292 const int16x8_t vxa5 = vreinterpretq_s16_u16(vmovl_u8(va5));
293 const uint8x8_t va6 = vld1_u8(a6); a6 = (const uint8_t*) ((uintptr_t) a6 + k);
294 const int16x8_t vxa6 = vreinterpretq_s16_u16(vmovl_u8(va6));
295 const uint8x8_t va7 = vld1_u8(a7); a7 = (const uint8_t*) ((uintptr_t) a7 + k);
296 const int16x8_t vxa7 = vreinterpretq_s16_u16(vmovl_u8(va7));
297
298 const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
299 const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
300
301 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
302 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
303 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
304 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
305 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
306 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
307 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
308 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
309 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
310 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
311 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
312 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
313 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
314 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
315 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
316 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
317
318 if (k >= 2 * sizeof(uint8_t)) {
319 const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
320 const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
321
322 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
323 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
324 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
325 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
326 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
327 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
328 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
329 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
330 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
331 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
332 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
333 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
334 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
335 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
336 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
337 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
338
339 if (k > 2 * sizeof(uint8_t)) {
340 const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
341 const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
342
343 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
344 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
345 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
346 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
347 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
348 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
349 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
350 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
351 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
352 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
353 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
354 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
355 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
356 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
357 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
358 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
359
360 if (k >= 4 * sizeof(uint8_t)) {
361 const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
362 const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
363
364 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
365 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
366 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
367 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
368 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
369 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
370 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
371 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
372 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
373 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
374 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
375 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
376 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
377 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
378 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
379 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
380
381 if (k > 4 * sizeof(uint8_t)) {
382 const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
383 const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
384
385 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
386 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
387 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
388 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
389 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
390 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
391 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
392 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
393 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0);
394 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa4), 0);
395 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa5), 0);
396 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa5), 0);
397 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa6), 0);
398 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa6), 0);
399 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa7), 0);
400 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa7), 0);
401
402 if (k >= 6 * sizeof(uint8_t)) {
403 const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
404 const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
405
406 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
407 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
408 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
409 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
410 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
411 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
412 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
413 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
414 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa4), 1);
415 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa4), 1);
416 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa5), 1);
417 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa5), 1);
418 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa6), 1);
419 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa6), 1);
420 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa7), 1);
421 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa7), 1);
422
423 if (k > 6 * sizeof(uint8_t)) {
424 const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8);
425 const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
426
427 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
428 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
429 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
430 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
431 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
432 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
433 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
434 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
435 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa4), 2);
436 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa4), 2);
437 vacc5x0123 = vmlal_lane_s16(vacc5x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa5), 2);
438 vacc5x4567 = vmlal_lane_s16(vacc5x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa5), 2);
439 vacc6x0123 = vmlal_lane_s16(vacc6x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa6), 2);
440 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa6), 2);
441 vacc7x0123 = vmlal_lane_s16(vacc7x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa7), 2);
442 vacc7x4567 = vmlal_lane_s16(vacc7x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa7), 2);
443 }
444 }
445 }
446 }
447 }
448 }
449 }
450
451 const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->neon.multiplier);
452 vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
453 vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
454 vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
455 vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
456 vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
457 vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
458 vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
459 vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
460 vacc4x0123 = vqrdmulhq_s32(vacc4x0123, vmultiplier);
461 vacc4x4567 = vqrdmulhq_s32(vacc4x4567, vmultiplier);
462 vacc5x0123 = vqrdmulhq_s32(vacc5x0123, vmultiplier);
463 vacc5x4567 = vqrdmulhq_s32(vacc5x4567, vmultiplier);
464 vacc6x0123 = vqrdmulhq_s32(vacc6x0123, vmultiplier);
465 vacc6x4567 = vqrdmulhq_s32(vacc6x4567, vmultiplier);
466 vacc7x0123 = vqrdmulhq_s32(vacc7x0123, vmultiplier);
467 vacc7x4567 = vqrdmulhq_s32(vacc7x4567, vmultiplier);
468
469 const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift);
470 const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
471 vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
472 vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
473 vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
474 vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
475 vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
476 vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
477 vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
478 vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
479 vacc4x0123 = vsraq_n_s32(vacc4x0123, vbicq_s32(vacc4x0123, vzero_shift_mask), 31);
480 vacc4x4567 = vsraq_n_s32(vacc4x4567, vbicq_s32(vacc4x4567, vzero_shift_mask), 31);
481 vacc5x0123 = vsraq_n_s32(vacc5x0123, vbicq_s32(vacc5x0123, vzero_shift_mask), 31);
482 vacc5x4567 = vsraq_n_s32(vacc5x4567, vbicq_s32(vacc5x4567, vzero_shift_mask), 31);
483 vacc6x0123 = vsraq_n_s32(vacc6x0123, vbicq_s32(vacc6x0123, vzero_shift_mask), 31);
484 vacc6x4567 = vsraq_n_s32(vacc6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31);
485 vacc7x0123 = vsraq_n_s32(vacc7x0123, vbicq_s32(vacc7x0123, vzero_shift_mask), 31);
486 vacc7x4567 = vsraq_n_s32(vacc7x4567, vbicq_s32(vacc7x4567, vzero_shift_mask), 31);
487
488 vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
489 vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
490 vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
491 vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
492 vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
493 vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
494 vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
495 vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
496 vacc4x0123 = vrshlq_s32(vacc4x0123, vright_shift);
497 vacc4x4567 = vrshlq_s32(vacc4x4567, vright_shift);
498 vacc5x0123 = vrshlq_s32(vacc5x0123, vright_shift);
499 vacc5x4567 = vrshlq_s32(vacc5x4567, vright_shift);
500 vacc6x0123 = vrshlq_s32(vacc6x0123, vright_shift);
501 vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift);
502 vacc7x0123 = vrshlq_s32(vacc7x0123, vright_shift);
503 vacc7x4567 = vrshlq_s32(vacc7x4567, vright_shift);
504
505 const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point);
506 #if XNN_ARCH_ARM64
507 const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
508 const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
509 const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
510 const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
511 const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
512 const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
513 const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
514 const int16x8_t vacc7x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
515
516 uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
517 uint8x16_t vout2x01234567_3x01234567 = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
518 uint8x16_t vout4x01234567_5x01234567 = vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x01234567);
519 uint8x16_t vout6x01234567_7x01234567 = vqmovun_high_s16(vqmovun_s16(vacc6x01234567), vacc7x01234567);
520 #else
521 const int16x8_t vacc0x01234567 =
522 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
523 const int16x8_t vacc1x01234567 =
524 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
525 const int16x8_t vacc2x01234567 =
526 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
527 const int16x8_t vacc3x01234567 =
528 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
529 const int16x8_t vacc4x01234567 =
530 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
531 const int16x8_t vacc5x01234567 =
532 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);
533 const int16x8_t vacc6x01234567 =
534 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_point);
535 const int16x8_t vacc7x01234567 =
536 vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567)), voutput_zero_point);
537
538 uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
539 uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
540 uint8x16_t vout4x01234567_5x01234567 = vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x01234567));
541 uint8x16_t vout6x01234567_7x01234567 = vcombine_u8(vqmovun_s16(vacc6x01234567), vqmovun_s16(vacc7x01234567));
542 #endif
543 const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min);
544 const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max);
545
546 vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
547 vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
548 vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
549 vout6x01234567_7x01234567 = vmaxq_u8(vout6x01234567_7x01234567, voutput_min);
550 vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
551 vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
552 vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
553 vout6x01234567_7x01234567 = vminq_u8(vout6x01234567_7x01234567, voutput_max);
554
555 if (nc >= 8) {
556 vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
557 vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
558 vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
559 vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
560 vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567)); c4 = (uint8_t*) ((uintptr_t) c4 + cn_stride);
561 vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567)); c5 = (uint8_t*) ((uintptr_t) c5 + cn_stride);
562 vst1_u8(c6, vget_low_u8(vout6x01234567_7x01234567)); c6 = (uint8_t*) ((uintptr_t) c6 + cn_stride);
563 vst1_u8(c7, vget_high_u8(vout6x01234567_7x01234567)); c7 = (uint8_t*) ((uintptr_t) c7 + cn_stride);
564
565 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
566 a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
567 a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
568 a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
569 a4 = (const uint8_t*) ((uintptr_t) a4 - kc);
570 a5 = (const uint8_t*) ((uintptr_t) a5 - kc);
571 a6 = (const uint8_t*) ((uintptr_t) a6 - kc);
572 a7 = (const uint8_t*) ((uintptr_t) a7 - kc);
573
574 nc -= 8;
575 } else {
576 if (nc & 4) {
577 vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
578 vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
579 vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
580 vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
581 vst1q_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 0); c4 += 4;
582 vst1q_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 2); c5 += 4;
583 vst1q_lane_u32(__builtin_assume_aligned(c6, 1), vreinterpretq_u32_u8(vout6x01234567_7x01234567), 0); c6 += 4;
584 vst1q_lane_u32(__builtin_assume_aligned(c7, 1), vreinterpretq_u32_u8(vout6x01234567_7x01234567), 2); c7 += 4;
585 vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
586 vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
587 vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
588 vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
589 }
590 if (nc & 2) {
591 vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
592 vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
593 vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
594 vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
595 vst1q_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 0); c4 += 2;
596 vst1q_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 4); c5 += 2;
597 vst1q_lane_u16(__builtin_assume_aligned(c6, 1), vreinterpretq_u16_u8(vout6x01234567_7x01234567), 0); c6 += 2;
598 vst1q_lane_u16(__builtin_assume_aligned(c7, 1), vreinterpretq_u16_u8(vout6x01234567_7x01234567), 4); c7 += 2;
599 vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
600 vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
601 vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
602 vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
603 }
604 if (nc & 1) {
605 vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
606 vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
607 vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
608 vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
609 vst1q_lane_u8(c4, vout4x01234567_5x01234567, 0);
610 vst1q_lane_u8(c5, vout4x01234567_5x01234567, 8);
611 vst1q_lane_u8(c6, vout6x01234567_7x01234567, 0);
612 vst1q_lane_u8(c7, vout6x01234567_7x01234567, 8);
613 }
614
615 nc = 0;
616 }
617 } while (nc != 0);
618 }
619