// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

#include <xnnpack/math.h>
#include <xnnpack/pack.h>

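// Packs f32 GEMM weights stored in GOI order (g groups x nc output channels x
// kc input channels) into the blocked layout consumed by the GEMM
// microkernels: each block of nr output channels starts with nr bias slots
// (filled from b when b is non-NULL), followed by the weights interleaved in
// kr-element runs; sr shuffles the first round_down_po2(kc, sr * kr) input
// channels across the block.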
void xnn_pack_f32_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

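// QU8 variant of the GOI GEMM packing (sr must be 1). Each bias slot is
// pre-offset by kc * input_zero_point * kernel_zero_point and then reduced by
// input_zero_point times the sum of that output channel's packed weights,
// which folds the zero-point corrections into the bias.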
void xnn_pack_qu8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
            ksum += (int32_t) kv;
            *((uint8_t*) packed_w) = kv;
            packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
          }
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

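// QS8 variant of the GOI GEMM packing (sr must be 1): biases are not offset
// up front, but input_zero_point times each output channel's weight sum is
// still subtracted from the corresponding bias slot.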
void xnn_pack_qs8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
            ksum += (int32_t) kv;
            *((int8_t*) packed_w) = kv;
            packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
          }
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

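// Same as xnn_pack_qs8_gemm_goi_w, except that every int8 weight is widened
// to int16 in the packed buffer (the "xw" layout, apparently for microkernels
// that consume 16-bit weights).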
void xnn_pack_qs8_gemm_xw_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
            ksum += (int32_t) kv;
            *((int16_t*) packed_w) = (int16_t) kv;
            packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
          }
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

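// Packs f32 GEMM weights stored in IO order (kc input channels x nc output
// channels, i.e. transposed relative to GOI) into the same blocked layout as
// xnn_pack_f32_gemm_goi_w; only the indexing into k differs.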
void xnn_pack_f32_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          *packed_w++ =
            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
      }
      packed_w += (nr - nr_block_size) * kr;
    }

    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          *packed_w++ =
            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
        packed_w += kr - kr_block_size;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_f16_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          *packed_w++ =
            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
      }
      packed_w += (nr - nr_block_size) * kr;
    }

    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          *packed_w++ =
            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
        packed_w += kr - kr_block_size;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_qu8_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = nr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        int32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
          ksum += (int32_t) kv;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_b[nr_block_offset] -= ksum * izp;
        packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
    }
  }
}

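// Packs f32 convolution weights from GOKI order (g groups x nc output
// channels x ks kernel positions x kc input channels) for the convolution
// (IGEMM-style) microkernels: per nr block, nr bias slots are followed by one
// kr-blocked slice of weights for each of the ks kernel positions.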
void xnn_pack_f32_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
            }
          }
          packed_w += (nr - nr_block_size) * kr;
        }

        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
            }
            packed_w += kr - kr_block_size;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
            }
          }
          packed_w += (nr - nr_block_size) * kr;
        }

        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
            }
            packed_w += kr - kr_block_size;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

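// QU8 variant of the GOKI convolution packing (sr must be 1): biases are
// pre-offset by ks * kc * input_zero_point * kernel_zero_point and reduced by
// input_zero_point times each output channel's packed-weight sum.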
void xnn_pack_qu8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              const uint8_t kv =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
              ksum += (int32_t) kv;
              *((uint8_t*) packed_w) = kv;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
            }
            packed_b[nr_block_offset] -= ksum * izp;
            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qs8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              const int8_t kv =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
              ksum += (int32_t) kv;
              *((int8_t*) packed_w) = kv;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
            }
            packed_b[nr_block_offset] -= ksum * izp;
            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

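// Packs f32 convolution weights from KGO order (ks kernel positions x g
// groups x nc output channels, one input channel per group): each value is
// written at a stride of kr, apparently to match microkernels that expect
// kr-wide slots even though only the first element of each slot is populated
// here.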
void xnn_pack_f32_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *packed_w =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *packed_w =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

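// QU8 variant of the KGO convolution packing: biases are pre-offset by
// ks * input_zero_point * kernel_zero_point and reduced by input_zero_point
// times each output channel's weight sum.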
void xnn_pack_qu8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          const uint8_t kv =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          *((uint8_t*) packed_w) = kv;
          packed_b[nr_block_offset] -= (int32_t) kv * izp;
          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qs8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          const int8_t kv =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          *((int8_t*) packed_w) = kv;
          packed_b[nr_block_offset] -= (int32_t) kv * izp;
          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

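// Packs f32 deconvolution (transposed convolution) weights from GOKI order.
// The kernel is partitioned into sh x sw subconvolutions, one per output
// phase (oy, ox); each subconvolution gets its own packed region whose start
// is recorded in subconv_params, and within it the weights are packed like a
// regular GOKI convolution over the kernel taps that belong to that phase.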
void xnn_pack_f32_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
                  }
                }
                packed_w += (nr - nr_block_size) * kr;
              }

              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                  }
                  packed_w += kr - kr_block_size;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
                  }
                }
                packed_w += (nr - nr_block_size) * kr;
              }

              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                  }
                  packed_w += kr - kr_block_size;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

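// QU8 variant of the deconvolution packing: the per-subconvolution bias
// offset uses the number of kernel taps that actually fall into that phase,
// divide_round_up(kh - oy, sh) * divide_round_up(kw - ox, sw) * kc, times
// input_zero_point * kernel_zero_point.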
void xnn_pack_qu8_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  struct subconvolution_params* subconv_params,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t kzp = (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          int32_t* packed_b = (int32_t*) packed_w;
          if XNN_LIKELY(b != 0) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            }
          } else {
            size_t n = nr_block_size;
            do {
              *((int32_t*) packed_w) = boff;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            } while (--n != 0);
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  int32_t ksum = 0;
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    const uint8_t kv =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                    ksum += (int32_t) kv;
                    *((uint8_t*) packed_w) = kv;
                    packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
                  }
                  packed_b[nr_block_offset] -= ksum * izp;
                  packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
                }
                packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

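// Packs f32 depthwise convolution weights from GHW order (c channels x h x w
// kernel) into channel blocks of cr: each block stores cr bias values (zeros
// when b is NULL) followed by the kernel taps, iterating x in the outer loop
// and y in the inner loop, cr channels at a time.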
void xnn_pack_f32_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

void xnn_pack_f16_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

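// QU8 variant of the GHW depthwise packing: biases are pre-offset by
// h * w * input_zero_point * kernel_zero_point and reduced by
// input_zero_point times each channel's weight sum.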
void xnn_pack_qu8_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
  }
}

void xnn_pack_qs8_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
  }
}

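// Same packed layout as xnn_pack_f32_dwconv_ghw_w, but the source kernel is
// stored in HWG order (h x w x c channels), so only the indexing into k
// differs.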
void xnn_pack_f32_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

void xnn_pack_f16_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

void xnn_pack_qu8_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
  }
}

void xnn_pack_qs8_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
  }
}

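// GEMMINC variant of the GOI GEMM packing: identical weight layout, but no
// bias slots are emitted, presumably because the GEMMINC microkernels take
// their initial accumulators from a separate buffer.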
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

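// Packs f32 weights for the direct-convolution (dconv) microkernels from OKI
// order: each block covers nr output channels (the last valid channel is
// replicated to pad partial blocks), with nr bias values followed by the taps
// ordered as kx, then input channel, then ky, then output channel within the
// block.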
void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

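// Packs f32 depthwise weights for the CHW (single-channel-at-a-time)
// microkernels: for each group, one bias value (0.0f when bias is NULL) is
// followed by the group's kernel_size taps in their original GHW order.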
void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f32_chw_dwconv_hwg_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[i * groups + g];
    }
  }
}

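// Packs per-channel scale (s) and bias (b) for the vmulcaddc (fused
// multiply-add) microkernels: cr scales followed by cr biases per channel
// block, with partial blocks padded by advancing the output pointer.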
void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

void xnn_pack_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const uint16_t* s,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}