1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
#include <assert.h>  // for assert(), used throughout this file
9 #include <stdint.h>
10 #include <stddef.h>
11 #include <string.h>
12 
13 #include <fp16.h>
14 
15 #include <xnnpack/math.h>
16 #include <xnnpack/pack.h>
17 
18 
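// GEMM weight packing for kernels stored in GOI layout (groups x output channels x
// input channels). For each tile of up to nr output channels the packed buffer holds
// nr bias slots followed by the weights, interleaved kr elements at a time per output
// channel; kc is rounded up to a multiple of kr * sr, and indices inside each sr * kr
// super-block are rotated per output channel (the `& (skr - 1)` term). Positions past
// kc are left untouched, and when b is NULL the bias slots are not written either, so
// callers appear to rely on a zero-initialized destination. The f32, f16, and
// f32-to-f16 variants below differ only in element type and in the fp16 conversion
// applied while copying.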
19 void xnn_pack_f32_gemm_goi_w(
20   size_t g,
21   size_t nc,
22   size_t kc,
23   size_t nr,
24   size_t kr,
25   size_t sr,
26   const float* k,
27   const float* b,
28   float* packed_w,
29   size_t extra_bytes,
30   const void* params)
31 {
32   assert(nr >= sr);
33 
34   const size_t skr = sr * kr;
35   do {
36     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
37       const size_t nr_block_size = min(nc - nr_block_start, nr);
38       if XNN_LIKELY(b != NULL) {
39         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
40           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
41         }
42       }
43       packed_w += nr;
44 
45       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
46         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
47           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
48             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
49             if (kc_idx < kc) {
50               packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
51             }
52           }
53           packed_w += kr;
54         }
55         packed_w += (nr - nr_block_size) * kr;
56       }
57       packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
58     }
59     k += nc * kc;
60     if XNN_UNPREDICTABLE(b != NULL) {
61       b += nc;
62     }
63   } while (--g != 0);
64 }
65 
66 void xnn_pack_f16_gemm_goi_w(
67   size_t g,
68   size_t nc,
69   size_t kc,
70   size_t nr,
71   size_t kr,
72   size_t sr,
73   const uint16_t* k,
74   const uint16_t* b,
75   uint16_t* packed_w,
76   size_t extra_bytes,
77   const void* params)
78 {
79   assert(nr >= sr);
80 
81   const size_t skr = sr * kr;
82   do {
83     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
84       const size_t nr_block_size = min(nc - nr_block_start, nr);
85       if XNN_LIKELY(b != NULL) {
86         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
87           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
88         }
89       }
90       packed_w += nr;
91 
92       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
93         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
94           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
95             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
96             if (kc_idx < kc) {
97               packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
98             }
99           }
100           packed_w += kr;
101         }
102         packed_w += (nr - nr_block_size) * kr;
103       }
104       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
105     }
106     k += nc * kc;
107     if XNN_UNPREDICTABLE(b != NULL) {
108       b += nc;
109     }
110   } while (--g != 0);
111 }
112 
113 void xnn_pack_f32_to_f16_gemm_goi_w(
114   size_t g,
115   size_t nc,
116   size_t kc,
117   size_t nr,
118   size_t kr,
119   size_t sr,
120   const float* k,
121   const float* b,
122   uint16_t* packed_w,
123   size_t extra_bytes,
124   const void* params)
125 {
126   assert(nr >= sr);
127 
128   const size_t skr = sr * kr;
129   do {
130     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
131       const size_t nr_block_size = min(nc - nr_block_start, nr);
132       if XNN_LIKELY(b != NULL) {
133         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
134           packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
135         }
136       }
137       packed_w += nr;
138 
139       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
140         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
141           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
142             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
143             if (kc_idx < kc) {
144               packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
145             }
146           }
147           packed_w += kr;
148         }
149         packed_w += (nr - nr_block_size) * kr;
150       }
151       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
152     }
153     k += nc * kc;
154     if XNN_UNPREDICTABLE(b != NULL) {
155       b += nc;
156     }
157   } while (--g != 0);
158 }
159 
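// Quantized GOI GEMM packing. The layout matches the float variants above, but each
// bias slot additionally carries zero-point compensation: for QU8 it is seeded with
// kc * input_zero_point * kernel_zero_point (plus the bias, if any) and then reduced
// by input_zero_point times the sum of that channel's packed weights; for QS8 the
// kernel zero point is 0, so only the weight-sum term remains.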
160 void xnn_pack_qu8_gemm_goi_w(
161   size_t g,
162   size_t nc,
163   size_t kc,
164   size_t nr,
165   size_t kr,
166   size_t sr,
167   const uint8_t* k,
168   const int32_t* b,
169   void* packed_w,
170   size_t extra_bytes,
171   const struct xnn_qu8_packing_params* params)
172 {
173   assert(nr >= sr);
174 
175   const size_t skr = sr * kr;
176   const int32_t izp = (int32_t) params->input_zero_point;
177   const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
178   do {
179     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
180       const size_t nr_block_size = min(nc - nr_block_start, nr);
181       int32_t* packed_b = (int32_t*) packed_w;
182       if XNN_LIKELY(b != NULL) {
183         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
184           *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
185           packed_w = (int32_t*) packed_w + 1;
186         }
187       } else {
188         size_t n = nr_block_size;
189         do {
190           *((int32_t*) packed_w) = bzp;
191           packed_w = (int32_t*) packed_w + 1;
192         } while (--n != 0);
193       }
194       packed_w = (int32_t*) packed_w + (nr - nr_block_size);
195 
196       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
197         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
198           int32_t ksum = 0;
199           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
200             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
201             if (kc_idx < kc) {
202               const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
203               ksum += (int32_t) kv;
204               ((uint8_t*) packed_w)[kr_block_offset] = kv;
205             }
206           }
207           packed_b[nr_block_offset] -= ksum * izp;
208           packed_w = (uint8_t*) packed_w + kr;
209         }
210         packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
211       }
212       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
213     }
214     k += nc * kc;
215     if XNN_UNPREDICTABLE(b != NULL) {
216       b += nc;
217     }
218   } while (--g != 0);
219 }
220 
221 void xnn_pack_qs8_gemm_goi_w(
222   size_t g,
223   size_t nc,
224   size_t kc,
225   size_t nr,
226   size_t kr,
227   size_t sr,
228   const int8_t* k,
229   const int32_t* b,
230   void* packed_w,
231   size_t extra_bytes,
232   const struct xnn_qs8_packing_params* params)
233 {
234   assert(nr >= sr);
235 
236   const size_t skr = sr * kr;
237   const int32_t izp = (int32_t) params->input_zero_point;
238   do {
239     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
240       const size_t nr_block_size = min(nc - nr_block_start, nr);
241       int32_t* packed_b = (int32_t*) packed_w;
242       if XNN_LIKELY(b != NULL) {
243         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
244           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
245           packed_w = (int32_t*) packed_w + 1;
246         }
247       } else {
248         size_t n = nr_block_size;
249         do {
250           *((int32_t*) packed_w) = 0;
251           packed_w = (int32_t*) packed_w + 1;
252         } while (--n != 0);
253       }
254       packed_w = (int32_t*) packed_w + (nr - nr_block_size);
255 
256       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
257         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
258           int32_t ksum = 0;
259           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
260             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
261             if (kc_idx < kc) {
262               const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
263               ksum += (int32_t) kv;
264               ((int8_t*) packed_w)[kr_block_offset] = kv;
265             }
266           }
267           packed_b[nr_block_offset] -= ksum * izp;
268           packed_w = (int8_t*) packed_w + kr;
269         }
270         packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
271       }
272       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
273     }
274     k += nc * kc;
275     if XNN_UNPREDICTABLE(b != NULL) {
276       b += nc;
277     }
278   } while (--g != 0);
279 }
280 
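// Same as xnn_pack_qs8_gemm_goi_w, except that each int8 weight is widened to int16
// in the packed buffer ("xw" = extended width), presumably for microkernels that
// consume 16-bit weights directly.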
281 void xnn_pack_qs8_gemm_xw_goi_w(
282   size_t g,
283   size_t nc,
284   size_t kc,
285   size_t nr,
286   size_t kr,
287   size_t sr,
288   const int8_t* k,
289   const int32_t* b,
290   void* packed_w,
291   size_t extra_bytes,
292   const struct xnn_qs8_packing_params* params)
293 {
294   assert(nr >= sr);
295 
296   const size_t skr = sr * kr;
297   const int32_t izp = (int32_t) params->input_zero_point;
298   do {
299     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
300       const size_t nr_block_size = min(nc - nr_block_start, nr);
301       int32_t* packed_b = (int32_t*) packed_w;
302       if XNN_LIKELY(b != NULL) {
303         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
304           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
305           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
306         }
307       } else {
308         size_t n = nr_block_size;
309         do {
310           *((int32_t*) packed_w) = 0;
311           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
312         } while (--n != 0);
313       }
314       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
315 
316       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
317         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
318           int32_t ksum = 0;
319           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
320             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
321             if (kc_idx < kc) {
322               const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
323               ksum += (int32_t) kv;
324               ((int16_t*) packed_w)[kr_block_offset] = (int16_t) kv;
325             }
326           }
327           packed_b[nr_block_offset] -= ksum * izp;
328           packed_w = (int16_t*) packed_w + kr;
329         }
330         packed_w = (int16_t*) packed_w + (nr - nr_block_size) * kr;
331       }
332       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
333     }
334     k += nc * kc;
335     if XNN_UNPREDICTABLE(b != NULL) {
336       b += nc;
337     }
338   } while (--g != 0);
339 }
340 
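// GEMM weight packing for kernels stored in IO layout (input channels x output
// channels, single group). The packing scheme is identical to the *_goi_w variants;
// only the source index changes to k[kc_idx * nc + output_channel]. These variants
// take no group count and no extra_bytes stride.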
341 void xnn_pack_f32_gemm_io_w(
342   size_t nc,
343   size_t kc,
344   size_t nr,
345   size_t kr,
346   size_t sr,
347   const float* k,
348   const float* b,
349   float* packed_w,
350   const void* params)
351 {
352   assert(nr >= sr);
353 
354   const size_t skr = sr * kr;
355   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
356     const size_t nr_block_size = min(nc - nr_block_start, nr);
357     if XNN_LIKELY(b != NULL) {
358       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
359         packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
360       }
361     }
362     packed_w += nr;
363 
364     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
365       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
366         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
367           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
368           if (kc_idx < kc) {
369             packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
370           }
371         }
372         packed_w += kr;
373       }
374       packed_w += (nr - nr_block_size) * kr;
375     }
376   }
377 }
378 
379 void xnn_pack_f16_gemm_io_w(
380   size_t nc,
381   size_t kc,
382   size_t nr,
383   size_t kr,
384   size_t sr,
385   const uint16_t* k,
386   const uint16_t* b,
387   uint16_t* packed_w,
388   const void* params)
389 {
390   assert(nr >= sr);
391 
392   const size_t skr = sr * kr;
393   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
394     const size_t nr_block_size = min(nc - nr_block_start, nr);
395     if XNN_LIKELY(b != NULL) {
396       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
397         packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
398       }
399     }
400     packed_w += nr;
401 
402     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
403       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
404         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
405           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
406           if (kc_idx < kc) {
407             packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
408           }
409         }
410         packed_w += kr;
411       }
412       packed_w += (nr - nr_block_size) * kr;
413     }
414   }
415 }
416 
417 void xnn_pack_f32_to_f16_gemm_io_w(
418   size_t nc,
419   size_t kc,
420   size_t nr,
421   size_t kr,
422   size_t sr,
423   const float* k,
424   const float* b,
425   uint16_t* packed_w,
426   const void* params)
427 {
428   assert(nr >= sr);
429 
430   const size_t skr = sr * kr;
431   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
432     const size_t nr_block_size = min(nc - nr_block_start, nr);
433     if XNN_LIKELY(b != NULL) {
434       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
435         packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
436       }
437     }
438     packed_w += nr;
439 
440     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
441       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
442         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
443           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
444           if (kc_idx < kc) {
445             packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
446           }
447         }
448         packed_w += kr;
449       }
450       packed_w += (nr - nr_block_size) * kr;
451     }
452   }
453 }
454 
455 void xnn_pack_qu8_gemm_io_w(
456   size_t nc,
457   size_t kc,
458   size_t nr,
459   size_t kr,
460   size_t sr,
461   const uint8_t* k,
462   const int32_t* b,
463   void* packed_w,
464   const struct xnn_qu8_packing_params* params)
465 {
466   assert(nr >= sr);
467 
468   const size_t skr = sr * kr;
469   const int32_t izp = (int32_t) params->input_zero_point;
470   const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
471   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
472     const size_t nr_block_size = min(nc - nr_block_start, nr);
473     int32_t* packed_b = (int32_t*) packed_w;
474     if XNN_LIKELY(b != NULL) {
475       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
476         *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
477         packed_w = (int32_t*) packed_w + 1;
478       }
479     } else {
480       size_t n = nr_block_size;
481       do {
482         *((int32_t*) packed_w) = bzp;
483         packed_w = (int32_t*) packed_w + 1;
484       } while (--n != 0);
485     }
486     packed_w = (int32_t*) packed_w + (nr - nr_block_size);
487 
488     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
489       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
490         int32_t ksum = 0;
491         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
492           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
493           if (kc_idx < kc) {
494             const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
495             ksum += (int32_t) kv;
496             ((uint8_t*) packed_w)[kr_block_offset] = kv;
497           }
498         }
499         packed_b[nr_block_offset] -= ksum * izp;
500         packed_w = (uint8_t*) packed_w + kr;
501       }
502       packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
503     }
504   }
505 }
506 
507 void xnn_pack_qs8_gemm_io_w(
508   size_t nc,
509   size_t kc,
510   size_t nr,
511   size_t kr,
512   size_t sr,
513   const int8_t* k,
514   const int32_t* b,
515   void* packed_w,
516   const struct xnn_qs8_packing_params* params)
517 {
518   assert(nr >= sr);
519 
520   const size_t skr = sr * kr;
521   const int32_t izp = (int32_t) params->input_zero_point;
522   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
523     const size_t nr_block_size = min(nc - nr_block_start, nr);
524     int32_t* packed_b = (int32_t*) packed_w;
525     if XNN_LIKELY(b != NULL) {
526       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
527         *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
528         packed_w = (int32_t*) packed_w + 1;
529       }
530     } else {
531       size_t n = nr_block_size;
532       do {
533         *((int32_t*) packed_w) = 0;
534         packed_w = (int32_t*) packed_w + 1;
535       } while (--n != 0);
536     }
537     packed_w = (int32_t*) packed_w + (nr - nr_block_size);
538 
539     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
540       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
541         int32_t ksum = 0;
542         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
543           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
544           if (kc_idx < kc) {
545             const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
546             ksum += (int32_t) kv;
547             ((int8_t*) packed_w)[kr_block_offset] = kv;
548           }
549         }
550         packed_b[nr_block_offset] -= ksum * izp;
551         packed_w = (int8_t*) packed_w + kr;
552       }
553       packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
554     }
555   }
556 }
557 
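// Convolution weight packing for kernels stored in GOKI layout (groups x output
// channels x kernel elements x input channels). Packing is the same as for GEMM,
// with an extra loop over the ks kernel elements between the bias block and each
// kr-interleaved channel block.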
558 void xnn_pack_f32_conv_goki_w(
559   size_t g,
560   size_t nc,
561   size_t ks,
562   size_t kc,
563   size_t nr,
564   size_t kr,
565   size_t sr,
566   const float* k,
567   const float* b,
568   float* packed_w,
569   size_t extra_bytes,
570   const void* params)
571 {
572   assert(nr >= sr);
573 
574   const size_t skr = sr * kr;
575   do {
576     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
577       const size_t nr_block_size = min(nc - nr_block_start, nr);
578       if XNN_LIKELY(b != NULL) {
579         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
580           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
581         }
582       }
583       packed_w += nr;
584 
585       for (size_t ki = 0; ki < ks; ki++) {
586         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
587           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
588             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
589               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
590               if (kc_idx < kc) {
591                 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
592               }
593             }
594             packed_w += kr;
595           }
596           packed_w += (nr - nr_block_size) * kr;
597         }
598       }
599       packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
600     }
601     k += ks * kc * nc;
602     if XNN_UNPREDICTABLE(b != NULL) {
603       b += nc;
604     }
605   } while (--g != 0);
606 }
607 
608 void xnn_pack_f16_conv_goki_w(
609   size_t g,
610   size_t nc,
611   size_t ks,
612   size_t kc,
613   size_t nr,
614   size_t kr,
615   size_t sr,
616   const uint16_t* k,
617   const uint16_t* b,
618   uint16_t* packed_w,
619   size_t extra_bytes,
620   const void* params)
621 {
622   assert(nr >= sr);
623 
624   const size_t skr = sr * kr;
625   do {
626     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
627       const size_t nr_block_size = min(nc - nr_block_start, nr);
628       if XNN_LIKELY(b != NULL) {
629         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
630           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
631         }
632       }
633       packed_w += nr;
634 
635       for (size_t ki = 0; ki < ks; ki++) {
636         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
637           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
638             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
639               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
640               if (kc_idx < kc) {
641                 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
642               }
643             }
644             packed_w += kr;
645           }
646           packed_w += (nr - nr_block_size) * kr;
647         }
648       }
649       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
650     }
651     k += ks * kc * nc;
652     if XNN_UNPREDICTABLE(b != NULL) {
653       b += nc;
654     }
655   } while (--g != 0);
656 }
657 
658 void xnn_pack_f32_to_f16_conv_goki_w(
659   size_t g,
660   size_t nc,
661   size_t ks,
662   size_t kc,
663   size_t nr,
664   size_t kr,
665   size_t sr,
666   const float* k,
667   const float* b,
668   uint16_t* packed_w,
669   size_t extra_bytes,
670   const void* params)
671 {
672   assert(nr >= sr);
673 
674   const size_t skr = sr * kr;
675   do {
676     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
677       const size_t nr_block_size = min(nc - nr_block_start, nr);
678       if XNN_LIKELY(b != NULL) {
679         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
680           packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
681         }
682       }
683       packed_w += nr;
684 
685       for (size_t ki = 0; ki < ks; ki++) {
686         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
687           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
688             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
689               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
690               if (kc_idx < kc) {
691                 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
692               }
693             }
694             packed_w += kr;
695           }
696           packed_w += (nr - nr_block_size) * kr;
697         }
698       }
699       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
700     }
701     k += ks * kc * nc;
702     if XNN_UNPREDICTABLE(b != NULL) {
703       b += nc;
704     }
705   } while (--g != 0);
706 }
707 
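// Quantized GOKI convolution packing: as above, with the bias slot carrying
// zero-point compensation over all ks * kc weights of the output channel
// (ks * kc * input_zero_point * kernel_zero_point for QU8; only the weight-sum
// term for QS8).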
708 void xnn_pack_qu8_conv_goki_w(
709   size_t g,
710   size_t nc,
711   size_t ks,
712   size_t kc,
713   size_t nr,
714   size_t kr,
715   size_t sr,
716   const uint8_t* k,
717   const int32_t* b,
718   void* packed_w,
719   size_t extra_bytes,
720   const struct xnn_qu8_packing_params* params)
721 {
722   assert(nr >= sr);
723 
724   const size_t skr = sr * kr;
725   const int32_t izp = (int32_t) params->input_zero_point;
726   const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
727   do {
728     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
729       const size_t nr_block_size = min(nc - nr_block_start, nr);
730       int32_t* packed_b = (int32_t*) packed_w;
731       if XNN_LIKELY(b != NULL) {
732         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
733           *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
734           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
735         }
736       } else {
737         size_t n = nr_block_size;
738         do {
739           *((int32_t*) packed_w) = bzp;
740           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
741         } while (--n != 0);
742       }
743       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
744 
745       for (size_t ki = 0; ki < ks; ki++) {
746         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
747           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
748             int32_t ksum = 0;
749             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
750               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
751               if (kc_idx < kc) {
752                 const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
753                 ksum += (int32_t) kv;
754                 ((uint8_t*) packed_w)[kr_block_offset] = kv;
755               }
756             }
757             packed_b[nr_block_offset] -= ksum * izp;
758             packed_w = (uint8_t*) packed_w + kr;
759           }
760           packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
761         }
762       }
763       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
764     }
765     k += ks * kc * nc;
766     if XNN_UNPREDICTABLE(b != NULL) {
767       b += nc;
768     }
769   } while (--g != 0);
770 }
771 
772 void xnn_pack_qs8_conv_goki_w(
773   size_t g,
774   size_t nc,
775   size_t ks,
776   size_t kc,
777   size_t nr,
778   size_t kr,
779   size_t sr,
780   const int8_t* k,
781   const int32_t* b,
782   void* packed_w,
783   size_t extra_bytes,
784   const struct xnn_qs8_packing_params* params)
785 {
786   assert(nr >= sr);
787 
788   const size_t skr = sr * kr;
789   const int32_t izp = (int32_t) params->input_zero_point;
790   do {
791     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
792       const size_t nr_block_size = min(nc - nr_block_start, nr);
793       int32_t* packed_b = (int32_t*) packed_w;
794       if XNN_LIKELY(b != NULL) {
795         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
796           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
797           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
798         }
799       } else {
800         size_t n = nr_block_size;
801         do {
802           *((int32_t*) packed_w) = 0;
803           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
804         } while (--n != 0);
805       }
806       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
807 
808       for (size_t ki = 0; ki < ks; ki++) {
809         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
810           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
811             int32_t ksum = 0;
812             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
813               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
814               if (kc_idx < kc) {
815                 const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
816                 ksum += (int32_t) kv;
817                 ((int8_t*) packed_w)[kr_block_offset] = kv;
818               }
819             }
820             packed_b[nr_block_offset] -= ksum * izp;
821             packed_w = (int8_t*) packed_w + kr;
822           }
823           packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
824         }
825       }
826       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
827     }
828     k += ks * kc * nc;
829     if XNN_UNPREDICTABLE(b != NULL) {
830       b += nc;
831     }
832   } while (--g != 0);
833 }
834 
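// Convolution weight packing for kernels stored in KGO layout (kernel elements x
// groups x output channels), i.e. a single input channel per group. For each kernel
// element the per-output-channel weights are written at stride kr and staggered
// across sr sub-blocks of nr * kr entries each.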
835 void xnn_pack_f32_conv_kgo_w(
836   size_t g,
837   size_t nc,
838   size_t ks,
839   size_t nr,
840   size_t kr,
841   size_t sr,
842   const float* k,
843   const float* b,
844   float* packed_w,
845   size_t extra_bytes,
846   const void* params)
847 {
848   assert(nr >= sr);
849 
850   for (size_t i = 0; i < g; i++) {
851     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
852       const size_t nr_block_size = min(nc - nr_block_start, nr);
853       if XNN_LIKELY(b != NULL) {
854         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
855           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
856         }
857       }
858       packed_w += nr;
859 
860       for (size_t ki = 0; ki < ks; ki++) {
861         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
862           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
863             packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
864           }
865           packed_w += nr * kr;
866         }
867       }
868       packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
869     }
870     k += nc;
871     if XNN_UNPREDICTABLE(b != NULL) {
872       b += nc;
873     }
874   }
875 }
876 
877 void xnn_pack_f16_conv_kgo_w(
878   size_t g,
879   size_t nc,
880   size_t ks,
881   size_t nr,
882   size_t kr,
883   size_t sr,
884   const uint16_t* k,
885   const uint16_t* b,
886   uint16_t* packed_w,
887   size_t extra_bytes,
888   const void* params)
889 {
890   assert(nr >= sr);
891 
892   for (size_t i = 0; i < g; i++) {
893     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
894       const size_t nr_block_size = min(nc - nr_block_start, nr);
895       if XNN_LIKELY(b != NULL) {
896         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
897           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
898         }
899       }
900       packed_w += nr;
901 
902       for (size_t ki = 0; ki < ks; ki++) {
903         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
904           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
905             packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
906           }
907           packed_w += nr * kr;
908         }
909       }
910       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
911     }
912     k += nc;
913     if XNN_UNPREDICTABLE(b != NULL) {
914       b += nc;
915     }
916   }
917 }
918 
919 void xnn_pack_f32_to_f16_conv_kgo_w(
920   size_t g,
921   size_t nc,
922   size_t ks,
923   size_t nr,
924   size_t kr,
925   size_t sr,
926   const float* k,
927   const float* b,
928   uint16_t* packed_w,
929   size_t extra_bytes,
930   const void* params)
931 {
932   assert(nr >= sr);
933 
934   for (size_t i = 0; i < g; i++) {
935     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
936       const size_t nr_block_size = min(nc - nr_block_start, nr);
937       if XNN_LIKELY(b != NULL) {
938         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
939           packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
940         }
941       }
942       packed_w += nr;
943 
944       for (size_t ki = 0; ki < ks; ki++) {
945         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
946           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
947             packed_w[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
948           }
949           packed_w += nr * kr;
950         }
951       }
952       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
953     }
954     k += nc;
955     if XNN_UNPREDICTABLE(b != NULL) {
956       b += nc;
957     }
958   }
959 }
960 
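// Quantized KGO convolution packing: same layout as above, with the bias slot seeded
// with ks * input_zero_point * kernel_zero_point for QU8 and reduced by
// input_zero_point times each packed weight; for QS8 only the per-weight term applies.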
961 void xnn_pack_qu8_conv_kgo_w(
962   size_t g,
963   size_t nc,
964   size_t ks,
965   size_t nr,
966   size_t kr,
967   size_t sr,
968   const uint8_t* k,
969   const int32_t* b,
970   void* packed_w,
971   size_t extra_bytes,
972   const struct xnn_qu8_packing_params* params)
973 {
974   assert(nr >= sr);
975 
976   const int32_t izp = (int32_t) params->input_zero_point;
977   const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
978   for (size_t i = 0; i < g; i++) {
979     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
980       const size_t nr_block_size = min(nc - nr_block_start, nr);
981       int32_t* packed_b = (int32_t*) packed_w;
982       if XNN_LIKELY(b != NULL) {
983         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
984           *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
985           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
986         }
987       } else {
988         size_t n = nr_block_size;
989         do {
990           *((int32_t*) packed_w) = bzp;
991           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
992         } while (--n != 0);
993       }
994       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
995 
996       for (size_t ki = 0; ki < ks; ki++) {
997         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
998           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
999             const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1000             ((uint8_t*) packed_w)[nr_block_offset * kr] = kv;
1001             packed_b[nr_block_offset] -= (int32_t) kv * izp;
1002           }
1003           packed_w = (uint8_t*) packed_w + nr * kr;
1004         }
1005       }
1006       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1007     }
1008     k += nc;
1009     if XNN_UNPREDICTABLE(b != NULL) {
1010       b += nc;
1011     }
1012   }
1013 }
1014 
1015 void xnn_pack_qs8_conv_kgo_w(
1016   size_t g,
1017   size_t nc,
1018   size_t ks,
1019   size_t nr,
1020   size_t kr,
1021   size_t sr,
1022   const int8_t* k,
1023   const int32_t* b,
1024   void* packed_w,
1025   size_t extra_bytes,
1026   const struct xnn_qs8_packing_params* params)
1027 {
1028   assert(nr >= sr);
1029 
1030   const int32_t izp = (int32_t) params->input_zero_point;
1031   for (size_t i = 0; i < g; i++) {
1032     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1033       const size_t nr_block_size = min(nc - nr_block_start, nr);
1034       int32_t* packed_b = (int32_t*) packed_w;
1035       if XNN_LIKELY(b != NULL) {
1036         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1037           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
1038           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1039         }
1040       } else {
1041         size_t n = nr_block_size;
1042         do {
1043           *((int32_t*) packed_w) = 0;
1044           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1045         } while (--n != 0);
1046       }
1047       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1048 
1049       for (size_t ki = 0; ki < ks; ki++) {
1050         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1051           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1052             const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1053             ((int8_t*) packed_w)[nr_block_offset * kr] = kv;
1054             packed_b[nr_block_offset] -= (int32_t) kv * izp;
1055           }
1056           packed_w = (int8_t*) packed_w + nr * kr;
1057         }
1058       }
1059       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1060     }
1061     k += nc;
1062     if XNN_UNPREDICTABLE(b != NULL) {
1063       b += nc;
1064     }
1065   }
1066 }
1067 
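// Deconvolution (transposed convolution) weight packing. The kh x kw kernel is split
// into sh * sw subconvolutions, one per output phase (oy, ox); for the first group the
// start of each subconvolution's packed weights is recorded in subconv_params. Within
// a subconvolution, only the taps with ky = oy (mod sh) and kx = ox (mod sw) are
// packed, using the same GOKI scheme as the convolution packers above.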
1068 void xnn_pack_f32_deconv_goki_w(
1069   size_t g,
1070   size_t nc,
1071   size_t kh,
1072   size_t kw,
1073   size_t kc,
1074   size_t sh,
1075   size_t sw,
1076   size_t nr,
1077   size_t kr,
1078   size_t sr,
1079   const float* k,
1080   const float* b,
1081   float* packed_w,
1082   struct subconvolution_params* subconv_params,
1083   const void* params)
1084 {
1085   assert(nr >= sr);
1086 
1087   const size_t skr = sr * kr;
1088   for (size_t i = 0; i < g; i++) {
1089     for (size_t oy = 0; oy < sh; oy++) {
1090       for (size_t ox = 0; ox < sw; ox++) {
1091         if (i == 0) {
1092           (*subconv_params++).weights = packed_w;
1093         }
1094         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1095           const size_t nr_block_size = min(nc - nr_block_start, nr);
1096           if XNN_LIKELY(b != NULL) {
1097             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1098               packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1099             }
1100           }
1101           packed_w += nr;
1102           for (size_t ky = oy; ky < kh; ky += sh) {
1103             for (size_t kx = ox; kx < kw; kx += sw) {
1104               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1105                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1106                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1107                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1108                     if (kc_idx < kc) {
1109                       packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1110                     }
1111                   }
1112                   packed_w += kr;
1113                 }
1114                 packed_w += (nr - nr_block_size) * kr;
1115               }
1116             }
1117           }
1118         }
1119       }
1120     }
1121     k += kh * kw * kc * nc;
1122     if XNN_UNPREDICTABLE(b != NULL) {
1123       b += nc;
1124     }
1125   }
1126 }
1127 
1128 void xnn_pack_f16_deconv_goki_w(
1129   size_t g,
1130   size_t nc,
1131   size_t kh,
1132   size_t kw,
1133   size_t kc,
1134   size_t sh,
1135   size_t sw,
1136   size_t nr,
1137   size_t kr,
1138   size_t sr,
1139   const uint16_t* k,
1140   const uint16_t* b,
1141   uint16_t* packed_w,
1142   struct subconvolution_params* subconv_params,
1143   const void* params)
1144 {
1145   assert(nr >= sr);
1146 
1147   const size_t skr = sr * kr;
1148   for (size_t i = 0; i < g; i++) {
1149     for (size_t oy = 0; oy < sh; oy++) {
1150       for (size_t ox = 0; ox < sw; ox++) {
1151         if (i == 0) {
1152           (*subconv_params++).weights = packed_w;
1153         }
1154         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1155           const size_t nr_block_size = min(nc - nr_block_start, nr);
1156           if XNN_LIKELY(b != NULL) {
1157             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1158               packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1159             }
1160           }
1161           packed_w += nr;
1162           for (size_t ky = oy; ky < kh; ky += sh) {
1163             for (size_t kx = ox; kx < kw; kx += sw) {
1164               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1165                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1166                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1167                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1168                     if (kc_idx < kc) {
1169                       packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1170                     }
1171                   }
1172                   packed_w += kr;
1173                 }
1174                 packed_w += (nr - nr_block_size) * kr;
1175               }
1176             }
1177           }
1178         }
1179       }
1180     }
1181     k += kh * kw * kc * nc;
1182     if XNN_UNPREDICTABLE(b != NULL) {
1183       b += nc;
1184     }
1185   }
1186 }
1187 
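// Quantized deconvolution packing: same subconvolution split as above, with zero-point
// compensation accumulated in the bias slots. For QS8 the slot is seeded with the bias
// (or 0); for QU8 the seed is proportional to the number of kernel taps that fall into
// that particular subconvolution (the divide_round_up terms).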
1188 void xnn_pack_qs8_deconv_goki_w(
1189   size_t g,
1190   size_t nc,
1191   size_t kh,
1192   size_t kw,
1193   size_t kc,
1194   size_t sh,
1195   size_t sw,
1196   size_t nr,
1197   size_t kr,
1198   size_t sr,
1199   const int8_t* k,
1200   const int32_t* b,
1201   void* packed_w,
1202   struct subconvolution_params* subconv_params,
1203   const struct xnn_qs8_packing_params* params)
1204 {
1205   assert(nr >= sr);
1206 
1207   const size_t skr = sr * kr;
1208   const int32_t izp = (int32_t) params->input_zero_point;
1209   for (size_t i = 0; i < g; i++) {
1210     for (size_t oy = 0; oy < sh; oy++) {
1211       for (size_t ox = 0; ox < sw; ox++) {
1212         if (i == 0) {
1213           (*subconv_params++).weights = packed_w;
1214         }
1215         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1216           const size_t nr_block_size = min(nc - nr_block_start, nr);
1217           int32_t* packed_b = (int32_t*) packed_w;
1218           if XNN_LIKELY(b != NULL) {
1219             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1220               *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
1221               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1222             }
1223           } else {
1224             size_t n = nr_block_size;
1225             do {
1226               *((int32_t*) packed_w) = 0;
1227               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1228             } while (--n != 0);
1229           }
1230           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1231           for (size_t ky = oy; ky < kh; ky += sh) {
1232             for (size_t kx = ox; kx < kw; kx += sw) {
1233               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1234                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1235                   int32_t ksum = 0;
1236                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1237                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1238                     if (kc_idx < kc) {
1239                       const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1240                       ksum += (int32_t) kv;
1241                       ((int8_t*) packed_w)[kr_block_offset] = kv;
1242                     }
1243                   }
1244                   packed_b[nr_block_offset] -= ksum * izp;
1245                   packed_w = (int8_t*) packed_w + kr;
1246                 }
1247                 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
1248               }
1249             }
1250           }
1251         }
1252       }
1253     }
1254     k += kh * kw * kc * nc;
1255     if XNN_UNPREDICTABLE(b != NULL) {
1256       b += nc;
1257     }
1258   }
1259 }
1260 
1261 void xnn_pack_qu8_deconv_goki_w(
1262   size_t g,
1263   size_t nc,
1264   size_t kh,
1265   size_t kw,
1266   size_t kc,
1267   size_t sh,
1268   size_t sw,
1269   size_t nr,
1270   size_t kr,
1271   size_t sr,
1272   const uint8_t* k,
1273   const int32_t* b,
1274   void* packed_w,
1275   struct subconvolution_params* subconv_params,
1276   const struct xnn_qu8_packing_params* params)
1277 {
1278   assert(nr >= sr);
1279 
1280   const size_t skr = sr * kr;
1281   const int32_t izp = (int32_t) params->input_zero_point;
1282   const int32_t kzp = (int32_t) params->kernel_zero_point;
1283   for (size_t i = 0; i < g; i++) {
1284     for (size_t oy = 0; oy < sh; oy++) {
1285       for (size_t ox = 0; ox < sw; ox++) {
1286         if (i == 0) {
1287           (*subconv_params++).weights = packed_w;
1288         }
1289         const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
1290         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1291           const size_t nr_block_size = min(nc - nr_block_start, nr);
1292           int32_t* packed_b = (int32_t*) packed_w;
1293           if XNN_LIKELY(b != NULL) {
1294             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1295               *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
1296               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1297             }
1298           } else {
1299             size_t n = nr_block_size;
1300             do {
1301               *((int32_t*) packed_w) = bzp;
1302               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1303             } while (--n != 0);
1304           }
1305           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1306           for (size_t ky = oy; ky < kh; ky += sh) {
1307             for (size_t kx = ox; kx < kw; kx += sw) {
1308               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1309                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1310                   int32_t ksum = 0;
1311                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1312                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1313                     if (kc_idx < kc) {
1314                       const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1315                       ksum += (int32_t) kv;
1316                       ((uint8_t*) packed_w)[kr_block_offset] = kv;
1317                     }
1318                   }
1319                   packed_b[nr_block_offset] -= ksum * izp;
1320                   packed_w = (uint8_t*) packed_w + kr;
1321                 }
1322                 packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
1323               }
1324             }
1325           }
1326         }
1327       }
1328     }
1329     k += kh * kw * kc * nc;
1330     if XNN_UNPREDICTABLE(b != NULL) {
1331       b += nc;
1332     }
1333   }
1334 }
1335 
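// Depthwise convolution weight packing for kernels stored in GHW layout (one input
// channel per group, kernel height x width per channel). For each tile of up to cr
// channels: cr bias values (explicitly zeroed when b is NULL), then the h x w taps
// per channel, iterated width-major (x outer, y inner).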
1336 void xnn_pack_f32_dwconv_ghw_w(
1337   size_t h,
1338   size_t w,
1339   size_t c,
1340   size_t cr,
1341   const float* k,
1342   const float* b,
1343   float* packed_w,
1344   size_t extra_bytes,
1345   const void* params)
1346 {
1347   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1348     const size_t cr_block_size = min(c - cr_block_start, cr);
1349     if XNN_LIKELY(b != NULL) {
1350       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1351         *packed_w++ = b[cr_block_start + cr_block_offset];
1352       }
1353     } else {
1354       size_t n = cr_block_size;
1355       do {
1356         *packed_w++ = 0.0f;
1357       } while (--n != 0);
1358     }
1359     packed_w += cr - cr_block_size;
1360     for (size_t x = 0; x < w; x++) {
1361       for (size_t y = 0; y < h; y++) {
1362         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1363           const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1364           *packed_w++ = kv;
1365         }
1366         packed_w += cr - cr_block_size;
1367       }
1368     }
1369     packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
1370   }
1371 }
1372 
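// Same packing as xnn_pack_f32_dwconv_ghw_w, but for half-precision weights
// and biases stored as uint16_t bit patterns.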
void xnn_pack_f16_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

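// Same layout again, converting FP32 weights and biases to FP16 on the fly
// with fp16_ieee_from_fp32_value().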
void xnn_pack_f32_to_f16_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

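// QU8 variant: each packed bias is pre-adjusted so the micro-kernel need not
// apply zero-point corrections at run time. The bias starts as
// b[c] + h * w * izp * kzp (or just the offset when b is NULL) and is then
// decremented by izp * kv for every tap copied, leaving
// b[c] + izp * (h * w * kzp - sum of the channel's weights).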
void xnn_pack_qu8_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

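// QS8 variant: weights are signed and only the input zero point enters the
// correction, so each packed bias is decremented by izp times the sum of the
// channel's packed weights.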
void xnn_pack_qs8_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

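// The *_dwconv_hwg_w packers below produce the same packed layout as the
// *_ghw_w packers above; the only difference is that the source kernel is in
// HWG order, so taps are read as k[(y * w + x) * c + channel].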
void xnn_pack_f32_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f16_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f32_to_f16_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_qu8_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_qs8_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

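// GEMMINC packing: the same kr/sr weight interleaving as
// xnn_pack_f32_gemm_goi_w, but no bias segment is packed; GEMMINC
// micro-kernels start from externally supplied accumulators instead.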
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

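// DCONV packing (OKI order): every nr-wide output-channel block is written in
// full. For a ragged final block, the bias and weights of the last valid
// channel are replicated into the remaining lanes
// (min(nr_block_offset, nr_block_size - 1)), so the micro-kernel never reads
// uninitialized data. Taps are ordered kx-major, then input channel, then ky.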
void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

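// CHW depthwise packers: no channel interleaving. Each group contributes one
// bias value (zero when bias is NULL) immediately followed by its kernel_size
// taps. Illustrative call with hypothetical shapes (3x3 kernel, 8 groups):
//
//   xnn_pack_f32_chw_dwconv_ghw_w(3 * 3, 8, kernel, bias, packed, NULL);
//   // packed holds 8 * (1 + 9) = 80 floats.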
void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f32_chw_dwconv_hwg_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[i * groups + g];
    }
  }
}

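// VMULCADDC packing: for each block of cr channels, cr scale values followed
// by cr bias values (zeros when b is NULL), with padding lanes skipped.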
void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

void xnn_pack_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const uint16_t* s,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

void xnn_pack_f32_to_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = fp16_ieee_from_fp32_value(s[cr_block_start + cr_block_offset]);
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

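// PReLU packing is a plain copy of the per-channel slopes; the f32-to-f16
// variant converts each slope as it copies.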
void xnn_pack_f32_prelu_w(
  size_t c,
  const float* s,
  float* packed_w)
{
  memcpy(packed_w, s, c * sizeof(float));
}

void xnn_pack_f16_prelu_w(
  size_t c,
  const uint16_t* s,
  uint16_t* packed_w)
{
  memcpy(packed_w, s, c * sizeof(uint16_t));
}

void xnn_pack_f32_to_f16_prelu_w(
  size_t c,
  const float* s,
  uint16_t* packed_w)
{
  do {
    *packed_w++ = fp16_ieee_from_fp32_value(*s++);
  } while (--c != 0);
}