// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#include <fp16.h>

#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/unaligned.h>

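// GEMM weight packing, "goi" layout: k is indexed as [g][nc][kc]
// (groups x output channels x input channels). Each block of nr output
// channels is emitted as nr bias slots (read from b when non-NULL)
// followed by kr-element slivers of kernel values; when sr > 1 the
// kc_idx computation interleaves slivers across sr consecutive kr-blocks.
// Positions past the nc or kc tails are skipped rather than written, so
// the destination buffer is assumed to be zero-initialized by the caller.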
void xnn_pack_f32_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
      packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

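// As above, but converts fp32 weights and bias to IEEE fp16 on the fly
// with fp16_ieee_from_fp32_value().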
void xnn_pack_f32_to_f16_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

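// QU8 variant: zero-point corrections are folded into the packed bias.
// Each bias slot starts at bzp = kc * input_zero_point * kernel_zero_point
// (plus the caller's bias, when present), and the running row sum times
// the input zero point is subtracted as kernel bytes are packed, so the
// microkernel can skip the correction at run time.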
void xnn_pack_qu8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
          packed_w = (int32_t*) packed_w + 1;
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, bzp);
          packed_w = (int32_t*) packed_w + 1;
        } while (--n != 0);
      }
      packed_w = (int32_t*) packed_w + (nr - nr_block_size);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (int32_t) kv;
              ((uint8_t*) packed_w)[kr_block_offset] = kv;
            }
          }
          unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
          packed_w = (uint8_t*) packed_w + kr;
        }
        packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

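// QS8 variant: weights are assumed symmetric (kernel zero point 0), so
// only ksum * input_zero_point is folded into the packed bias.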
void xnn_pack_qs8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (int32_t*) packed_w + 1;
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (int32_t*) packed_w + 1;
        } while (--n != 0);
      }
      packed_w = (int32_t*) packed_w + (nr - nr_block_size);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          uint32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (uint32_t) kv;
              ((int8_t*) packed_w)[kr_block_offset] = kv;
            }
          }
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
          packed_w = (int8_t*) packed_w + kr;
        }
        packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

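// "xw" QS8 variant: same bookkeeping, but each int8 kernel value is
// widened to int16 in the packed buffer for microkernels that consume
// pre-extended weights.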
void xnn_pack_qs8_gemm_xw_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          uint32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (uint32_t) kv;
              ((int16_t*) packed_w)[kr_block_offset] = (int16_t) kv;
            }
          }
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
          packed_w = (int16_t*) packed_w + kr;
        }
        packed_w = (int16_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

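// "io" layout packers: same packed output as the goi packers above, but
// k is indexed as [kc][nc] (input channels x output channels), hence the
// k[kc_idx * nc + ...] indexing, and there is no group dimension.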
void xnn_pack_f32_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
          }
        }
        packed_w += kr;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_f16_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
          }
        }
        packed_w += kr;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_f32_to_f16_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
          }
        }
        packed_w += kr;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_qu8_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
        packed_w = (int32_t*) packed_w + 1;
      }
    } else {
      size_t n = nr_block_size;
      do {
        unaligned_store_s32(packed_w, bzp);
        packed_w = (int32_t*) packed_w + 1;
      } while (--n != 0);
    }
    packed_w = (int32_t*) packed_w + (nr - nr_block_size);

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        int32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
            ksum += (int32_t) kv;
            ((uint8_t*) packed_w)[kr_block_offset] = kv;
          }
        }
        unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
        packed_w = (uint8_t*) packed_w + kr;
      }
      packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_qs8_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
        packed_w = (int32_t*) packed_w + 1;
      }
    } else {
      size_t n = nr_block_size;
      do {
        unaligned_store_s32(packed_w, 0);
        packed_w = (int32_t*) packed_w + 1;
      } while (--n != 0);
    }
    packed_w = (uint32_t*) packed_w + (nr - nr_block_size);

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        uint32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
            ksum += (uint32_t) kv;
            ((int8_t*) packed_w)[kr_block_offset] = kv;
          }
        }
        unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
        packed_w = (int8_t*) packed_w + kr;
      }
      packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
    }
  }
}

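// Convolution weight packing, "goki" layout: k is indexed as
// [g][nc][ks][kc], where ks counts spatial kernel elements. Identical to
// the GEMM goi packing apart from the extra loop over ks.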
void xnn_pack_f32_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
              }
            }
            packed_w += kr;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
      packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
              }
            }
            packed_w += kr;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f32_to_f16_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
              }
            }
            packed_w += kr;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

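// Quantized goki variants: as in the GEMM packers, the packed bias
// absorbs the zero-point terms; the QU8 starting value bzp scales with
// ks * kc here because every spatial position contributes a full kc-deep
// slice to the row sum.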
void xnn_pack_qu8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, bzp);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
                ksum += (int32_t) kv;
                ((uint8_t*) packed_w)[kr_block_offset] = kv;
              }
            }
            unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
            packed_w = (uint8_t*) packed_w + kr;
          }
          packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qs8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            uint32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
                ksum += (uint32_t) kv;
                ((int8_t*) packed_w)[kr_block_offset] = kv;
              }
            }
            unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
            packed_w = (int8_t*) packed_w + kr;
          }
          packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

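// "kgo" layout: k is indexed as [ks][g][nc] with no kc dimension, i.e.
// effectively one input channel per group. With sr > 1, output channels
// are distributed across sr passes ((-sr_block_offset) & (sr - 1) picks
// each pass's starting offset), each pass spanning nr * kr packed
// elements.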
void xnn_pack_f32_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
          }
          packed_w += nr * kr;
        }
      }
      packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
          }
          packed_w += nr * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f32_to_f16_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            packed_w[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
          }
          packed_w += nr * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qu8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, bzp);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
            ((uint8_t*) packed_w)[nr_block_offset * kr] = kv;
            unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - (int32_t) kv * izp);
          }
          packed_w = (uint8_t*) packed_w + nr * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qs8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
            ((int8_t*) packed_w)[nr_block_offset * kr] = kv;
            unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - (uint32_t) kv * izp);
          }
          packed_w = (int8_t*) packed_w + nr * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

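// Deconvolution packing: the kernel is split into sh * sw
// subconvolutions, one per output phase (oy, ox), each packed
// contiguously. On the first group iteration, the start of every
// subconvolution's packed region is recorded in subconv_params.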
void xnn_pack_f32_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f32_to_f16_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx]);
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qs8_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  struct subconvolution_params* subconv_params,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          int32_t* packed_b = (int32_t*) packed_w;
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            }
          } else {
            size_t n = nr_block_size;
            do {
              unaligned_store_s32(packed_w, 0);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            } while (--n != 0);
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  uint32_t ksum = 0;
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                      ksum += (uint32_t) kv;
                      ((int8_t*) packed_w)[kr_block_offset] = kv;
                    }
                  }
                  unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
                  packed_w = (int8_t*) packed_w + kr;
                }
                packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

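// QU8 deconv variant: the bias zero-point term must be recomputed per
// subconvolution, since each phase covers only
// divide_round_up(kh - oy, sh) * divide_round_up(kw - ox, sw) kernel taps.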
void xnn_pack_qu8_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  struct subconvolution_params* subconv_params,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t kzp = (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          int32_t* packed_b = (int32_t*) packed_w;
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            }
          } else {
            size_t n = nr_block_size;
            do {
              unaligned_store_s32(packed_w, bzp);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            } while (--n != 0);
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  int32_t ksum = 0;
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                      ksum += (int32_t) kv;
                      ((uint8_t*) packed_w)[kr_block_offset] = kv;
                    }
                  }
                  unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
                  packed_w = (uint8_t*) packed_w + kr;
                }
                packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

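// Depthwise-convolution packers, GHW kernel layout ([group][height][width]).
// Per block of cr channels the packed stream holds cr bias values (zero if
// b == NULL), then cr kernel values for every tap in x-major order, padding
// partial blocks up to cr; the pointer then skips (primary_tile - h * w)
// unwritten tap slots expected by the micro-kernel tile.
// Illustrative sketch with made-up sizes c = 3, cr = 2, h = w = 1,
// primary_tile = 1: the packed stream is [b0 b1][k0 k1] [b2 0][k2 0].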
void xnn_pack_f32_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f16_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f32_to_f16_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

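// Quantized depthwise packers fold zero-point corrections into the bias so
// the micro-kernel's inner loop stays pure multiply-accumulate. For QU8 each
// bias slot starts at boff = h * w * izp * kzp and is decremented by kv * izp
// as each kernel value kv is packed; the packed bias thus absorbs the
// weight-only terms of sum_i (x_i - izp) * (k_i - kzp), namely
// h * w * izp * kzp - izp * sum_i k_i.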
void xnn_pack_qu8_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, boff + b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, boff);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

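// QS8 variant: kernel quantization is symmetric (no kernel zero point), so
// the bias starts at b (or 0) and only the -izp * sum(kv) correction is
// accumulated. The arithmetic is done in uint32_t deliberately: signed
// overflow is undefined in C, while unsigned wraparound yields the intended
// two's-complement result when the value is reinterpreted as int32_t.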
void xnn_pack_qs8_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, 0);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

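// HWG variants: same packed output as the GHW packers above; only the source
// indexing differs because the kernel is stored [height][width][group] with
// the channel innermost. The value for channel g at tap (y, x) is read from
// k[(y * w + x) * c + g] rather than k[(g * h + y) * w + x].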
void xnn_pack_f32_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f16_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f32_to_f16_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

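// The quantized HWG packers below repeat the QU8/QS8 GHW logic verbatim,
// apart from the [h][w][g] source index.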
void xnn_pack_qu8_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, boff + b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, boff);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_qs8_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, 0);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

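// GEMMINC packing: the same kr/sr-interleaved weight layout as the GEMM
// packers, but with no bias slots -- GEMMINC micro-kernels initialize their
// accumulators from a separate buffer, so only kernel values are written.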
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

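// DCONV packers pad partial output-channel blocks by replication rather than
// with zeros: min(nr_block_offset, nr_block_size - 1) clamps the channel
// index, so the last real channel's bias and weights fill the remainder of
// the nr tile. The micro-kernel always consumes a full tile and the caller
// discards the duplicated outputs.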
void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

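// CHW depthwise packers use the simplest layout: per group, one bias value
// (zero if bias == NULL) followed by its kernel_size taps contiguously, with
// no channel tiling or padding; GHW vs. HWG again only changes the source
// index.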
void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f32_chw_dwconv_hwg_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[i * groups + g];
    }
  }
}

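// VMULCADDC packers interleave per-channel scale and bias by cr-sized
// blocks: cr scale values, then cr bias values, each zero-padded for a
// partial final block.
// Illustrative sketch with made-up sizes c = 3, cr = 2:
//   [s0 s1][b0 b1] [s2 0][b2 0]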
void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

void xnn_pack_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const uint16_t* s,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

void xnn_pack_f32_to_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = fp16_ieee_from_fp32_value(s[cr_block_start + cr_block_offset]);
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

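// PReLU packers copy the per-channel slopes verbatim (converting f32 to f16
// where needed). The f32-to-f16 variant below iterates with do/while, so it
// assumes c != 0, like the other do/while loops in this file.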
void xnn_pack_f32_prelu_w(
  size_t c,
  const float* s,
  float* packed_w)
{
  memcpy(packed_w, s, c * sizeof(float));
}

void xnn_pack_f16_prelu_w(
  size_t c,
  const uint16_t* s,
  uint16_t* packed_w)
{
  memcpy(packed_w, s, c * sizeof(uint16_t));
}

void xnn_pack_f32_to_f16_prelu_w(
  size_t c,
  const float* s,
  uint16_t* packed_w)
{
  do {
    *packed_w++ = fp16_ieee_from_fp32_value(*s++);
  } while (--c != 0);
}
2130