1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <fp16.h>

#include <xnnpack/math.h>
#include <xnnpack/pack.h>
17
18
xnn_pack_f32_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)19 void xnn_pack_f32_gemm_goi_w(
20 size_t g,
21 size_t nc,
22 size_t kc,
23 size_t nr,
24 size_t kr,
25 size_t sr,
26 const float* k,
27 const float* b,
28 float* packed_w,
29 size_t extra_bytes,
30 const void* params)
31 {
32 assert(nr >= sr);
33
34 const size_t skr = sr * kr;
35 do {
36 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
37 const size_t nr_block_size = min(nc - nr_block_start, nr);
38 if XNN_LIKELY(b != NULL) {
39 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
40 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
41 }
42 }
43 packed_w += nr;
44
45 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
46 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
47 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
48 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
49 if (kc_idx < kc) {
50 packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
51 }
52 }
53 packed_w += kr;
54 }
55 packed_w += (nr - nr_block_size) * kr;
56 }
57 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
58 }
59 k += nc * kc;
60 if XNN_UNPREDICTABLE(b != NULL) {
61 b += nc;
62 }
63 } while (--g != 0);
64 }
65
xnn_pack_f16_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)66 void xnn_pack_f16_gemm_goi_w(
67 size_t g,
68 size_t nc,
69 size_t kc,
70 size_t nr,
71 size_t kr,
72 size_t sr,
73 const uint16_t* k,
74 const uint16_t* b,
75 uint16_t* packed_w,
76 size_t extra_bytes,
77 const void* params)
78 {
79 assert(nr >= sr);
80
81 const size_t skr = sr * kr;
82 do {
83 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
84 const size_t nr_block_size = min(nc - nr_block_start, nr);
85 if XNN_LIKELY(b != NULL) {
86 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
87 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
88 }
89 }
90 packed_w += nr;
91
92 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
93 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
94 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
95 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
96 if (kc_idx < kc) {
97 packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
98 }
99 }
100 packed_w += kr;
101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
104 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
105 }
106 k += nc * kc;
107 if XNN_UNPREDICTABLE(b != NULL) {
108 b += nc;
109 }
110 } while (--g != 0);
111 }
112
xnn_pack_f32_to_f16_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)113 void xnn_pack_f32_to_f16_gemm_goi_w(
114 size_t g,
115 size_t nc,
116 size_t kc,
117 size_t nr,
118 size_t kr,
119 size_t sr,
120 const float* k,
121 const float* b,
122 uint16_t* packed_w,
123 size_t extra_bytes,
124 const void* params)
125 {
126 assert(nr >= sr);
127
128 const size_t skr = sr * kr;
129 do {
130 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
131 const size_t nr_block_size = min(nc - nr_block_start, nr);
132 if XNN_LIKELY(b != NULL) {
133 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
134 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
135 }
136 }
137 packed_w += nr;
138
139 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
140 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
141 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
142 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
143 if (kc_idx < kc) {
144 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
145 }
146 }
147 packed_w += kr;
148 }
149 packed_w += (nr - nr_block_size) * kr;
150 }
151 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
152 }
153 k += nc * kc;
154 if XNN_UNPREDICTABLE(b != NULL) {
155 b += nc;
156 }
157 } while (--g != 0);
158 }
159
// Packs a QU8 (asymmetric unsigned 8-bit) GEMM weight tensor from
// groups-output-input (GOI) layout into the blocked layout consumed by
// XNNPACK GEMM microkernels. The per-channel zero-point correction terms are
// folded into the packed int32 biases while the weights are packed.
//
//   g           number of weight groups
//   nc          output channels per group
//   kc          input channels per group
//   nr, kr, sr  microkernel packing parameters (requires nr >= sr)
//   k           kernel weights, logically [g][nc][kc]
//   b           biases, logically [g][nc]; may be NULL
//   packed_w    destination buffer
//   extra_bytes padding appended after each nr-block
//   params      supplies input and kernel zero points
void xnn_pack_qu8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  // Constant part of the bias correction: kc * input_zp * kernel_zp.
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      // Keep a pointer to this block's biases: they are adjusted in place
      // below as the per-channel weight sums are accumulated.
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
          packed_w = (int32_t*) packed_w + 1;
        }
      } else {
        // No bias supplied: pack the zero-point correction alone.
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = bzp;
          packed_w = (int32_t*) packed_w + 1;
        } while (--n != 0);
      }
      packed_w = (int32_t*) packed_w + (nr - nr_block_size);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            // sr-way shuffle of input channels within each skr-aligned chunk.
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (int32_t) kv;
              ((uint8_t*) packed_w)[kr_block_offset] = kv;
            }
          }
          // Fold -sum(weights) * input_zero_point into the packed bias.
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (uint8_t*) packed_w + kr;
        }
        packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}
220
// Packs a QS8 (signed 8-bit) GEMM weight tensor from groups-output-input
// (GOI) layout into the blocked layout consumed by XNNPACK GEMM microkernels.
// Per-channel weight sums times the input zero point are subtracted from the
// packed int32 biases while packing (symmetric weights: no kernel zero point).
// b may be NULL, in which case the biases start at zero before adjustment.
void xnn_pack_qs8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      // Biases of this block; adjusted in place as weight sums accumulate.
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (int32_t*) packed_w + 1;
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (int32_t*) packed_w + 1;
        } while (--n != 0);
      }
      packed_w = (int32_t*) packed_w + (nr - nr_block_size);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            // sr-way shuffle of input channels within each skr-aligned chunk.
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (int32_t) kv;
              ((int8_t*) packed_w)[kr_block_offset] = kv;
            }
          }
          // Fold -sum(weights) * input_zero_point into the packed bias.
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (int8_t*) packed_w + kr;
        }
        packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}
280
// Packs a QS8 GEMM weight tensor from groups-output-input (GOI) layout into
// the extended-width ("xw") blocked layout: each int8 weight is widened to
// int16 in the packed stream (for microkernels that consume pre-widened
// weights). Bias handling matches xnn_pack_qs8_gemm_goi_w.
void xnn_pack_qs8_gemm_xw_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      // Biases of this block; adjusted in place as weight sums accumulate.
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            // sr-way shuffle of input channels within each skr-aligned chunk.
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (int32_t) kv;
              // Widen to int16 in the packed stream.
              ((int16_t*) packed_w)[kr_block_offset] = (int16_t) kv;
            }
          }
          // Fold -sum(weights) * input_zero_point into the packed bias.
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (int16_t*) packed_w + kr;
        }
        packed_w = (int16_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}
340
xnn_pack_f32_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,const void * params)341 void xnn_pack_f32_gemm_io_w(
342 size_t nc,
343 size_t kc,
344 size_t nr,
345 size_t kr,
346 size_t sr,
347 const float* k,
348 const float* b,
349 float* packed_w,
350 const void* params)
351 {
352 assert(nr >= sr);
353
354 const size_t skr = sr * kr;
355 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
356 const size_t nr_block_size = min(nc - nr_block_start, nr);
357 if XNN_LIKELY(b != NULL) {
358 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
359 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
360 }
361 }
362 packed_w += nr;
363
364 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
365 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
366 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
367 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
368 if (kc_idx < kc) {
369 packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
370 }
371 }
372 packed_w += kr;
373 }
374 packed_w += (nr - nr_block_size) * kr;
375 }
376 }
377 }
378
xnn_pack_f16_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)379 void xnn_pack_f16_gemm_io_w(
380 size_t nc,
381 size_t kc,
382 size_t nr,
383 size_t kr,
384 size_t sr,
385 const uint16_t* k,
386 const uint16_t* b,
387 uint16_t* packed_w,
388 const void* params)
389 {
390 assert(nr >= sr);
391
392 const size_t skr = sr * kr;
393 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
394 const size_t nr_block_size = min(nc - nr_block_start, nr);
395 if XNN_LIKELY(b != NULL) {
396 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
397 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
398 }
399 }
400 packed_w += nr;
401
402 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
403 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
404 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
405 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
406 if (kc_idx < kc) {
407 packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
408 }
409 }
410 packed_w += kr;
411 }
412 packed_w += (nr - nr_block_size) * kr;
413 }
414 }
415 }
416
xnn_pack_f32_to_f16_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,const void * params)417 void xnn_pack_f32_to_f16_gemm_io_w(
418 size_t nc,
419 size_t kc,
420 size_t nr,
421 size_t kr,
422 size_t sr,
423 const float* k,
424 const float* b,
425 uint16_t* packed_w,
426 const void* params)
427 {
428 assert(nr >= sr);
429
430 const size_t skr = sr * kr;
431 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
432 const size_t nr_block_size = min(nc - nr_block_start, nr);
433 if XNN_LIKELY(b != NULL) {
434 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
435 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
436 }
437 }
438 packed_w += nr;
439
440 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
441 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
442 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
443 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
444 if (kc_idx < kc) {
445 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
446 }
447 }
448 packed_w += kr;
449 }
450 packed_w += (nr - nr_block_size) * kr;
451 }
452 }
453 }
454
// Packs a QU8 GEMM weight tensor from input-output (IO) layout into the
// blocked layout consumed by XNNPACK GEMM microkernels, folding zero-point
// correction terms into the packed int32 biases. Single group.
void xnn_pack_qu8_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  // Constant part of the bias correction: kc * input_zp * kernel_zp.
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    // Biases of this block; adjusted in place as weight sums accumulate.
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
        packed_w = (int32_t*) packed_w + 1;
      }
    } else {
      // No bias supplied: pack the zero-point correction alone.
      size_t n = nr_block_size;
      do {
        *((int32_t*) packed_w) = bzp;
        packed_w = (int32_t*) packed_w + 1;
      } while (--n != 0);
    }
    packed_w = (int32_t*) packed_w + (nr - nr_block_size);

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        int32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          // sr-way shuffle of input channels within each skr-aligned chunk.
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            // IO layout: row = input channel, column = output channel.
            const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
            ksum += (int32_t) kv;
            ((uint8_t*) packed_w)[kr_block_offset] = kv;
          }
        }
        // Fold -sum(weights) * input_zero_point into the packed bias.
        packed_b[nr_block_offset] -= ksum * izp;
        packed_w = (uint8_t*) packed_w + kr;
      }
      packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
    }
  }
}
506
xnn_pack_qs8_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)507 void xnn_pack_qs8_gemm_io_w(
508 size_t nc,
509 size_t kc,
510 size_t nr,
511 size_t kr,
512 size_t sr,
513 const int8_t* k,
514 const int32_t* b,
515 void* packed_w,
516 const struct xnn_qs8_packing_params* params)
517 {
518 assert(nr >= sr);
519
520 const size_t skr = sr * kr;
521 const int32_t izp = (int32_t) params->input_zero_point;
522 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
523 const size_t nr_block_size = min(nc - nr_block_start, nr);
524 int32_t* packed_b = (int32_t*) packed_w;
525 if XNN_LIKELY(b != NULL) {
526 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
527 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
528 packed_w = (int32_t*) packed_w + 1;
529 }
530 } else {
531 size_t n = nr_block_size;
532 do {
533 *((int32_t*) packed_w) = 0;
534 packed_w = (int32_t*) packed_w + 1;
535 } while (--n != 0);
536 }
537 packed_w = (uint32_t*) packed_w + (nr - nr_block_size);
538
539 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
540 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
541 int32_t ksum = 0;
542 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
543 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
544 if (kc_idx < kc) {
545 const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
546 ksum += (int32_t) kv;
547 ((int8_t*) packed_w)[kr_block_offset] = kv;
548 }
549 }
550 packed_b[nr_block_offset] -= ksum * izp;
551 packed_w = (int8_t*) packed_w + kr;
552 }
553 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
554 }
555 }
556 }
557
xnn_pack_f32_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)558 void xnn_pack_f32_conv_goki_w(
559 size_t g,
560 size_t nc,
561 size_t ks,
562 size_t kc,
563 size_t nr,
564 size_t kr,
565 size_t sr,
566 const float* k,
567 const float* b,
568 float* packed_w,
569 size_t extra_bytes,
570 const void* params)
571 {
572 assert(nr >= sr);
573
574 const size_t skr = sr * kr;
575 do {
576 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
577 const size_t nr_block_size = min(nc - nr_block_start, nr);
578 if XNN_LIKELY(b != NULL) {
579 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
580 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
581 }
582 }
583 packed_w += nr;
584
585 for (size_t ki = 0; ki < ks; ki++) {
586 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
587 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
588 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
589 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
590 if (kc_idx < kc) {
591 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
592 }
593 }
594 packed_w += kr;
595 }
596 packed_w += (nr - nr_block_size) * kr;
597 }
598 }
599 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
600 }
601 k += ks * kc * nc;
602 if XNN_UNPREDICTABLE(b != NULL) {
603 b += nc;
604 }
605 } while (--g != 0);
606 }
607
xnn_pack_f16_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)608 void xnn_pack_f16_conv_goki_w(
609 size_t g,
610 size_t nc,
611 size_t ks,
612 size_t kc,
613 size_t nr,
614 size_t kr,
615 size_t sr,
616 const uint16_t* k,
617 const uint16_t* b,
618 uint16_t* packed_w,
619 size_t extra_bytes,
620 const void* params)
621 {
622 assert(nr >= sr);
623
624 const size_t skr = sr * kr;
625 do {
626 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
627 const size_t nr_block_size = min(nc - nr_block_start, nr);
628 if XNN_LIKELY(b != NULL) {
629 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
630 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
631 }
632 }
633 packed_w += nr;
634
635 for (size_t ki = 0; ki < ks; ki++) {
636 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
637 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
638 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
639 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
640 if (kc_idx < kc) {
641 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
642 }
643 }
644 packed_w += kr;
645 }
646 packed_w += (nr - nr_block_size) * kr;
647 }
648 }
649 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
650 }
651 k += ks * kc * nc;
652 if XNN_UNPREDICTABLE(b != NULL) {
653 b += nc;
654 }
655 } while (--g != 0);
656 }
657
xnn_pack_f32_to_f16_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)658 void xnn_pack_f32_to_f16_conv_goki_w(
659 size_t g,
660 size_t nc,
661 size_t ks,
662 size_t kc,
663 size_t nr,
664 size_t kr,
665 size_t sr,
666 const float* k,
667 const float* b,
668 uint16_t* packed_w,
669 size_t extra_bytes,
670 const void* params)
671 {
672 assert(nr >= sr);
673
674 const size_t skr = sr * kr;
675 do {
676 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
677 const size_t nr_block_size = min(nc - nr_block_start, nr);
678 if XNN_LIKELY(b != NULL) {
679 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
680 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
681 }
682 }
683 packed_w += nr;
684
685 for (size_t ki = 0; ki < ks; ki++) {
686 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
687 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
688 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
689 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
690 if (kc_idx < kc) {
691 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
692 }
693 }
694 packed_w += kr;
695 }
696 packed_w += (nr - nr_block_size) * kr;
697 }
698 }
699 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
700 }
701 k += ks * kc * nc;
702 if XNN_UNPREDICTABLE(b != NULL) {
703 b += nc;
704 }
705 } while (--g != 0);
706 }
707
// Packs a QU8 convolution weight tensor from groups-output-kernel-input
// (GOKI) layout into the blocked layout consumed by XNNPACK IGEMM
// microkernels, folding zero-point correction terms into the packed int32
// biases across all ks kernel spatial positions.
void xnn_pack_qu8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  // Constant part of the bias correction: ks * kc * input_zp * kernel_zp.
  const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      // Biases of this block; adjusted in place as weight sums accumulate.
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        // No bias supplied: pack the zero-point correction alone.
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = bzp;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              // sr-way shuffle of input channels within each skr-aligned chunk.
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
                ksum += (int32_t) kv;
                ((uint8_t*) packed_w)[kr_block_offset] = kv;
              }
            }
            // Fold -sum(weights) * input_zero_point into the packed bias.
            packed_b[nr_block_offset] -= ksum * izp;
            packed_w = (uint8_t*) packed_w + kr;
          }
          packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}
771
// Packs a QS8 convolution weight tensor from groups-output-kernel-input
// (GOKI) layout into the blocked layout consumed by XNNPACK IGEMM
// microkernels. Per-channel weight sums (over all ks spatial positions)
// times the input zero point are subtracted from the packed int32 biases.
void xnn_pack_qs8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      // Biases of this block; adjusted in place as weight sums accumulate.
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              // sr-way shuffle of input channels within each skr-aligned chunk.
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
                ksum += (int32_t) kv;
                ((int8_t*) packed_w)[kr_block_offset] = kv;
              }
            }
            // Fold -sum(weights) * input_zero_point into the packed bias.
            packed_b[nr_block_offset] -= ksum * izp;
            packed_w = (int8_t*) packed_w + kr;
          }
          packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}
834
xnn_pack_f32_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)835 void xnn_pack_f32_conv_kgo_w(
836 size_t g,
837 size_t nc,
838 size_t ks,
839 size_t nr,
840 size_t kr,
841 size_t sr,
842 const float* k,
843 const float* b,
844 float* packed_w,
845 size_t extra_bytes,
846 const void* params)
847 {
848 assert(nr >= sr);
849
850 for (size_t i = 0; i < g; i++) {
851 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
852 const size_t nr_block_size = min(nc - nr_block_start, nr);
853 if XNN_LIKELY(b != NULL) {
854 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
855 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
856 }
857 }
858 packed_w += nr;
859
860 for (size_t ki = 0; ki < ks; ki++) {
861 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
862 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
863 packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
864 }
865 packed_w += nr * kr;
866 }
867 }
868 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
869 }
870 k += nc;
871 if XNN_UNPREDICTABLE(b != NULL) {
872 b += nc;
873 }
874 }
875 }
876
xnn_pack_f16_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)877 void xnn_pack_f16_conv_kgo_w(
878 size_t g,
879 size_t nc,
880 size_t ks,
881 size_t nr,
882 size_t kr,
883 size_t sr,
884 const uint16_t* k,
885 const uint16_t* b,
886 uint16_t* packed_w,
887 size_t extra_bytes,
888 const void* params)
889 {
890 assert(nr >= sr);
891
892 for (size_t i = 0; i < g; i++) {
893 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
894 const size_t nr_block_size = min(nc - nr_block_start, nr);
895 if XNN_LIKELY(b != NULL) {
896 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
897 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
898 }
899 }
900 packed_w += nr;
901
902 for (size_t ki = 0; ki < ks; ki++) {
903 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
904 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
905 packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
906 }
907 packed_w += nr * kr;
908 }
909 }
910 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
911 }
912 k += nc;
913 if XNN_UNPREDICTABLE(b != NULL) {
914 b += nc;
915 }
916 }
917 }
918
xnn_pack_f32_to_f16_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)919 void xnn_pack_f32_to_f16_conv_kgo_w(
920 size_t g,
921 size_t nc,
922 size_t ks,
923 size_t nr,
924 size_t kr,
925 size_t sr,
926 const float* k,
927 const float* b,
928 uint16_t* packed_w,
929 size_t extra_bytes,
930 const void* params)
931 {
932 assert(nr >= sr);
933
934 for (size_t i = 0; i < g; i++) {
935 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
936 const size_t nr_block_size = min(nc - nr_block_start, nr);
937 if XNN_LIKELY(b != NULL) {
938 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
939 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
940 }
941 }
942 packed_w += nr;
943
944 for (size_t ki = 0; ki < ks; ki++) {
945 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
946 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
947 packed_w[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
948 }
949 packed_w += nr * kr;
950 }
951 }
952 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
953 }
954 k += nc;
955 if XNN_UNPREDICTABLE(b != NULL) {
956 b += nc;
957 }
958 }
959 }
960
// Packs a QU8 convolution kernel stored in KGO order ([ks][g][nc]).
// Each nr-wide output-channel tile starts with nr int32 bias slots that are
// pre-biased by ks * input_zero_point * kernel_zero_point and corrected by
// subtracting input_zero_point * (sum of the tile's kernel bytes) while the
// weights are packed.
void xnn_pack_qu8_conv_kgo_w(
    size_t g,
    size_t nc,
    size_t ks,
    size_t nr,
    size_t kr,
    size_t sr,
    const uint8_t* k,
    const int32_t* b,
    void* packed_w,
    size_t extra_bytes,
    const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const int32_t izp = (int32_t) params->input_zero_point;
  // Constant zero-point product folded into every bias value (one izp*kzp
  // term per spatial tap).
  const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      // packed_b aliases the bias slots so they can be adjusted below while
      // the weights are being packed.
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = bzp;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      // Skip the unused tail slots of a partial tile.
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          // Channels are strided by sr; the start offset realigns each
          // sub-block (sr is a power of two).
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
            ((uint8_t*) packed_w)[nr_block_offset * kr] = kv;
            // Accumulate -izp * k so bias ends up as bzp + b - izp * sum(k).
            packed_b[nr_block_offset] -= (int32_t) kv * izp;
          }
          packed_w = (uint8_t*) packed_w + nr * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}
1014
// Packs a QS8 convolution kernel stored in KGO order ([ks][g][nc]).
// Each nr-wide output-channel tile starts with nr int32 bias slots that are
// corrected by subtracting input_zero_point * (sum of the tile's kernel
// bytes) while the weights are packed.
void xnn_pack_qs8_conv_kgo_w(
    size_t g,
    size_t nc,
    size_t ks,
    size_t nr,
    size_t kr,
    size_t sr,
    const int8_t* k,
    const int32_t* b,
    void* packed_w,
    size_t extra_bytes,
    const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      // packed_b aliases the bias slots so they can be adjusted below while
      // the weights are being packed.
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      // Skip the unused tail slots of a partial tile.
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          // Channels are strided by sr; the start offset realigns each
          // sub-block (sr is a power of two).
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
            ((int8_t*) packed_w)[nr_block_offset * kr] = kv;
            // Accumulate -izp * k so bias ends up as b - izp * sum(k).
            packed_b[nr_block_offset] -= (int32_t) kv * izp;
          }
          packed_w = (int8_t*) packed_w + nr * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}
1067
// Packs an f32 deconvolution (transposed convolution) kernel stored in GOKI
// order for the subconvolution algorithm: the kernel is partitioned into
// sh*sw subkernels, one per output phase (oy, ox), and each subkernel is
// packed like a GEMM weight block (bias, then sr/kr-interleaved channels).
void xnn_pack_f32_deconv_goki_w(
    size_t g,
    size_t nc,
    size_t kh,
    size_t kw,
    size_t kc,
    size_t sh,
    size_t sw,
    size_t nr,
    size_t kr,
    size_t sr,
    const float* k,
    const float* b,
    float* packed_w,
    struct subconvolution_params* subconv_params,
    const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          // Record the start of each subkernel's weights once (first group).
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          // Bias comes first; the tail of a partial tile is left untouched.
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          // Only the taps belonging to this (oy, ox) phase are packed.
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    // Input-channel index after sr-interleaving within a
                    // skr-aligned window (skr is a power of two).
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}
1127
// Packs an f16 deconvolution (transposed convolution) kernel stored in GOKI
// order for the subconvolution algorithm. Same layout as the f32 variant,
// with uint16_t (IEEE half bit patterns) elements.
void xnn_pack_f16_deconv_goki_w(
    size_t g,
    size_t nc,
    size_t kh,
    size_t kw,
    size_t kc,
    size_t sh,
    size_t sw,
    size_t nr,
    size_t kr,
    size_t sr,
    const uint16_t* k,
    const uint16_t* b,
    uint16_t* packed_w,
    struct subconvolution_params* subconv_params,
    const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          // Record the start of each subkernel's weights once (first group).
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          // Bias comes first; the tail of a partial tile is left untouched.
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          // Only the taps belonging to this (oy, ox) phase are packed.
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    // Input-channel index after sr-interleaving within a
                    // skr-aligned window (skr is a power of two).
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}
1187
xnn_pack_qs8_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,struct subconvolution_params * subconv_params,const struct xnn_qs8_packing_params * params)1188 void xnn_pack_qs8_deconv_goki_w(
1189 size_t g,
1190 size_t nc,
1191 size_t kh,
1192 size_t kw,
1193 size_t kc,
1194 size_t sh,
1195 size_t sw,
1196 size_t nr,
1197 size_t kr,
1198 size_t sr,
1199 const int8_t* k,
1200 const int32_t* b,
1201 void* packed_w,
1202 struct subconvolution_params* subconv_params,
1203 const struct xnn_qs8_packing_params* params)
1204 {
1205 assert(nr >= sr);
1206
1207 const size_t skr = sr * kr;
1208 const int32_t izp = (int32_t) params->input_zero_point;
1209 for (size_t i = 0; i < g; i++) {
1210 for (size_t oy = 0; oy < sh; oy++) {
1211 for (size_t ox = 0; ox < sw; ox++) {
1212 if (i == 0) {
1213 (*subconv_params++).weights = packed_w;
1214 }
1215 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1216 const size_t nr_block_size = min(nc - nr_block_start, nr);
1217 int32_t* packed_b = (int32_t*) packed_w;
1218 if XNN_LIKELY(b != 0) {
1219 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1220 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
1221 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1222 }
1223 } else {
1224 size_t n = nr_block_size;
1225 do {
1226 *((int32_t*) packed_w) = 0;
1227 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1228 } while (--n != 0);
1229 }
1230 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1231 for (size_t ky = oy; ky < kh; ky += sh) {
1232 for (size_t kx = ox; kx < kw; kx += sw) {
1233 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1234 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1235 int32_t ksum = 0;
1236 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1237 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1238 if (kc_idx < kc) {
1239 const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1240 ksum += (int32_t) kv;
1241 ((int8_t*) packed_w)[kr_block_offset] = kv;
1242 }
1243 }
1244 packed_b[nr_block_offset] -= ksum * izp;
1245 packed_w = (int8_t*) packed_w + kr;
1246 }
1247 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
1248 }
1249 }
1250 }
1251 }
1252 }
1253 }
1254 k += kh * kw * kc * nc;
1255 if XNN_UNPREDICTABLE(b != NULL) {
1256 b += nc;
1257 }
1258 }
1259 }
1260
xnn_pack_qu8_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,struct subconvolution_params * subconv_params,const struct xnn_qu8_packing_params * params)1261 void xnn_pack_qu8_deconv_goki_w(
1262 size_t g,
1263 size_t nc,
1264 size_t kh,
1265 size_t kw,
1266 size_t kc,
1267 size_t sh,
1268 size_t sw,
1269 size_t nr,
1270 size_t kr,
1271 size_t sr,
1272 const uint8_t* k,
1273 const int32_t* b,
1274 void* packed_w,
1275 struct subconvolution_params* subconv_params,
1276 const struct xnn_qu8_packing_params* params)
1277 {
1278 assert(nr >= sr);
1279
1280 const size_t skr = sr * kr;
1281 const int32_t izp = (int32_t) params->input_zero_point;
1282 const int32_t kzp = (int32_t) params->kernel_zero_point;
1283 for (size_t i = 0; i < g; i++) {
1284 for (size_t oy = 0; oy < sh; oy++) {
1285 for (size_t ox = 0; ox < sw; ox++) {
1286 if (i == 0) {
1287 (*subconv_params++).weights = packed_w;
1288 }
1289 const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
1290 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1291 const size_t nr_block_size = min(nc - nr_block_start, nr);
1292 int32_t* packed_b = (int32_t*) packed_w;
1293 if XNN_LIKELY(b != 0) {
1294 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1295 *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
1296 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1297 }
1298 } else {
1299 size_t n = nr_block_size;
1300 do {
1301 *((int32_t*) packed_w) = bzp;
1302 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1303 } while (--n != 0);
1304 }
1305 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1306 for (size_t ky = oy; ky < kh; ky += sh) {
1307 for (size_t kx = ox; kx < kw; kx += sw) {
1308 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1309 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1310 int32_t ksum = 0;
1311 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1312 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1313 if (kc_idx < kc) {
1314 const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1315 ksum += (int32_t) kv;
1316 ((uint8_t*) packed_w)[kr_block_offset] = kv;
1317 }
1318 }
1319 packed_b[nr_block_offset] -= ksum * izp;
1320 packed_w = (uint8_t*) packed_w + kr;
1321 }
1322 packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
1323 }
1324 }
1325 }
1326 }
1327 }
1328 }
1329 k += kh * kw * kc * nc;
1330 if XNN_UNPREDICTABLE(b != NULL) {
1331 b += nc;
1332 }
1333 }
1334 }
1335
xnn_pack_f32_dwconv_ghw_w(size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)1336 void xnn_pack_f32_dwconv_ghw_w(
1337 size_t h,
1338 size_t w,
1339 size_t c,
1340 size_t cr,
1341 const float* k,
1342 const float* b,
1343 float* packed_w,
1344 size_t extra_bytes,
1345 const void* params)
1346 {
1347 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1348 const size_t cr_block_size = min(c - cr_block_start, cr);
1349 if XNN_LIKELY(b != NULL) {
1350 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1351 *packed_w++ = b[cr_block_start + cr_block_offset];
1352 }
1353 } else {
1354 size_t n = cr_block_size;
1355 do {
1356 *packed_w++ = 0.0f;
1357 } while (--n != 0);
1358 }
1359 packed_w += cr - cr_block_size;
1360 for (size_t x = 0; x < w; x++) {
1361 for (size_t y = 0; y < h; y++) {
1362 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1363 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1364 *packed_w++ = kv;
1365 }
1366 packed_w += cr - cr_block_size;
1367 }
1368 }
1369 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
1370 }
1371 }
1372
xnn_pack_f16_dwconv_ghw_w(size_t h,size_t w,size_t c,size_t cr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)1373 void xnn_pack_f16_dwconv_ghw_w(
1374 size_t h,
1375 size_t w,
1376 size_t c,
1377 size_t cr,
1378 const uint16_t* k,
1379 const uint16_t* b,
1380 uint16_t* packed_w,
1381 size_t extra_bytes,
1382 const void* params)
1383 {
1384 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1385 const size_t cr_block_size = min(c - cr_block_start, cr);
1386 if XNN_LIKELY(b != NULL) {
1387 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1388 *packed_w++ = b[cr_block_start + cr_block_offset];
1389 }
1390 } else {
1391 size_t n = cr_block_size;
1392 do {
1393 *packed_w++ = 0;
1394 } while (--n != 0);
1395 }
1396 packed_w += cr - cr_block_size;
1397 for (size_t x = 0; x < w; x++) {
1398 for (size_t y = 0; y < h; y++) {
1399 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1400 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1401 *packed_w++ = kv;
1402 }
1403 packed_w += cr - cr_block_size;
1404 }
1405 }
1406 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1407 }
1408 }
1409
xnn_pack_f32_to_f16_dwconv_ghw_w(size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)1410 void xnn_pack_f32_to_f16_dwconv_ghw_w(
1411 size_t h,
1412 size_t w,
1413 size_t c,
1414 size_t cr,
1415 const float* k,
1416 const float* b,
1417 uint16_t* packed_w,
1418 size_t extra_bytes,
1419 const void* params)
1420 {
1421 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1422 const size_t cr_block_size = min(c - cr_block_start, cr);
1423 if XNN_LIKELY(b != NULL) {
1424 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1425 *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
1426 }
1427 } else {
1428 size_t n = cr_block_size;
1429 do {
1430 *packed_w++ = 0;
1431 } while (--n != 0);
1432 }
1433 packed_w += cr - cr_block_size;
1434 for (size_t x = 0; x < w; x++) {
1435 for (size_t y = 0; y < h; y++) {
1436 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1437 const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
1438 *packed_w++ = kv;
1439 }
1440 packed_w += cr - cr_block_size;
1441 }
1442 }
1443 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1444 }
1445 }
1446
// Packs a QU8 depthwise-convolution kernel stored in GHW order ([g][h][w]).
// Each cr-wide channel tile starts with cr int32 bias slots pre-biased by
// h * w * input_zero_point * kernel_zero_point and corrected by subtracting
// input_zero_point * (sum of the channel's kernel bytes).
void xnn_pack_qu8_dwconv_ghw_w(
    size_t h,
    size_t w,
    size_t c,
    size_t cr,
    const uint8_t* k,
    const int32_t* b,
    void* packed_w,
    size_t extra_bytes,
    const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  // Zero-point product folded into every bias value: one izp*kzp term per tap.
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    // packed_b aliases the bias slots so they can be adjusted below while
    // the weights are being packed.
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    // Skip the unused tail slots of a partial tile.
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          // Accumulate -izp * k so bias ends up as b + boff - izp * sum(k).
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}
1490
// Packs a QS8 depthwise-convolution kernel stored in GHW order ([g][h][w]).
// Each cr-wide channel tile starts with cr int32 bias slots corrected by
// subtracting input_zero_point * (sum of the channel's kernel bytes).
void xnn_pack_qs8_dwconv_ghw_w(
    size_t h,
    size_t w,
    size_t c,
    size_t cr,
    const int8_t* k,
    const int32_t* b,
    void* packed_w,
    size_t extra_bytes,
    const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    // packed_b aliases the bias slots so they can be adjusted below while
    // the weights are being packed.
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    // Skip the unused tail slots of a partial tile.
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          // Accumulate -izp * k so bias ends up as b - izp * sum(k).
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}
1533
xnn_pack_f32_dwconv_hwg_w(size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)1534 void xnn_pack_f32_dwconv_hwg_w(
1535 size_t h,
1536 size_t w,
1537 size_t c,
1538 size_t cr,
1539 const float* k,
1540 const float* b,
1541 float* packed_w,
1542 size_t extra_bytes,
1543 const void* params)
1544 {
1545 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1546 const size_t cr_block_size = min(c - cr_block_start, cr);
1547 if XNN_LIKELY(b != NULL) {
1548 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1549 *packed_w++ = b[cr_block_start + cr_block_offset];
1550 }
1551 } else {
1552 size_t n = cr_block_size;
1553 do {
1554 *packed_w++ = 0.0f;
1555 } while (--n != 0);
1556 }
1557 packed_w += cr - cr_block_size;
1558 for (size_t x = 0; x < w; x++) {
1559 for (size_t y = 0; y < h; y++) {
1560 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1561 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1562 *packed_w++ = kv;
1563 }
1564 packed_w += cr - cr_block_size;
1565 }
1566 }
1567 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
1568 }
1569 }
1570
xnn_pack_f16_dwconv_hwg_w(size_t h,size_t w,size_t c,size_t cr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)1571 void xnn_pack_f16_dwconv_hwg_w(
1572 size_t h,
1573 size_t w,
1574 size_t c,
1575 size_t cr,
1576 const uint16_t* k,
1577 const uint16_t* b,
1578 uint16_t* packed_w,
1579 size_t extra_bytes,
1580 const void* params)
1581 {
1582 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1583 const size_t cr_block_size = min(c - cr_block_start, cr);
1584 if XNN_LIKELY(b != NULL) {
1585 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1586 *packed_w++ = b[cr_block_start + cr_block_offset];
1587 }
1588 } else {
1589 size_t n = cr_block_size;
1590 do {
1591 *packed_w++ = 0;
1592 } while (--n != 0);
1593 }
1594 packed_w += cr - cr_block_size;
1595 for (size_t x = 0; x < w; x++) {
1596 for (size_t y = 0; y < h; y++) {
1597 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1598 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1599 *packed_w++ = kv;
1600 }
1601 packed_w += cr - cr_block_size;
1602 }
1603 }
1604 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1605 }
1606 }
1607
xnn_pack_f32_to_f16_dwconv_hwg_w(size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)1608 void xnn_pack_f32_to_f16_dwconv_hwg_w(
1609 size_t h,
1610 size_t w,
1611 size_t c,
1612 size_t cr,
1613 const float* k,
1614 const float* b,
1615 uint16_t* packed_w,
1616 size_t extra_bytes,
1617 const void* params)
1618 {
1619 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1620 const size_t cr_block_size = min(c - cr_block_start, cr);
1621 if XNN_LIKELY(b != NULL) {
1622 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1623 *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
1624 }
1625 } else {
1626 size_t n = cr_block_size;
1627 do {
1628 *packed_w++ = 0;
1629 } while (--n != 0);
1630 }
1631 packed_w += cr - cr_block_size;
1632 for (size_t x = 0; x < w; x++) {
1633 for (size_t y = 0; y < h; y++) {
1634 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1635 const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
1636 *packed_w++ = kv;
1637 }
1638 packed_w += cr - cr_block_size;
1639 }
1640 }
1641 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1642 }
1643 }
1644
// Packs a QU8 depthwise-convolution kernel stored in HWG order ([h][w][g]).
// Each cr-wide channel tile starts with cr int32 bias slots pre-biased by
// h * w * input_zero_point * kernel_zero_point and corrected by subtracting
// input_zero_point * (sum of the channel's kernel bytes).
void xnn_pack_qu8_dwconv_hwg_w(
    size_t h,
    size_t w,
    size_t c,
    size_t cr,
    const uint8_t* k,
    const int32_t* b,
    void* packed_w,
    size_t extra_bytes,
    const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  // Zero-point product folded into every bias value: one izp*kzp term per tap.
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    // packed_b aliases the bias slots so they can be adjusted below while
    // the weights are being packed.
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    // Skip the unused tail slots of a partial tile.
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          // Accumulate -izp * k so bias ends up as b + boff - izp * sum(k).
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}
1688
// Packs a QS8 depthwise-convolution kernel stored in HWG order ([h][w][g]).
// Each cr-wide channel tile starts with cr int32 bias slots corrected by
// subtracting input_zero_point * (sum of the channel's kernel bytes).
void xnn_pack_qs8_dwconv_hwg_w(
    size_t h,
    size_t w,
    size_t c,
    size_t cr,
    const int8_t* k,
    const int32_t* b,
    void* packed_w,
    size_t extra_bytes,
    const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    // packed_b aliases the bias slots so they can be adjusted below while
    // the weights are being packed.
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    // Skip the unused tail slots of a partial tile.
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          // Accumulate -izp * k so bias ends up as b - izp * sum(k).
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}
1731
// Packs an f32 GEMM weight matrix stored in GOI order for GEMMINC
// micro-kernels (no bias slots: the accumulator is seeded externally).
// Channels are sr/kr-interleaved within skr-aligned windows.
void xnn_pack_f32_gemminc_goi_w(
    size_t g,
    size_t nc,
    size_t kc,
    size_t nr,
    size_t kr,
    size_t sr,
    const float* k,
    float* packed_w,
    const void* params)
{
  const size_t skr = sr * kr;
  const size_t kc_rounded = round_up_po2(kc, skr);
  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_tile = min(nc - n_start, nr);

      for (size_t k_start = 0; k_start < kc_rounded; k_start += kr) {
        for (size_t n = 0; n < n_tile; n++) {
          const float* k_row = &k[(n_start + n) * kc];
          for (size_t ko = 0; ko < kr; ko++) {
            // Input-channel index after sr-interleaving within a skr-aligned
            // window (skr is a power of two); out-of-range padding slots are
            // left untouched.
            const size_t kc_idx = round_down_po2(k_start, skr) + ((k_start + ko + n * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[ko] = k_row[kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - n_tile) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1764
// Packs an f16 GEMMINC weight matrix from GOI (groups, output channels, input
// channels) layout into the nr/kr/sr-blocked format consumed by GEMMINC
// microkernels. No bias is packed. Positions whose source index falls beyond
// kc (the padding up to a multiple of sr*kr) are left untouched.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t kc_rounded = round_up_po2(kc, skr);
  do {
    size_t n_start = 0;
    while (n_start < nc) {
      const size_t n_tile = min(nc - n_start, nr);
      for (size_t k_start = 0; k_start < kc_rounded; k_start += kr) {
        // Base of the sr*kr super-block this kr-slice belongs to.
        const size_t k_base = round_down_po2(k_start, skr);
        for (size_t n_off = 0; n_off < n_tile; n_off++) {
          for (size_t k_off = 0; k_off < kr; k_off++) {
            // Shuffle within the super-block per the sr interleaving rule.
            const size_t kc_idx = k_base + ((k_start + k_off + n_off * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[k_off] = k[(n_start + n_off) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        // Skip space reserved for the padding rows of a partial nr tile.
        packed_w += (nr - n_tile) * kr;
      }
      n_start += nr;
    }
    k += nc * kc;
  } while (--g != 0);
}
1797
// Packs f32 direct-convolution weights from OKI layout for DCONV microkernels.
// Per nr-sized tile of output channels: nr bias lanes, then for each (kx, c, ky)
// one kernel value per lane. A partial tile replicates its last real channel
// into the padding lanes instead of zero-filling them.
void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t n_start = 0; n_start < nc; n_start += nr) {
    const size_t n_left = nc - n_start;
    const size_t n_tile = n_left < nr ? n_left : nr;
    // Bias: one value per lane; the bias pointer is advanced by nr per tile,
    // so indexing here is relative to the current tile window.
    if (b != NULL) {
      for (size_t lane = 0; lane < nr; lane++) {
        *packed_w++ = b[lane < n_tile ? lane : n_tile - 1];
      }
    } else {
      for (size_t lane = 0; lane < nr; lane++) {
        *packed_w++ = 0.0f;
      }
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t channel = 0; channel < kc; channel++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t lane = 0; lane < nr; lane++) {
            const size_t n_idx = n_start + (lane < n_tile ? lane : n_tile - 1);
            *packed_w++ = k[((n_idx * kh + ky) * kw + kx) * kc + channel];
          }
        }
      }
    }
    if (b != NULL) {
      b += nr;
    }
  }
}
1836
// Packs f16 direct-convolution weights from OKI layout for DCONV microkernels.
// Per nr-sized tile of output channels: nr bias lanes, then for each (kx, c, ky)
// one kernel value per lane. A partial tile replicates its last real channel
// into the padding lanes instead of zero-filling them.
void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t n_start = 0; n_start < nc; n_start += nr) {
    const size_t n_left = nc - n_start;
    const size_t n_tile = n_left < nr ? n_left : nr;
    // Bias: one value per lane; the bias pointer is advanced by nr per tile,
    // so indexing here is relative to the current tile window.
    if (b != NULL) {
      for (size_t lane = 0; lane < nr; lane++) {
        *packed_w++ = b[lane < n_tile ? lane : n_tile - 1];
      }
    } else {
      for (size_t lane = 0; lane < nr; lane++) {
        *packed_w++ = 0;
      }
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t channel = 0; channel < kc; channel++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t lane = 0; lane < nr; lane++) {
            const size_t n_idx = n_start + (lane < n_tile ? lane : n_tile - 1);
            *packed_w++ = k[((n_idx * kh + ky) * kw + kx) * kc + channel];
          }
        }
      }
    }
    if (b != NULL) {
      b += nr;
    }
  }
}
1875
// Packs f32 CHW depthwise-convolution weights stored in GHW (group-major,
// row-major taps) layout. Per group: one bias value (0 if bias is NULL)
// followed by kernel_size taps copied in order.
void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  const float* group_kernel = kernel;
  for (size_t g = 0; g < groups; g++) {
    *packed_weights++ = (bias != NULL) ? bias[g] : 0.0f;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = group_kernel[i];
    }
    group_kernel += kernel_size;
  }
}
1896
// Packs f16 CHW depthwise-convolution weights stored in GHW (group-major,
// row-major taps) layout. Per group: one bias value (0 if bias is NULL)
// followed by kernel_size taps copied in order.
void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights,
  const void* params)
{
  const uint16_t* group_kernel = kernel;
  for (size_t g = 0; g < groups; g++) {
    *packed_weights++ = (bias != NULL) ? bias[g] : 0;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = group_kernel[i];
    }
    group_kernel += kernel_size;
  }
}
1917
// Packs f32 CHW depthwise-convolution weights stored in HWG (tap-major,
// group-minor) layout. Per group: one bias value (0 if bias is NULL) followed
// by kernel_size taps gathered with stride `groups`.
void xnn_pack_f32_chw_dwconv_hwg_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    *packed_weights++ = (bias != NULL) ? bias[g] : 0.0f;
    for (size_t i = 0; i < kernel_size; i++) {
      // HWG: the group/channel index is the fastest-varying source dimension.
      *packed_weights++ = kernel[i * groups + g];
    }
  }
}
1938
// Packs f32 VMULCADDC (fused multiply-add-per-channel) parameters in cr-sized
// channel tiles: [cr scale lanes][cr bias lanes] per tile. Bias lanes are
// zero-filled when bias is NULL; padding lanes of a partial tile are skipped,
// not written.
void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t tile_start = 0; tile_start < c; tile_start += cr) {
    const size_t left = c - tile_start;
    const size_t tile_size = left < cr ? left : cr;
    for (size_t i = 0; i < tile_size; i++) {
      packed_w[i] = s[tile_start + i];
    }
    packed_w += cr;
    if (b != NULL) {
      for (size_t i = 0; i < tile_size; i++) {
        packed_w[i] = b[tile_start + i];
      }
    } else {
      for (size_t i = 0; i < tile_size; i++) {
        packed_w[i] = 0.0f;
      }
    }
    packed_w += cr;
  }
}
1966
// Packs f16 VMULCADDC (fused multiply-add-per-channel) parameters in cr-sized
// channel tiles: [cr scale lanes][cr bias lanes] per tile. Bias lanes are
// zero-filled when bias is NULL; padding lanes of a partial tile are skipped,
// not written.
void xnn_pack_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const uint16_t* s,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t tile_start = 0; tile_start < c; tile_start += cr) {
    const size_t left = c - tile_start;
    const size_t tile_size = left < cr ? left : cr;
    for (size_t i = 0; i < tile_size; i++) {
      packed_w[i] = s[tile_start + i];
    }
    packed_w += cr;
    if (b != NULL) {
      for (size_t i = 0; i < tile_size; i++) {
        packed_w[i] = b[tile_start + i];
      }
    } else {
      for (size_t i = 0; i < tile_size; i++) {
        packed_w[i] = 0;
      }
    }
    packed_w += cr;
  }
}
1994
// Packs f32 VMULCADDC parameters as f16, converting each value with
// fp16_ieee_from_fp32_value. Layout per cr-sized tile: [cr scale lanes]
// [cr bias lanes]; bias lanes are zero-filled when bias is NULL; padding
// lanes of a partial tile are skipped, not written.
void xnn_pack_f32_to_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t tile_start = 0; tile_start < c; tile_start += cr) {
    const size_t left = c - tile_start;
    const size_t tile_size = left < cr ? left : cr;
    for (size_t i = 0; i < tile_size; i++) {
      packed_w[i] = fp16_ieee_from_fp32_value(s[tile_start + i]);
    }
    packed_w += cr;
    if (b != NULL) {
      for (size_t i = 0; i < tile_size; i++) {
        packed_w[i] = fp16_ieee_from_fp32_value(b[tile_start + i]);
      }
    } else {
      for (size_t i = 0; i < tile_size; i++) {
        packed_w[i] = 0;
      }
    }
    packed_w += cr;
  }
}
2022
// Packs f32 PReLU slopes: a verbatim copy of one float per channel.
void xnn_pack_f32_prelu_w(
  size_t c,
  const float* s,
  float* packed_w)
{
  for (size_t i = 0; i < c; i++) {
    packed_w[i] = s[i];
  }
}
2030
// Packs f16 PReLU slopes: a verbatim copy of one half-precision value per channel.
void xnn_pack_f16_prelu_w(
  size_t c,
  const uint16_t* s,
  uint16_t* packed_w)
{
  for (size_t i = 0; i < c; i++) {
    packed_w[i] = s[i];
  }
}
2038
// Packs f32 PReLU slopes as f16, converting one value per channel with
// fp16_ieee_from_fp32_value.
// Changed from a do/while to a while loop so that c == 0 is a harmless no-op,
// matching the memcpy-based xnn_pack_f32_prelu_w / xnn_pack_f16_prelu_w
// (the do/while decremented the unsigned count before testing it, so c == 0
// would wrap around and write far past the destination buffer).
// Behavior for c >= 1 is unchanged.
void xnn_pack_f32_to_f16_prelu_w(
  size_t c,
  const float* s,
  uint16_t* packed_w)
{
  while (c != 0) {
    *packed_w++ = fp16_ieee_from_fp32_value(*s++);
    c--;
  }
}
2048