Lines Matching refs:nr_block_start
32 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f32_gemm_goi_w() local
33 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f32_gemm_goi_w()
36 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f32_gemm_goi_w()
45 …k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start… in xnn_pack_f32_gemm_goi_w()
56 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_f32_gemm_goi_w()
86 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f16_gemm_goi_w() local
87 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f16_gemm_goi_w()
90 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f16_gemm_goi_w()
99 …k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start… in xnn_pack_f16_gemm_goi_w()
110 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_f16_gemm_goi_w()
140 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qu8_gemm_goi_w() local
141 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qu8_gemm_goi_w()
145 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; in xnn_pack_qu8_gemm_goi_w()
161 …const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_qu8_gemm_goi_w()
194 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qs8_gemm_goi_w() local
195 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qs8_gemm_goi_w()
199 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; in xnn_pack_qs8_gemm_goi_w()
215 … const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_qs8_gemm_goi_w()
248 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qs8_gemm_xw_goi_w() local
249 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qs8_gemm_xw_goi_w()
253 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; in xnn_pack_qs8_gemm_xw_goi_w()
269 … const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_qs8_gemm_xw_goi_w()
301 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f32_gemm_io_w() local
302 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f32_gemm_io_w()
305 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f32_gemm_io_w()
314 …_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_off… in xnn_pack_f32_gemm_io_w()
325 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; in xnn_pack_f32_gemm_io_w()
348 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f16_gemm_io_w() local
349 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f16_gemm_io_w()
352 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f16_gemm_io_w()
361 …_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_off… in xnn_pack_f16_gemm_io_w()
372 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; in xnn_pack_f16_gemm_io_w()
395 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qu8_gemm_io_w() local
396 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qu8_gemm_io_w()
400 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; in xnn_pack_qu8_gemm_io_w()
416 …const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; in xnn_pack_qu8_gemm_io_w()
446 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f32_conv_goki_w() local
447 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f32_conv_goki_w()
450 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f32_conv_goki_w()
460 …k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr… in xnn_pack_f32_conv_goki_w()
471 … k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_f32_conv_goki_w()
503 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f16_conv_goki_w() local
504 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f16_conv_goki_w()
507 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f16_conv_goki_w()
517 …k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr… in xnn_pack_f16_conv_goki_w()
528 … k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_f16_conv_goki_w()
560 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qu8_conv_goki_w() local
561 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qu8_conv_goki_w()
565 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; in xnn_pack_qu8_conv_goki_w()
583 … k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_qu8_conv_goki_w()
618 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qs8_conv_goki_w() local
619 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qs8_conv_goki_w()
623 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; in xnn_pack_qs8_conv_goki_w()
641 … k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_qs8_conv_goki_w()
672 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f32_conv_kgo_w() local
673 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f32_conv_kgo_w()
676 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f32_conv_kgo_w()
683 k[ki * g * nc + (nr_block_start + nr_block_offset)]; in xnn_pack_f32_conv_kgo_w()
708 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f16_conv_kgo_w() local
709 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f16_conv_kgo_w()
712 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f16_conv_kgo_w()
719 k[ki * g * nc + (nr_block_start + nr_block_offset)]; in xnn_pack_f16_conv_kgo_w()
746 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qu8_conv_kgo_w() local
747 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qu8_conv_kgo_w()
751 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; in xnn_pack_qu8_conv_kgo_w()
765 k[ki * g * nc + (nr_block_start + nr_block_offset)]; in xnn_pack_qu8_conv_kgo_w()
793 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qs8_conv_kgo_w() local
794 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qs8_conv_kgo_w()
798 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; in xnn_pack_qs8_conv_kgo_w()
812 k[ki * g * nc + (nr_block_start + nr_block_offset)]; in xnn_pack_qs8_conv_kgo_w()
853 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f32_deconv_goki_w() local
854 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f32_deconv_goki_w()
857 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f32_deconv_goki_w()
867 …k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start,… in xnn_pack_f32_deconv_goki_w()
878 …k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_off… in xnn_pack_f32_deconv_goki_w()
922 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f16_deconv_goki_w() local
923 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f16_deconv_goki_w()
926 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; in xnn_pack_f16_deconv_goki_w()
936 …k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start,… in xnn_pack_f16_deconv_goki_w()
947 …k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_off… in xnn_pack_f16_deconv_goki_w()
992 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_qu8_deconv_goki_w() local
993 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_qu8_deconv_goki_w()
997 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; in xnn_pack_qu8_deconv_goki_w()
1016 …k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_off… in xnn_pack_qu8_deconv_goki_w()
1359 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f32_gemminc_goi_w() local
1360 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f32_gemminc_goi_w()
1366 …k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start… in xnn_pack_f32_gemminc_goi_w()
1377 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_f32_gemminc_goi_w()
1403 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f16_gemminc_goi_w() local
1404 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f16_gemminc_goi_w()
1410 …k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start… in xnn_pack_f16_gemminc_goi_w()
1421 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; in xnn_pack_f16_gemminc_goi_w()
1443 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f32_dconv_oki_w() local
1444 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f32_dconv_oki_w()
1460 …*packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx)… in xnn_pack_f32_dconv_oki_w()
1482 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { in xnn_pack_f16_dconv_oki_w() local
1483 const size_t nr_block_size = min(nc - nr_block_start, nr); in xnn_pack_f16_dconv_oki_w()
1499 …*packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx)… in xnn_pack_f16_dconv_oki_w()