XNNPACK/src/packing.c

// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <stdint.h>
#include <stddef.h>

#include <xnnpack/math.h>
#include <xnnpack/pack.h>


void xnn_pack_f32_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qu8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
            ksum += (int32_t) kv;
            *((uint8_t*) packed_w) = kv;
            packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
          }
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qs8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
            ksum += (int32_t) kv;
            *((int8_t*) packed_w) = kv;
            packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
          }
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qs8_gemm_xw_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
            ksum += (int32_t) kv;
            *((int16_t*) packed_w) = (int16_t) kv;
            packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
          }
          packed_b[nr_block_offset] -= ksum * izp;
          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f32_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          *packed_w++ =
            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
      }
      packed_w += (nr - nr_block_size) * kr;
    }

    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          *packed_w++ =
            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
        packed_w += kr - kr_block_size;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_f16_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          *packed_w++ =
            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
      }
      packed_w += (nr - nr_block_size) * kr;
    }

    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          *packed_w++ =
            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
        packed_w += kr - kr_block_size;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_qu8_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = nr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        int32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
          ksum += (int32_t) kv;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_b[nr_block_offset] -= ksum * izp;
        packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
    }
  }
}

void xnn_pack_f32_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
            }
          }
          packed_w += (nr - nr_block_size) * kr;
        }

        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
            }
            packed_w += kr - kr_block_size;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
            }
          }
          packed_w += (nr - nr_block_size) * kr;
        }

        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
            }
            packed_w += kr - kr_block_size;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qu8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              const uint8_t kv =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
              ksum += (int32_t) kv;
              *((uint8_t*) packed_w) = kv;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
            }
            packed_b[nr_block_offset] -= ksum * izp;
            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qs8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              const int8_t kv =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
              ksum += (int32_t) kv;
              *((int8_t*) packed_w) = kv;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
            }
            packed_b[nr_block_offset] -= ksum * izp;
            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f32_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *packed_w =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *packed_w =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qu8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          const uint8_t kv =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          *((uint8_t*) packed_w) = kv;
          packed_b[nr_block_offset] -= (int32_t) kv * izp;
          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qs8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = 0;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          const int8_t kv =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          *((int8_t*) packed_w) = kv;
          packed_b[nr_block_offset] -= (int32_t) kv * izp;
          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f32_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
                  }
                }
                packed_w += (nr - nr_block_size) * kr;
              }

              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                  }
                  packed_w += kr - kr_block_size;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
                  }
                }
                packed_w += (nr - nr_block_size) * kr;
              }

              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                  }
                  packed_w += kr - kr_block_size;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qu8_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  struct subconvolution_params* subconv_params,
  const struct xnn_qu8_packing_params* params)
{
  assert(sr == 1);
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t kzp = (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          int32_t* packed_b = (int32_t*) packed_w;
          if XNN_LIKELY(b != 0) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            }
          } else {
            size_t n = nr_block_size;
            do {
              *((int32_t*) packed_w) = boff;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            } while (--n != 0);
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  int32_t ksum = 0;
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    const uint8_t kv =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                    ksum += (int32_t) kv;
                    *((uint8_t*) packed_w) = kv;
                    packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
                  }
                  packed_b[nr_block_offset] -= ksum * izp;
                  packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
                }
                packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f32_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

void xnn_pack_f16_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

void xnn_pack_qu8_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
  }
}

void xnn_pack_qs8_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
  }
}

void xnn_pack_f32_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

void xnn_pack_f16_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

void xnn_pack_qu8_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
  }
}

void xnn_pack_qs8_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = 0;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          packed_b[cr_block_offset] -= (int32_t) kv * izp;
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
  }
}

void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

void xnn_pack_f32_chw_dwconv_hwg_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[i * groups + g];
    }
  }
}

void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

void xnn_pack_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const uint16_t* s,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}