/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "common/util.h"
#include "common/macros.h"
#include "common/vk/assert_vk.h"

#include "hs_vk.h"
#include "hs_vk_target.h"

//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
//   1) If necessary, some max valued keys are written to the end of
//      the vin/vout buffers.
//
//   2) Blocks of slabs of keys are sorted.
//
//   3) If necessary, the blocks of slabs are merged until complete.
//
//   4) If requested, the slabs will be converted from slab ordering
//      to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers.  If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
//       +----------+              +---------+
//       | pad_vout |              | pad_vin |
//       +----+-----+              +----+----+
//            |                         |
//            |                  WAITFOR(pad_vin)
//            |                         |
//            |                   +-----v-----+
//            |                   |           |
//            |              +----v----+ +----v----+
//            |              | bs_full | | bs_frac |
//            |              +----+----+ +----+----+
//            |                   |           |
//            |                   +-----v-----+
//            |                         |
//            |   +------NO------JUST ONE BLOCK?
//            |  /                      |
//            | /                      YES
//            |/                        |
//            +                         v
//            |            END_WITH_EVENTS(bs_full,bs_frac)
//            |
//            |
//   WAITFOR(pad_vout,bs_full,bs_frac)  >>> first iteration of loop <<<
//            |
//            |
//            +-----------<------------+
//            |                        |
//      +-----v-----+                  |
//      |           |                  |
// +----v----+ +----v----+             |
// | fm_full | | fm_frac |             |
// +----+----+ +----+----+             |
//      |           |                  ^
//      +-----v-----+                  |
//            |                        |
//   WAITFOR(fm_full,fm_frac)          |
//            |                        |
//            v                        |
//         +--v--+                WAITFOR(bc)
//         | hm  |                     |
//         +-----+                     |
//            |                        |
//       WAITFOR(hm)                   |
//            |                        ^
//         +--v--+                     |
//         | bc  |                     |
//         +-----+                     |
//            |                        |
//            v                        |
//    MERGING COMPLETE?-------NO-------+
//            |
//           YES
//            |
//            v
//    END_WITH_EVENTS(bc)
//
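//
// Illustrative host-side call sequence -- a minimal sketch, not part
// of this translation unit's build.  Buffer allocation, command
// buffer recording and queue submission are managed by the caller,
// and identifiers prefixed with "my_" are placeholders:
//
//   struct hs_vk * hs = hs_vk_create(my_target,my_device,NULL,VK_NULL_HANDLE);
//
//   uint32_t padded_in, padded_out;
//
//   hs_vk_pad(hs,my_count,&padded_in,&padded_out);
//
//   // my_vin must hold padded_in keys and my_vout padded_out keys
//
//   VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,my_desc_pool);
//
//   hs_vk_ds_bind(hs,hs_ds,my_cb,my_vin,my_vout);
//
//   hs_vk_sort(hs,my_cb,
//              my_vin, 0,0,   // assumes vin is already visible to compute
//              my_vout,0,0,   // assumes vout has no pending writes
//              my_count,padded_in,padded_out,
//              true);         // linearize the slabs
//
//   // ... submit my_cb, wait, then eventually:
//
//   hs_vk_release(hs);
//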
struct hs_vk
{
  VkAllocationCallbacks const * allocator;
  VkDevice                      device;

  struct {
    struct {
      VkDescriptorSetLayout vout_vin;
    } layout;
  } desc_set;

  struct {
    struct {
      VkPipelineLayout vout_vin;
    } layout;
  } pipeline;

  struct hs_vk_target_config config;

  uint32_t key_val_size;
  uint32_t slab_keys;
  uint32_t bs_slabs_log2_ru;
  uint32_t bc_slabs_log2_max;

  struct {
    uint32_t     count;
    VkPipeline * bs;
    VkPipeline * bc;
    VkPipeline * fm[3];
    VkPipeline * hm[3];
    VkPipeline * transpose;
    VkPipeline   all[];
  } pipelines;
};

//
//
//

struct hs_state
{
  VkCommandBuffer cb;

  // If sorting in-place, then vout == vin
  VkBuffer vout;
  VkBuffer vin;

  // bx_ru is number of rounded up warps in vin
  uint32_t bx_ru;
};

//
//
//

static
void
hs_barrier_compute_w_to_compute_r(struct hs_state * const state)
{
  static VkMemoryBarrier const shader_w_to_r = {
    .sType         = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
    .pNext         = NULL,
    .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT
  };

  vkCmdPipelineBarrier(state->cb,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,
                       1,
                       &shader_w_to_r,
                       0,
                       NULL,
                       0,
                       NULL);
}

//
//
//

static
void
hs_barrier_to_compute_r(struct hs_state    * const state,
                        VkPipelineStageFlags const src_stage,
                        VkAccessFlagBits     const src_access)
{
  if (src_stage == 0)
    return;

  VkMemoryBarrier const compute_r = {
    .sType         = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
    .pNext         = NULL,
    .srcAccessMask = src_access,
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT
  };

  vkCmdPipelineBarrier(state->cb,
                       src_stage,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,
                       1,
                       &compute_r,
                       0,
                       NULL,
                       0,
                       NULL);
}

//
//
//

static
void
hs_barrier_to_transfer_fill(struct hs_state    * const state,
                            VkPipelineStageFlags const src_stage,
                            VkAccessFlagBits     const src_access)
{
  if (src_stage == 0)
    return;

  VkMemoryBarrier const fill_w = {
    .sType         = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
    .pNext         = NULL,
    .srcAccessMask = src_access,
    .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT
  };

  vkCmdPipelineBarrier(state->cb,
                       src_stage,
                       VK_PIPELINE_STAGE_TRANSFER_BIT,
                       0,
                       1,
                       &fill_w,
                       0,
                       NULL,
                       0,
                       NULL);
}

//
//
//

static
void
hs_transpose(struct hs_vk const * const hs,
             struct hs_state    * const state)
{
  hs_barrier_compute_w_to_compute_r(state);

  vkCmdBindPipeline(state->cb,
                    VK_PIPELINE_BIND_POINT_COMPUTE,
                    hs->pipelines.transpose[0]);

  vkCmdDispatch(state->cb,state->bx_ru,1,1);
}

//
//
//

static
void
hs_bc(struct hs_vk const * const hs,
      struct hs_state    * const state,
      uint32_t             const down_slabs,
      uint32_t             const clean_slabs_log2)
{
  hs_barrier_compute_w_to_compute_r(state);

  // block clean the minimal number of down_slabs_log2 spans
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const full_bc = (down_slabs + frac_ru) >> clean_slabs_log2;

  vkCmdBindPipeline(state->cb,
                    VK_PIPELINE_BIND_POINT_COMPUTE,
                    hs->pipelines.bc[clean_slabs_log2]);

  vkCmdDispatch(state->cb,full_bc,1,1);
}

//
//
//

static
uint32_t
hs_hm(struct hs_vk const * const hs,
      struct hs_state    * const state,
      uint32_t             const down_slabs,
      uint32_t             const clean_slabs_log2)
{
  hs_barrier_compute_w_to_compute_r(state);

  // how many scaled half-merge spans are there?
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const spans   = (down_slabs + frac_ru) >> clean_slabs_log2;

  // for now, just clamp to the max
  uint32_t const log2_rem   = clean_slabs_log2 - hs->bc_slabs_log2_max;
  uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem);
  uint32_t const log2_out   = log2_rem - scale_log2;

  // size the grid
  uint32_t const slab_span = hs->config.slab.height << log2_out;

  vkCmdBindPipeline(state->cb,
                    VK_PIPELINE_BIND_POINT_COMPUTE,
                    hs->pipelines.hm[scale_log2][0]);

  vkCmdDispatch(state->cb,slab_span,spans,1);

  return log2_out;
}
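//
// Worked example of the half-merge sizing above, assuming a
// hypothetical target with slab.height = 16, bc_slabs_log2_max = 4
// and merge.hm.scale_max = 1, and inputs down_slabs = 192,
// clean_slabs_log2 = 6:
//
//   spans      = (192 + 63) >> 6 = 3
//   log2_rem   = 6 - 4           = 2
//   scale_log2 = MIN_MACRO(1,2)  = 1
//   log2_out   = 2 - 1           = 1
//   slab_span  = 16 << 1         = 32
//
// ... so hm[1] is dispatched on a (32,3,1) grid.  The returned
// log2_out (1) is now <= bc_slabs_log2_max, so the caller's loop
// proceeds to the block cleaner.
//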
//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//

static
uint32_t
hs_fm(struct hs_vk const * const hs,
      struct hs_state    * const state,
      uint32_t           * const down_slabs,
      uint32_t             const up_scale_log2)
{
  //
  // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
  // a performance win to bias toward launching the smaller flip merge
  // kernel in order to get more warps in flight (increased
  // occupancy).  This is useful when merging small numbers of slabs.
  //
  // Note that HS_FM_SCALE_MIN will always be 0 or 1.
  //
  // So, for now, just clamp to the max until there is a reason to
  // restore the fancier and probably low-impact approach.
  //
  uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2);
  uint32_t const clean_log2 = up_scale_log2 - scale_log2;

  // number of slabs in a full-sized scaled flip-merge span
  uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2;

  // how many full-sized scaled flip-merge spans are there?
  uint32_t full_fm = state->bx_ru / full_span_slabs;
  uint32_t frac_fm = 0;

  // initialize down_slabs
  *down_slabs = full_fm * full_span_slabs;

  // how many half-size scaled + fractional scaled spans are there?
  uint32_t const span_rem        = state->bx_ru - *down_slabs;
  uint32_t const half_span_slabs = full_span_slabs >> 1;

  // if we have over a half-span then fractionally merge it
  if (span_rem > half_span_slabs)
    {
      // the remaining slabs will be cleaned
      *down_slabs += span_rem;

      uint32_t const frac_rem      = span_rem - half_span_slabs;
      uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);

      if (frac_rem_pow2 >= half_span_slabs)
        {
          // bump it up to a full span
          full_fm += 1;
        }
      else
        {
          // otherwise, add fractional
          frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
        }
    }

  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum keys that can be
  // sorted with this grid scheme to around ~2B.
  //
  //   .x : slab height << clean_log2 -- this is the slab span
  //   .y : [1...65535]               -- this is the slab index
  //   .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //
  uint32_t const slab_span = hs->config.slab.height << clean_log2;

  if (full_fm > 0)
    {
      uint32_t const full_idx = hs->bs_slabs_log2_ru - 1 + scale_log2;

      vkCmdBindPipeline(state->cb,
                        VK_PIPELINE_BIND_POINT_COMPUTE,
                        hs->pipelines.fm[scale_log2][full_idx]);

      vkCmdDispatch(state->cb,slab_span,full_fm,1);
    }

  if (frac_fm > 0)
    {
      vkCmdBindPipeline(state->cb,
                        VK_PIPELINE_BIND_POINT_COMPUTE,
                        hs->pipelines.fm[scale_log2][msb_idx_u32(frac_fm)]);

      vkCmdDispatchBase(state->cb,
                        0,full_fm,0,
                        slab_span,1,1);
    }

  return clean_log2;
}

//
//
//

static
void
hs_bs(struct hs_vk const * const hs,
      struct hs_state    * const state,
      uint32_t             const count_padded_in)
{
  uint32_t const slabs_in = count_padded_in / hs->slab_keys;
  uint32_t const full_bs  = slabs_in / hs->config.block.slabs;
  uint32_t const frac_bs  = slabs_in - full_bs * hs->config.block.slabs;

  if (full_bs > 0)
    {
      vkCmdBindPipeline(state->cb,
                        VK_PIPELINE_BIND_POINT_COMPUTE,
                        hs->pipelines.bs[hs->bs_slabs_log2_ru]);

      vkCmdDispatch(state->cb,full_bs,1,1);
    }

  if (frac_bs > 0)
    {
      uint32_t const frac_idx          = msb_idx_u32(frac_bs);
      uint32_t const full_to_frac_log2 = hs->bs_slabs_log2_ru - frac_idx;

      vkCmdBindPipeline(state->cb,
                        VK_PIPELINE_BIND_POINT_COMPUTE,
                        hs->pipelines.bs[frac_idx]);

      // the fractional block starts right after the full blocks
      vkCmdDispatchBase(state->cb,
                        full_bs<<full_to_frac_log2,0,0,
                        1,1,1);
    }
}

//
//
//

static
void
hs_keyset_pre_fm(struct hs_vk const * const hs,
                 struct hs_state    * const state,
                 uint32_t             const count_lo,
                 uint32_t             const count_hi)
{
  uint32_t const vout_span = count_hi - count_lo;

  vkCmdFillBuffer(state->cb,
                  state->vout,
                  count_lo  * hs->key_val_size,
                  vout_span * hs->key_val_size,
                  UINT32_MAX);
}

//
//
//

static
void
hs_keyset_pre_bs(struct hs_vk const * const hs,
                 struct hs_state    * const state,
                 uint32_t             const count,
                 uint32_t             const count_hi)
{
  uint32_t const vin_span = count_hi - count;

  vkCmdFillBuffer(state->cb,
                  state->vin,
                  count    * hs->key_val_size,
                  vin_span * hs->key_val_size,
                  UINT32_MAX);
}
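//
// Worked example of the full/fractional dispatch split in hs_bs()
// above, assuming a hypothetical target with block.slabs = 16 (so
// bs_slabs_log2_ru = 4) and a padded input of 40 slabs:
//
//   slabs_in          = 40
//   full_bs           = 40 / 16        = 2
//   frac_bs           = 40 - 2*16      = 8
//   frac_idx          = msb_idx_u32(8) = 3
//   full_to_frac_log2 = 4 - 3          = 1
//
// ... so bs[4] sorts the 2 full blocks with vkCmdDispatch(cb,2,1,1)
// and bs[3] sorts the trailing 8 slabs with a base workgroup index of
// 2<<1 = 4 -- the first workgroup slot after the full blocks when
// measured in 8-slab units.
//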
//
//
//

void
hs_vk_ds_bind(struct hs_vk const * const hs,
              VkDescriptorSet            hs_ds,
              VkCommandBuffer            cb,
              VkBuffer                   vin,
              VkBuffer                   vout)
{
  //
  // initialize the HotSort descriptor set
  //
  VkDescriptorBufferInfo const dbi[] = {
    {
      .buffer = vout == VK_NULL_HANDLE ? vin : vout,
      .offset = 0,
      .range  = VK_WHOLE_SIZE
    },
    {
      .buffer = vin,
      .offset = 0,
      .range  = VK_WHOLE_SIZE
    }
  };

  VkWriteDescriptorSet const wds[] = {
    {
      .sType            = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .pNext            = NULL,
      .dstSet           = hs_ds,
      .dstBinding       = 0,
      .dstArrayElement  = 0,
      .descriptorCount  = 2,
      .descriptorType   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .pImageInfo       = NULL,
      .pBufferInfo      = dbi,
      .pTexelBufferView = NULL
    }
  };

  vkUpdateDescriptorSets(hs->device,
                         ARRAY_LENGTH_MACRO(wds),
                         wds,
                         0,
                         NULL);

  //
  // All HotSort kernels can use the same descriptor set:
  //
  //   {
  //     HS_KEY_TYPE vout[];
  //     HS_KEY_TYPE vin[];
  //   }
  //
  // Note that only the bs() kernels read from vin().
  //
  vkCmdBindDescriptorSets(cb,
                          VK_PIPELINE_BIND_POINT_COMPUTE,
                          hs->pipeline.layout.vout_vin,
                          0,
                          1,
                          &hs_ds,
                          0,
                          NULL);
}

//
//
//

void
hs_vk_sort(struct hs_vk const * const hs,
           VkCommandBuffer            cb,
           VkBuffer                   vin,
           VkPipelineStageFlags const vin_src_stage,
           VkAccessFlagBits     const vin_src_access,
           VkBuffer                   vout,
           VkPipelineStageFlags const vout_src_stage,
           VkAccessFlagBits     const vout_src_access,
           uint32_t             const count,
           uint32_t             const count_padded_in,
           uint32_t             const count_padded_out,
           bool                 const linearize)
{
  // is this sort in place?
  bool const is_in_place = (vout == VK_NULL_HANDLE);

  //
  // create some common state
  //
  struct hs_state state = {
    .cb    = cb,
    .vin   = vin,
    .vout  = is_in_place ? vin : vout,
    .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys
  };

  // initialize vin
  uint32_t const count_hi          = is_in_place ? count_padded_out : count_padded_in;
  bool     const is_pre_sort_reqd  = count_hi > count;
  bool     const is_pre_merge_reqd = !is_in_place && (count_padded_out > count_padded_in);

  //
  // pre-sort  keyset needs to happen before bs()
  // pre-merge keyset needs to happen before fm()
  //
  VkPipelineStageFlags bs_src_stage  = 0;
  VkAccessFlagBits     bs_src_access = 0;

  // initialize any trailing keys in vin before sorting
  if (is_pre_sort_reqd)
    {
      hs_barrier_to_transfer_fill(&state,vin_src_stage,vin_src_access);

      hs_keyset_pre_bs(hs,&state,count,count_hi);

      bs_src_stage  |= VK_PIPELINE_STAGE_TRANSFER_BIT;
      bs_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT;
    }
  else
    {
      bs_src_stage  = vin_src_stage;
      bs_src_access = vin_src_access;
    }

  hs_barrier_to_compute_r(&state,bs_src_stage,bs_src_access);

  // sort blocks of slabs... after hs_keyset_pre_bs()
  hs_bs(hs,&state,count_padded_in);

  VkPipelineStageFlags fm_src_stage  = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
  VkAccessFlagBits     fm_src_access = VK_ACCESS_SHADER_READ_BIT;

  // initialize any trailing keys in vout before merging
  if (is_pre_merge_reqd)
    {
      hs_barrier_to_transfer_fill(&state,vout_src_stage,vout_src_access);

      hs_keyset_pre_fm(hs,&state,count_padded_in,count_padded_out);

      fm_src_stage  |= VK_PIPELINE_STAGE_TRANSFER_BIT;
      fm_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT;
    }
  else
    {
      fm_src_stage  |= vout_src_stage;
      fm_src_access |= vout_src_access;
    }

  //
  // if this was a single bs block then there is no merging
  //
  if (state.bx_ru > hs->config.block.slabs)
    {
      hs_barrier_to_compute_r(&state,fm_src_stage,fm_src_access);

      //
      // otherwise, merge sorted spans of slabs until done
      //
      int32_t up_scale_log2 = 1;

      while (true)
        {
          uint32_t down_slabs;

          // flip merge slabs -- return span of slabs that must be cleaned
          uint32_t clean_slabs_log2 = hs_fm(hs,&state,
                                            &down_slabs,
                                            up_scale_log2);

          // if span is gt largest slab block cleaner then half merge
          while (clean_slabs_log2 > hs->bc_slabs_log2_max)
            {
              clean_slabs_log2 = hs_hm(hs,&state,
                                       down_slabs,
                                       clean_slabs_log2);
            }

          // launch clean slab grid -- is it the final launch?
          hs_bc(hs,&state,down_slabs,clean_slabs_log2);

          // was this the final block clean?
          if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru)
            break;

          // otherwise, merge twice as many slabs
          up_scale_log2 += 1;

          // drop a barrier
          hs_barrier_compute_w_to_compute_r(&state);
        }
    }

  // slabs or linear?
  if (linearize)
    hs_transpose(hs,&state);
}
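//
// A note on the vin/vout source stage and access parameters above --
// a minimal sketch, assuming vin was just written by a transfer (e.g.
// a vkCmdCopyBuffer from a staging buffer) recorded earlier in the
// same command buffer and vout has no prior writes to synchronize
// against:
//
//   hs_vk_sort(hs,cb,
//              vin, VK_PIPELINE_STAGE_TRANSFER_BIT,VK_ACCESS_TRANSFER_WRITE_BIT,
//              vout,0,0,
//              count,count_padded_in,count_padded_out,
//              true);
//
// Passing 0 for a source stage skips the corresponding acquire
// barrier entirely -- see hs_barrier_to_compute_r() and
// hs_barrier_to_transfer_fill().
//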
//
//
//

#ifdef HS_VK_VERBOSE_STATISTICS_AMD

#include <inttypes.h>

static
void
hs_vk_verbose_statistics_amd(VkDevice device, struct hs_vk const * const hs)
{
  PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD =
    (PFN_vkGetShaderInfoAMD)vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD");

  if (vkGetShaderInfoAMD == NULL)
    return;

  fprintf(stdout,
          "                                    PHY   PHY AVAIL AVAIL\n"
          "VGPRs SGPRs LDS_MAX LDS/WG  SPILL VGPRs SGPRs VGPRs SGPRs WORKGROUP_SIZE\n");

  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
    {
      VkShaderStatisticsInfoAMD ssi_amd;
      size_t                    ssi_amd_size = sizeof(ssi_amd);

      if (vkGetShaderInfoAMD(hs->device,
                             hs->pipelines.all[ii],
                             VK_SHADER_STAGE_COMPUTE_BIT,
                             VK_SHADER_INFO_TYPE_STATISTICS_AMD,
                             &ssi_amd_size,
                             &ssi_amd) == VK_SUCCESS)
        {
          fprintf(stdout,
                  "%5" PRIu32 " "
                  "%5" PRIu32 " "
                  "%5" PRIu32 " "
                  "%6zu "
                  "%6zu "
                  "%5" PRIu32 " "
                  "%5" PRIu32 " "
                  "%5" PRIu32 " "
                  "%5" PRIu32 " "
                  "( %6" PRIu32 ", %6" PRIu32 ", %6" PRIu32 " )\n",
                  ssi_amd.resourceUsage.numUsedVgprs,
                  ssi_amd.resourceUsage.numUsedSgprs,
                  ssi_amd.resourceUsage.ldsSizePerLocalWorkGroup,
                  ssi_amd.resourceUsage.ldsUsageSizeInBytes,     // size_t
                  ssi_amd.resourceUsage.scratchMemUsageInBytes,  // size_t
                  ssi_amd.numPhysicalVgprs,
                  ssi_amd.numPhysicalSgprs,
                  ssi_amd.numAvailableVgprs,
                  ssi_amd.numAvailableSgprs,
                  ssi_amd.computeWorkGroupSize[0],
                  ssi_amd.computeWorkGroupSize[1],
                  ssi_amd.computeWorkGroupSize[2]);
        }
    }
}

#endif

//
//
//

#ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD

#include <stdlib.h>

static
void
hs_vk_verbose_disassembly_amd(VkDevice device, struct hs_vk const * const hs)
{
  PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD =
    (PFN_vkGetShaderInfoAMD)vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD");

  if (vkGetShaderInfoAMD == NULL)
    return;

  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
    {
      size_t disassembly_amd_size;

      if (vkGetShaderInfoAMD(hs->device,
                             hs->pipelines.all[ii],
                             VK_SHADER_STAGE_COMPUTE_BIT,
                             VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD,
                             &disassembly_amd_size,
                             NULL) == VK_SUCCESS)
        {
          void * disassembly_amd = malloc(disassembly_amd_size);

          if (vkGetShaderInfoAMD(hs->device,
                                 hs->pipelines.all[ii],
                                 VK_SHADER_STAGE_COMPUTE_BIT,
                                 VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD,
                                 &disassembly_amd_size,
                                 disassembly_amd) == VK_SUCCESS)
            {
              fprintf(stdout,"%s",(char*)disassembly_amd);
            }

          free(disassembly_amd);
        }
    }
}

#endif

//
//
//

struct hs_vk *
hs_vk_create(struct hs_vk_target const * const target,
             VkDevice                          device,
             VkAllocationCallbacks const *     allocator,
             VkPipelineCache                   pipeline_cache)
{
  //
  // we reference these values a lot
  //
  uint32_t const bs_slabs_log2_ru  = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
  uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));

  //
  // how many kernels will be created?
  //
  uint32_t const count_bs    = bs_slabs_log2_ru + 1;
  uint32_t const count_bc    = bc_slabs_log2_max + 1;
  uint32_t       count_fm[3] = { 0 };
  uint32_t       count_hm[3] = { 0 };

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.fm.scale_min;
       scale <= target->config.merge.fm.scale_max;
       scale++)
    {
      uint32_t fm_left = (target->config.block.slabs / 2) << scale;

      count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1;
    }

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.hm.scale_min;
       scale <= target->config.merge.hm.scale_max;
       scale++)
    {
      count_hm[scale] = 1;
    }

  uint32_t const count_bc_fm_hm_transpose =
    + count_bc
    + count_fm[0] + count_fm[1] + count_fm[2]
    + count_hm[0] + count_hm[1] + count_hm[2]
    + 1; // transpose

  uint32_t const count_all = count_bs + count_bc_fm_hm_transpose;
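  //
  // A worked example for a hypothetical target with block.slabs = 16,
  // merge.fm.scale_min/max = 0/1 and merge.hm.scale_min/max = 0/1:
  //
  //   bs_slabs_log2_ru  = 4  -->  count_bs    = 5
  //   bc_slabs_log2_max = 4  -->  count_bc    = 5
  //   fm_left(scale=0)  = 8  -->  count_fm[0] = 4
  //   fm_left(scale=1)  = 16 -->  count_fm[1] = 5
  //                               count_hm[0] = 1
  //                               count_hm[1] = 1
  //                               transpose   = 1
  //                               ---------------
  //                               count_all   = 22
  //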
  //
  // allocate hs_vk
  //
  struct hs_vk * hs;

  if (allocator == NULL)
    {
      hs = malloc(sizeof(*hs) + sizeof(VkPipeline) * count_all);
    }
  else
    {
      hs = allocator->pfnAllocation(NULL,
                                    sizeof(*hs) + sizeof(VkPipeline) * count_all,
                                    0,
                                    VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
    }

  // save device & allocator
  hs->device    = device;
  hs->allocator = allocator;

  //
  // create one descriptor set layout
  //
  static VkDescriptorSetLayoutBinding const dslb_vout_vin[] = {
    {
      .binding            = 0, // vout
      .descriptorType     = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount    = 1,
      .stageFlags         = VK_SHADER_STAGE_COMPUTE_BIT,
      .pImmutableSamplers = NULL
    },
    {
      .binding            = 1, // vin
      .descriptorType     = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount    = 1,
      .stageFlags         = VK_SHADER_STAGE_COMPUTE_BIT,
      .pImmutableSamplers = NULL
    }
  };

  static VkDescriptorSetLayoutCreateInfo const dscli = {
    .sType        = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
    .pNext        = NULL,
    .flags        = 0,
    .bindingCount = 2, // 0:vout[], 1:vin[]
    .pBindings    = dslb_vout_vin
  };

  vk(CreateDescriptorSetLayout(device,
                               &dscli,
                               allocator,
                               &hs->desc_set.layout.vout_vin));

  //
  // create one pipeline layout
  //
  VkPipelineLayoutCreateInfo plci = {
    .sType                  = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
    .pNext                  = NULL,
    .flags                  = 0,
    .setLayoutCount         = 1,
    .pSetLayouts            = &hs->desc_set.layout.vout_vin,
    .pushConstantRangeCount = 0,
    .pPushConstantRanges    = NULL
  };

  vk(CreatePipelineLayout(device,
                          &plci,
                          allocator,
                          &hs->pipeline.layout.vout_vin));

  //
  // copy the config from the target -- we need these values later
  //
  memcpy(&hs->config,&target->config,sizeof(hs->config));

  // save some frequently used calculated values
  hs->key_val_size      = (target->config.words.key + target->config.words.val) * 4;
  hs->slab_keys         = target->config.slab.height << target->config.slab.width_log2;
  hs->bs_slabs_log2_ru  = bs_slabs_log2_ru;
  hs->bc_slabs_log2_max = bc_slabs_log2_max;
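  //
  // For example, a hypothetical target sorting 64-bit keys with no
  // values (words.key = 2, words.val = 0) on 32-lane subgroups with a
  // slab height of 16 (slab.width_log2 = 5, slab.height = 16) yields:
  //
  //   key_val_size = (2 + 0) * 4 = 8 bytes
  //   slab_keys    = 16 << 5     = 512 keys per slab
  //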
  // save kernel count
  hs->pipelines.count = count_all;

  //
  // create all the compute pipelines by reusing this info
  //
  VkComputePipelineCreateInfo cpci = {
    .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
    .pNext = NULL,
    .flags = VK_PIPELINE_CREATE_DISPATCH_BASE, // | VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT,
    .stage = {
      .sType               = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .pNext               = NULL,
      .flags               = 0,
      .stage               = VK_SHADER_STAGE_COMPUTE_BIT,
      .module              = VK_NULL_HANDLE,
      .pName               = "main",
      .pSpecializationInfo = NULL
    },
    .layout             = hs->pipeline.layout.vout_vin,
    .basePipelineHandle = VK_NULL_HANDLE,
    .basePipelineIndex  = 0
  };

  //
  // Create a shader module, use it to create a pipeline... and
  // dispose of the shader module.
  //
  // The BS     compute shaders have the same layout
  // The non-BS compute shaders have the same layout
  //
  VkShaderModuleCreateInfo smci = {
    .sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
    .pNext    = NULL,
    .flags    = 0,
    .codeSize = 0,
    .pCode    = (uint32_t const *)target->modules // FIXME -- unfortunate typecast
  };

  //
  // bs kernels have layout: (vout,vin)
  // remaining have layout:  (vout)
  //
  for (uint32_t ii=0; ii<count_all; ii++)
    {
      // each SPIR-V module in the target blob is prefixed by its size
      // in dwords
      uint32_t const module_dwords = smci.pCode[0];

      smci.codeSize = module_dwords * sizeof(uint32_t);
      smci.pCode   += 1;

      vk(CreateShaderModule(device,
                            &smci,
                            allocator,
                            &cpci.stage.module));

      vk(CreateComputePipelines(device,
                                pipeline_cache,
                                1,
                                &cpci,
                                allocator,
                                hs->pipelines.all+ii));

      vkDestroyShaderModule(device,
                            cpci.stage.module,
                            allocator);

      // advance to the next module
      smci.pCode += module_dwords;
    }

  //
  // initialize pointers to pipeline handles
  //
  VkPipeline * pipeline_next = hs->pipelines.all;

  // BS
  hs->pipelines.bs = pipeline_next;
  pipeline_next   += count_bs;

  // BC
  hs->pipelines.bc = pipeline_next;
  pipeline_next   += count_bc;

  // FM[0]
  hs->pipelines.fm[0] = count_fm[0] ? pipeline_next : NULL;
  pipeline_next      += count_fm[0];

  // FM[1]
  hs->pipelines.fm[1] = count_fm[1] ? pipeline_next : NULL;
  pipeline_next      += count_fm[1];

  // FM[2]
  hs->pipelines.fm[2] = count_fm[2] ? pipeline_next : NULL;
  pipeline_next      += count_fm[2];

  // HM[0]
  hs->pipelines.hm[0] = count_hm[0] ? pipeline_next : NULL;
  pipeline_next      += count_hm[0];

  // HM[1]
  hs->pipelines.hm[1] = count_hm[1] ? pipeline_next : NULL;
  pipeline_next      += count_hm[1];

  // HM[2]
  hs->pipelines.hm[2] = count_hm[2] ? pipeline_next : NULL;
  pipeline_next      += count_hm[2];

  // TRANSPOSE
  hs->pipelines.transpose = pipeline_next;
  pipeline_next          += 1;

  //
  // optionally dump pipeline stats
  //
#ifdef HS_VK_VERBOSE_STATISTICS_AMD
  hs_vk_verbose_statistics_amd(device,hs);
#endif
#ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD
  hs_vk_verbose_disassembly_amd(device,hs);
#endif

  //
  //
  //
  return hs;
}

//
//
//

void
hs_vk_release(struct hs_vk * const hs)
{
  vkDestroyDescriptorSetLayout(hs->device,
                               hs->desc_set.layout.vout_vin,
                               hs->allocator);

  vkDestroyPipelineLayout(hs->device,
                          hs->pipeline.layout.vout_vin,
                          hs->allocator);

  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
    {
      vkDestroyPipeline(hs->device,
                        hs->pipelines.all[ii],
                        hs->allocator);
    }

  if (hs->allocator == NULL)
    {
      free(hs);
    }
  else
    {
      hs->allocator->pfnFree(NULL,hs);
    }
}

//
// Allocate a per-thread descriptor set for the vin and vout
// VkBuffers.  Note that HotSort uses only one descriptor set.
//

VkDescriptorSet
hs_vk_ds_alloc(struct hs_vk const * const hs, VkDescriptorPool desc_pool)
{
  VkDescriptorSetAllocateInfo const ds_alloc_info = {
    .sType              = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
    .pNext              = NULL,
    .descriptorPool     = desc_pool,
    .descriptorSetCount = 1,
    .pSetLayouts        = &hs->desc_set.layout.vout_vin
  };

  VkDescriptorSet hs_ds;

  vk(AllocateDescriptorSets(hs->device,
                            &ds_alloc_info,
                            &hs_ds));

  return hs_ds;
}
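//
// A minimal descriptor pool sketch for the allocation above, assuming
// the caller only ever needs this one HotSort descriptor set (the
// vout_vin layout declares two storage buffer bindings):
//
//   VkDescriptorPoolSize const dps = {
//     .type            = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
//     .descriptorCount = 2
//   };
//
//   VkDescriptorPoolCreateInfo const dpci = {
//     .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
//     .pNext         = NULL,
//     .flags         = 0,
//     .maxSets       = 1,
//     .poolSizeCount = 1,
//     .pPoolSizes    = &dps
//   };
//
//   VkDescriptorPool desc_pool;
//
//   vkCreateDescriptorPool(device,&dpci,NULL,&desc_pool);
//
//   VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);
//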
//
//
//

void
hs_vk_pad(struct hs_vk const * const hs,
          uint32_t             const count,
          uint32_t           * const count_padded_in,
          uint32_t           * const count_padded_out)
{
  //
  // round up the count to slabs
  //
  uint32_t const slabs_ru        = (count + hs->slab_keys - 1) / hs->slab_keys;
  uint32_t const blocks          = slabs_ru / hs->config.block.slabs;
  uint32_t const block_slabs     = blocks * hs->config.block.slabs;
  uint32_t const slabs_ru_rem    = slabs_ru - block_slabs;
  uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),
                                             hs->config.block.slabs);

  *count_padded_in  = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys;
  *count_padded_out = *count_padded_in;

  //
  // will merging be required?
  //
  if (slabs_ru > hs->config.block.slabs)
    {
      // more than one block
      uint32_t const blocks_lo       = pow2_rd_u32(blocks);
      uint32_t const block_slabs_lo  = blocks_lo * hs->config.block.slabs;
      uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;

      if (block_slabs_rem > 0)
        {
          uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);

          uint32_t const block_slabs_hi =
            MAX_MACRO(block_slabs_rem_ru,
                      blocks_lo << (1 - hs->config.merge.fm.scale_min));

          uint32_t const block_slabs_padded_out =
            MIN_MACRO(block_slabs_lo+block_slabs_hi,
                      block_slabs_lo*2); // clamp non-pow2 blocks

          *count_padded_out = block_slabs_padded_out * hs->slab_keys;
        }
    }
}

//
//
//
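//
// Worked example of hs_vk_pad() above, assuming a hypothetical target
// with slab_keys = 512, block.slabs = 16 and fm.scale_min = 0, and a
// sort of count = 26,500 keys:
//
//   slabs_ru         = ceil(26500/512)      = 52
//   blocks           = 52/16                = 3
//   block_slabs      = 48, rem = 4, rem_ru  = 4
//   count_padded_in  = (48 + 4) * 512       = 26,624
//
//   merging is required (52 > 16), so:
//
//   blocks_lo        = pow2_rd_u32(3)       = 2
//   block_slabs_lo   = 32, rem = 20, rem_ru = 32
//   block_slabs_hi   = max(32, 2 << 1)      = 32
//   padded_out slabs = min(32 + 32, 64)     = 64
//   count_padded_out = 64 * 512             = 32,768
//
// ... so for a sort that isn't in place, vout must be sized for more
// keys than vin in this case.
//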