/*
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "compiler/libcl/libcl_vk.h"
#include "agx_pack.h"
#include "geometry.h"
#include "libagx_dgc.h"

/*
 * To implement drawIndirectCount generically, we dispatch a kernel to
 * clone-and-patch the indirect buffer, predicating out draws as appropriate.
 */
KERNEL(32)
libagx_predicate_indirect(global uint32_t *out, constant uint32_t *in,
                          constant uint32_t *draw_count, uint32_t stride_el,
                          uint indexed__2)
{
   uint draw = get_global_id(0);

   /* VkDrawIndexedIndirectCommand is 5 dwords, VkDrawIndirectCommand is 4 */
   uint words = indexed__2 ? 5 : 4;

   bool enabled = draw < *draw_count;
   out += draw * words;
   in += draw * stride_el;

   /* Copy enabled draws, zero predicated draws. */
   for (uint i = 0; i < words; ++i) {
      out[i] = enabled ? in[i] : 0;
   }
}

/*
 * Indexing/offsetting is handled in software when necessary, so we strip all
 * indexing/offset information here.
 */
KERNEL(1)
libagx_draw_without_adj(global VkDrawIndirectCommand *out,
                        global VkDrawIndirectCommand *in,
                        global struct agx_ia_state *ia, uint64_t index_buffer,
                        uint64_t index_buffer_range_el, int index_size_B,
                        enum mesa_prim prim)
{
   *out = (VkDrawIndirectCommand){
      .vertexCount = libagx_remap_adj_count(in->vertexCount, prim),
      .instanceCount = in->instanceCount,
   };

   /* TODO: Deduplicate */
   if (index_size_B) {
      uint offs = in->firstVertex;

      ia->index_buffer = libagx_index_buffer(
         index_buffer, index_buffer_range_el, offs, index_size_B, 0);

      ia->index_buffer_range_el =
         libagx_index_buffer_range_el(index_buffer_range_el, offs);
   }
}

/* Precondition: len must be < the group size */
static void
libagx_memcpy_small(global uchar *dst, constant uchar *src, uint len, uint tid)
{
   if (tid < len) {
      dst[tid] = src[tid];
   }
}

static void
libagx_memcpy_aligned_uint4(global uint *dst, constant uint *src, uint len,
                            uint tid, uint group_size)
{
   /* len is in uint4 (16-byte) units; stride the chunks across the group */
   for (uint i = tid; i < len; i += group_size) {
      vstore4(vload4(i, src), i, dst);
   }
}

static void
libagx_memcpy_to_aligned(global uint *dst, constant uchar *src, uint len,
                         uint tid, uint group_size)
{
   /* Copy a few bytes at the start to word-align the source */
   uint start_unaligned = ((uintptr_t)src) & 3;
   if (start_unaligned) {
      uint need = 4 - start_unaligned;
      libagx_memcpy_small((global uchar *)dst, src, need, tid);

      src += need;
      len -= need;
   }

   /* Copy a few bytes at the end */
   uint end_unaligned = len & 0xf;
   len -= end_unaligned;
   libagx_memcpy_small(((global uchar *)dst) + len, src + len, end_unaligned,
                       tid);

   /* Now both src and dst are word-aligned, and len is a multiple of 16 */
   libagx_memcpy_aligned_uint4(dst, (constant uint *)src, len / 16, tid,
                               group_size);
}
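/*
 * Worked example (illustrative only, not used by the kernels): how
 * libagx_memcpy_to_aligned above decomposes a copy across a 32-thread group.
 * Suppose src sits 2 bytes past a word boundary and len = 100 bytes:
 *
 *  - head:   need = 4 - 2 = 2 bytes copied one byte per thread,
 *            then src += 2 and len = 98
 *  - tail:   98 & 0xf = 2 bytes copied one byte per thread,
 *            leaving len = 96
 *  - middle: 96 / 16 = 6 uint4 chunks; threads 0..5 each store one chunk,
 *            and larger copies wrap around every 32 chunks
 */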
/* Precondition: len must be < the group size */
static void
libagx_memset_small(global uchar *dst, uchar b, int len, uint tid)
{
   if (tid < len) {
      dst[tid] = b;
   }
}

/*
 * AGX does not implement robustBufferAccess2 semantics for index buffers,
 * where out-of-bounds indices read as zero. When we dynamically detect an
 * index buffer overread (the if-statement below), we need to clone the index
 * buffer and zero-extend it to get robustness.
 *
 * We do this dynamically (generating a VDM draw to consume the result) to
 * avoid expensive allocations & memcpys in the happy path where no
 * out-of-bounds access occurs. Otherwise we could use a hardware indirect
 * draw, rather than generating VDM words directly in the shader.
 *
 * TODO: Handle multiple draws in parallel.
 */
KERNEL(32)
libagx_draw_robust_index(global uint32_t *vdm,
                         global struct agx_geometry_state *heap,
                         constant VkDrawIndexedIndirectCommand *cmd,
                         uint64_t in_buf_ptr, uint32_t in_buf_range_B,
                         ushort restart, enum agx_primitive topology,
                         enum agx_index_size index_size__3)
{
   uint tid = get_sub_group_local_id();
   bool first = tid == 0;
   enum agx_index_size index_size = index_size__3;

   struct agx_draw draw = agx_draw_indexed(
      cmd->indexCount, cmd->instanceCount, cmd->firstIndex, cmd->vertexOffset,
      cmd->firstInstance, in_buf_ptr, in_buf_range_B, index_size, restart);

   if (agx_direct_draw_overreads_indices(draw)) {
      constant void *in_buf = (constant void *)agx_draw_index_buffer(draw);
      uint in_size_el = agx_draw_index_range_el(draw);
      uint in_size_B = agx_indices_to_B(in_size_el, index_size);

      /* After a small number of zeroes at the end, extra zeroes cannot change
       * rendering since they will duplicate the same degenerate primitive many
       * times. Therefore we clamp the number of zeroes we need to extend with.
       * This makes the memset constant time.
       */
      draw.b.count[0] = min(draw.b.count[0], in_size_el + 32);

      uint out_size_el = draw.b.count[0];
      uint out_size_B = agx_indices_to_B(out_size_el, index_size);

      /* Allocate memory for the shadow index buffer, then broadcast the
       * allocation from the first lane to the whole subgroup.
       */
      global uchar *padded;
      if (first) {
         padded = agx_heap_alloc_nonatomic(heap, out_size_B);
      }
      padded = (global uchar *)sub_group_broadcast((uintptr_t)padded, 0);

      /* The clone already starts at the draw's first index */
      draw.index_buffer = (uintptr_t)padded;
      draw.index_buffer_range_B = out_size_B;
      draw.start = 0;

      /* Clone the index buffer. The destination is aligned as a post-condition
       * of agx_heap_alloc_nonatomic.
       */
      libagx_memcpy_to_aligned((global uint *)padded, in_buf, in_size_B, tid,
                               32);

      /* Extend with up to 32 zeroes with a small memset */
      libagx_memset_small(padded + in_size_B, 0, out_size_B - in_size_B, tid);
   }

   if (first) {
      agx_vdm_draw(vdm, 0, draw, topology);
   }
}
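/*
 * Worked example for libagx_draw_robust_index above (illustrative, assumed
 * numbers): with 16-bit indices, suppose cmd->indexCount = 610 but the index
 * buffer range available to the draw only covers 600 indices, so a direct
 * draw would overread. The shadow buffer is sized min(610, 600 + 32) = 610
 * indices = 1220 bytes: the first 1200 bytes are cloned from the
 * application's index buffer and the trailing 20 bytes are zeroed, so the
 * out-of-bounds indices read as zero as robustBufferAccess2 requires.
 */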