/*
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "agx_pack.h"

#define agx_push(ptr, T, cfg)                                                  \
   for (unsigned _loop = 0; _loop < 1;                                         \
        ++_loop, ptr = (GLOBAL void *)(((uintptr_t)ptr) + AGX_##T##_LENGTH))   \
      agx_pack(ptr, T, cfg)

#define agx_push_packed(ptr, src, T)                                           \
   static_assert(sizeof(src) == AGX_##T##_LENGTH);                             \
   memcpy(ptr, &src, sizeof(src));                                             \
   ptr = (GLOBAL void *)(((uintptr_t)ptr) + sizeof(src));

static inline enum agx_index_size
agx_translate_index_size(uint8_t size_B)
{
   /* Index sizes are encoded logarithmically */
   static_assert(__builtin_ctz(1) == AGX_INDEX_SIZE_U8);
   static_assert(__builtin_ctz(2) == AGX_INDEX_SIZE_U16);
   static_assert(__builtin_ctz(4) == AGX_INDEX_SIZE_U32);

   assert((size_B == 1) || (size_B == 2) || (size_B == 4));
   return __builtin_ctz(size_B);
}

static inline unsigned
agx_indices_to_B(unsigned x, enum agx_index_size size)
{
   return x << size;
}

static inline uint8_t
agx_index_size_to_B(enum agx_index_size size)
{
   return agx_indices_to_B(1, size);
}

struct agx_workgroup {
   uint32_t x, y, z;
};

static inline struct agx_workgroup
agx_workgroup(uint32_t x, uint32_t y, uint32_t z)
{
   return (struct agx_workgroup){.x = x, .y = y, .z = z};
}

static inline unsigned
agx_workgroup_threads(struct agx_workgroup wg)
{
   return wg.x * wg.y * wg.z;
}

struct agx_grid {
   enum agx_cdm_mode mode;
   union {
      uint32_t count[3];
      uint64_t ptr;
   };
};

static struct agx_grid
agx_3d(uint32_t x, uint32_t y, uint32_t z)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_DIRECT, .count = {x, y, z}};
}

static struct agx_grid
agx_1d(uint32_t x)
{
   return agx_3d(x, 1, 1);
}

static struct agx_grid
agx_grid_indirect(uint64_t ptr)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_INDIRECT_GLOBAL, .ptr = ptr};
}

static struct agx_grid
agx_grid_indirect_local(uint64_t ptr)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_INDIRECT_LOCAL, .ptr = ptr};
}

static inline bool
agx_is_indirect(struct agx_grid grid)
{
   return grid.mode != AGX_CDM_MODE_DIRECT;
}

enum agx_barrier {
   /* No barrier/cache operations needed */
   AGX_BARRIER_NONE = 0,

   /* Catch-all for all defined barriers. Because we have not yet
    * reverse-engineered the finer details here, this is the only barrier we
    * have.
    */
   AGX_BARRIER_ALL = (1 << 0),
};

struct agx_draw {
   struct agx_grid b;
   uint64_t index_buffer;
   uint32_t index_buffer_range_B;
   uint32_t start;
   uint32_t index_bias;
   uint32_t start_instance;

   /* Primitive restart enabled. If true, implies indexed */
   bool restart;

   enum agx_index_size index_size;

   /* TODO: Optimize this boolean. We can't just check if index_buffer != 0
    * because that breaks with null index buffers.
    */
   bool indexed;
};
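/*
 * Illustrative sketch (not part of this header's interface): how the helpers
 * above fit together. The counts and sizes are arbitrary example values.
 *
 *    enum agx_index_size size = agx_translate_index_size(2);
 *                               // == AGX_INDEX_SIZE_U16
 *    unsigned bytes = agx_indices_to_B(100, size); // 100 << 1 == 200 bytes
 *
 *    struct agx_workgroup wg = agx_workgroup(64, 1, 1); // 64 threads/group
 *    struct agx_grid grid = agx_1d(2048); // AGX_CDM_MODE_DIRECT, {2048, 1, 1}
 *    assert(!agx_is_indirect(grid));
 */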
static inline struct agx_draw
agx_draw_indirect(uint64_t ptr)
{
   return (struct agx_draw){.b = agx_grid_indirect(ptr)};
}

static inline struct agx_draw
agx_draw_indexed(uint32_t index_count, uint32_t instance_count,
                 uint32_t first_index, uint32_t index_bias,
                 uint32_t first_instance, uint64_t buf, uint32_t range_B,
                 enum agx_index_size index_size, bool restart)
{
   return (struct agx_draw){
      .b = agx_3d(index_count, instance_count, 1),
      .index_buffer = buf,
      .index_buffer_range_B = range_B,
      .start = first_index,
      .index_bias = index_bias,
      .start_instance = first_instance,
      .index_size = index_size,
      .restart = restart,
      .indexed = true,
   };
}

static inline struct agx_draw
agx_draw_indexed_indirect(uint64_t ptr, uint64_t buf, uint32_t range_B,
                          enum agx_index_size index_size, bool restart)
{
   return (struct agx_draw){
      .b = agx_grid_indirect(ptr),
      .index_buffer = buf,
      .index_buffer_range_B = range_B,
      .index_size = index_size,
      .restart = restart,
      .indexed = true,
   };
}

static inline unsigned
agx_draw_index_range_B(struct agx_draw d)
{
   uint range_B = d.index_buffer_range_B;
   if (!agx_is_indirect(d.b))
      range_B -= agx_indices_to_B(d.start, d.index_size);

   return range_B;
}

static inline unsigned
agx_draw_index_range_el(struct agx_draw d)
{
   assert(d.indexed);
   return agx_draw_index_range_B(d) >> d.index_size;
}

static inline uint64_t
agx_draw_index_buffer(struct agx_draw d)
{
   assert(d.indexed);

   uint64_t ib = d.index_buffer;
   if (!agx_is_indirect(d.b))
      ib += agx_indices_to_B(d.start, d.index_size);

   return ib;
}

static bool
agx_direct_draw_overreads_indices(struct agx_draw d)
{
   uint32_t range_B = agx_indices_to_B(d.start + d.b.count[0], d.index_size);
   return range_B > d.index_buffer_range_B;
}

enum agx_chip {
   AGX_CHIP_G13G,
   AGX_CHIP_G13X,
   AGX_CHIP_G14G,
   AGX_CHIP_G14X,
};

static inline GLOBAL uint32_t *
agx_cdm_launch(GLOBAL uint32_t *out, enum agx_chip chip, struct agx_grid grid,
               struct agx_workgroup wg,
               struct agx_cdm_launch_word_0_packed launch, uint32_t usc)
{
#ifndef __OPENCL_VERSION__
   struct agx_cdm_launch_word_0_packed mode;
   agx_pack(&mode, CDM_LAUNCH_WORD_0, cfg) {
      cfg.mode = grid.mode;
   }

   agx_merge(launch, mode, CDM_LAUNCH_WORD_0);
#endif

   agx_push_packed(out, launch, CDM_LAUNCH_WORD_0);

   agx_push(out, CDM_LAUNCH_WORD_1, cfg) {
      cfg.pipeline = usc;
   }

   if (chip == AGX_CHIP_G14X) {
      agx_push(out, CDM_UNK_G14X, cfg)
         ;
   }

   if (agx_is_indirect(grid)) {
      agx_push(out, CDM_INDIRECT, cfg) {
         cfg.address_hi = grid.ptr >> 32;
         cfg.address_lo = grid.ptr;
      }
   } else {
      agx_push(out, CDM_GLOBAL_SIZE, cfg) {
         cfg.x = grid.count[0];
         cfg.y = grid.count[1];
         cfg.z = grid.count[2];
      }
   }

   if (grid.mode != AGX_CDM_MODE_INDIRECT_LOCAL) {
      agx_push(out, CDM_LOCAL_SIZE, cfg) {
         cfg.x = wg.x;
         cfg.y = wg.y;
         cfg.z = wg.z;
      }
   }

   return out;
}
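/*
 * Illustrative sketch of emitting a minimal CDM control stream with the
 * helpers in this file (agx_cdm_barrier/agx_cdm_terminate are defined further
 * down). "cs_map", "launch0" and "usc_handle" are hypothetical: a mapped
 * control stream buffer, a pre-packed CDM_LAUNCH_WORD_0, and a USC pipeline
 * address prepared by the caller.
 *
 *    GLOBAL uint32_t *out = cs_map;
 *    out = agx_cdm_launch(out, AGX_CHIP_G13G, agx_1d(2048),
 *                         agx_workgroup(64, 1, 1), launch0, usc_handle);
 *    out = agx_cdm_barrier(out, AGX_CHIP_G13G);
 *    out = agx_cdm_terminate(out);
 */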
static inline GLOBAL uint32_t *
agx_vdm_draw(GLOBAL uint32_t *out, enum agx_chip chip, struct agx_draw draw,
             enum agx_primitive topology)
{
   uint64_t ib = draw.indexed ? agx_draw_index_buffer(draw) : 0;

   agx_push(out, INDEX_LIST, cfg) {
      cfg.primitive = topology;

      if (agx_is_indirect(draw.b)) {
         cfg.indirect_buffer_present = true;
      } else {
         cfg.instance_count_present = true;
         cfg.index_count_present = true;
         cfg.start_present = true;
      }

      if (draw.indexed) {
         cfg.restart_enable = draw.restart;
         cfg.index_buffer_hi = ib >> 32;
         cfg.index_size = draw.index_size;

         cfg.index_buffer_present = true;
         cfg.index_buffer_size_present = true;
      }
   }

   if (draw.indexed) {
      agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
         cfg.buffer_lo = ib;
      }
   }

   if (agx_is_indirect(draw.b)) {
      agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
         cfg.address_hi = draw.b.ptr >> 32;
         cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32);
      }
   } else {
      agx_push(out, INDEX_LIST_COUNT, cfg) {
         cfg.count = draw.b.count[0];
      }

      agx_push(out, INDEX_LIST_INSTANCES, cfg) {
         cfg.count = draw.b.count[1];
      }

      agx_push(out, INDEX_LIST_START, cfg) {
         cfg.start = draw.indexed ? draw.index_bias : draw.start;
      }
   }

   if (draw.indexed) {
      agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
         cfg.size = align(agx_draw_index_range_B(draw), 4);
      }
   }

   return out;
}

static inline uint32_t
agx_vdm_draw_size(enum agx_chip chip, struct agx_draw draw)
{
   uint32_t size = AGX_INDEX_LIST_LENGTH;

   if (agx_is_indirect(draw.b)) {
      size += AGX_INDEX_LIST_INDIRECT_BUFFER_LENGTH;
   } else {
      size += AGX_INDEX_LIST_COUNT_LENGTH;
      size += AGX_INDEX_LIST_INSTANCES_LENGTH;
      size += AGX_INDEX_LIST_START_LENGTH;
   }

   if (draw.indexed) {
      size += AGX_INDEX_LIST_BUFFER_LO_LENGTH;
      size += AGX_INDEX_LIST_BUFFER_SIZE_LENGTH;
   }

   return size;
}
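/*
 * Illustrative sketch of pairing agx_vdm_draw_size() with agx_vdm_draw():
 * size first, reserve that many bytes in the VDM stream, then encode.
 * "ib_gpu_va", "out" and "topology" are hypothetical (the GPU VA of a 16-bit
 * index buffer, a mapped stream cursor, and an enum agx_primitive value).
 *
 *    struct agx_draw draw =
 *       agx_draw_indexed(36, 1, 0, 0, 0, ib_gpu_va, 36 * 2,
 *                        AGX_INDEX_SIZE_U16, false);
 *
 *    uint32_t size_B = agx_vdm_draw_size(AGX_CHIP_G13G, draw);
 *    // ...reserve size_B bytes at "out", then:
 *    out = agx_vdm_draw(out, AGX_CHIP_G13G, draw, topology);
 */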
static inline GLOBAL uint32_t *
agx_cdm_barrier(GLOBAL uint32_t *out, enum agx_chip chip)
{
   agx_push(out, CDM_BARRIER, cfg) {
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_8 = true;
      // cfg.unk_11 = true;
      // cfg.unk_20 = true;
      // cfg.unk_24 = true; if clustered?

      if (chip == AGX_CHIP_G13X) {
         cfg.unk_4 = true;
         // cfg.unk_26 = true;
      }

      /* With multiple launches in the same CDM stream, we can get cache
       * coherency (? or sync?) issues. We hit this with blits, which need the
       * PBE cache to be flushed and the texture cache to be invalidated
       * between dispatches. Until we know exactly what each bit means, just
       * set all of these after every launch to be safe. We can revisit once
       * the individual bits are understood.
       */
      cfg.unk_0 = true;
      cfg.unk_1 = true;
      cfg.unk_2 = true;
      cfg.usc_cache_inval = true;
      cfg.unk_4 = true;
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_7 = true;
      cfg.unk_8 = true;
      cfg.unk_9 = true;
      cfg.unk_10 = true;
      cfg.unk_11 = true;
      cfg.unk_12 = true;
      cfg.unk_13 = true;
      cfg.unk_14 = true;
      cfg.unk_15 = true;
      cfg.unk_16 = true;
      cfg.unk_17 = true;
      cfg.unk_18 = true;
      cfg.unk_19 = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_return(GLOBAL uint32_t *out)
{
   agx_push(out, VDM_BARRIER, cfg) {
      cfg.returns = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_return(GLOBAL uint32_t *out)
{
   agx_push(out, CDM_STREAM_RETURN, cfg)
      ;

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_terminate(GLOBAL uint32_t *out)
{
   agx_push(out, CDM_STREAM_TERMINATE, _)
      ;

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_terminate(GLOBAL uint32_t *out)
{
   agx_push(out, VDM_STREAM_TERMINATE, _)
      ;

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_jump(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, CDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_jump(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, VDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_cs_jump(GLOBAL uint32_t *out, uint64_t target, bool vdm)
{
   return vdm ? agx_vdm_jump(out, target) : agx_cdm_jump(out, target);
}

static inline GLOBAL uint32_t *
agx_cdm_call(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, CDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
      cfg.with_return = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_call(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, VDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
      cfg.with_return = true;
   }

   return out;
}

#define AGX_MAX_LINKED_USC_SIZE                                                \
   (AGX_USC_PRESHADER_LENGTH + AGX_USC_FRAGMENT_PROPERTIES_LENGTH +            \
    AGX_USC_REGISTERS_LENGTH + AGX_USC_SHADER_LENGTH + AGX_USC_SHARED_LENGTH + \
    AGX_USC_SAMPLER_LENGTH + (AGX_USC_UNIFORM_LENGTH * 9))
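/*
 * Illustrative sketch of linking control stream buffers with the helpers
 * above. "chunk_gpu_va" is a hypothetical GPU address of another stream
 * fragment.
 *
 *    // Unconditional jump: execution continues in the other buffer.
 *    out = agx_cdm_jump(out, chunk_gpu_va);
 *
 *    // Call/return: the callee is expected to end with agx_cdm_return() so
 *    // that execution resumes after the link word.
 *    out = agx_cdm_call(out, chunk_gpu_va);
 */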
/*
 * This data structure contains everything needed to dispatch a compute shader
 * (and hopefully eventually graphics?).
 *
 * It is purely flat, no CPU pointers. That makes it suitable for sharing
 * between CPU and GPU. The intention is that it is packed on the CPU side and
 * then consumed on either host or device for dispatching work.
 */
struct agx_shader {
   struct agx_cdm_launch_word_0_packed launch;
   struct agx_workgroup workgroup;

   struct {
      uint32_t size;
      uint8_t data[AGX_MAX_LINKED_USC_SIZE];
   } usc;
};

/* Opaque structure representing a USC program being constructed */
struct agx_usc_builder {
   GLOBAL uint8_t *head;

#ifndef NDEBUG
   uint8_t *begin;
   size_t size;
#endif
} PACKED;

static struct agx_usc_builder
agx_usc_builder(GLOBAL void *out, ASSERTED size_t size)
{
   return (struct agx_usc_builder){
      .head = out,

#ifndef NDEBUG
      .begin = out,
      .size = size,
#endif
   };
}

static bool
agx_usc_builder_validate(struct agx_usc_builder *b, size_t size)
{
#ifndef NDEBUG
   assert(((b->head - b->begin) + size) <= b->size);
#endif

   return true;
}

#define agx_usc_pack(b, struct_name, template)                                 \
   for (bool it =                                                              \
           agx_usc_builder_validate((b), AGX_USC_##struct_name##_LENGTH);      \
        it; it = false, (b)->head += AGX_USC_##struct_name##_LENGTH)           \
      agx_pack((b)->head, USC_##struct_name, template)

#define agx_usc_push_blob(b, blob, length)                                     \
   for (bool it = agx_usc_builder_validate((b), length); it;                   \
        it = false, (b)->head += length)                                       \
      memcpy((b)->head, blob, length);

#define agx_usc_push_packed(b, struct_name, packed)                            \
   agx_usc_push_blob(b, packed.opaque, AGX_USC_##struct_name##_LENGTH);

static void
agx_usc_uniform(struct agx_usc_builder *b, unsigned start_halfs,
                unsigned size_halfs, uint64_t buffer)
{
   assert((start_halfs + size_halfs) <= (1 << 9) && "uniform file overflow");
   assert(size_halfs <= 64 && "caller's responsibility to split");
   assert(size_halfs > 0 && "no empty uniforms");

   if (start_halfs & BITFIELD_BIT(8)) {
      agx_usc_pack(b, UNIFORM_HIGH, cfg) {
         cfg.start_halfs = start_halfs & BITFIELD_MASK(8);
         cfg.size_halfs = size_halfs;
         cfg.buffer = buffer;
      }
   } else {
      agx_usc_pack(b, UNIFORM, cfg) {
         cfg.start_halfs = start_halfs;
         cfg.size_halfs = size_halfs;
         cfg.buffer = buffer;
      }
   }
}

static inline void
agx_usc_words_precomp(GLOBAL uint32_t *out, CONST struct agx_shader *s,
                      uint64_t data, unsigned data_size)
{
   /* Map the data directly as uniforms starting at u0 */
   struct agx_usc_builder b = agx_usc_builder(out, sizeof(s->usc.data));
   agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2), data);
   agx_usc_push_blob(&b, s->usc.data, s->usc.size);
}

/* This prototype is sufficient for sizing the output */
static inline unsigned
libagx_draw_robust_index_vdm_size()
{
   struct agx_draw draw = agx_draw_indexed(0, 0, 0, 0, 0, 0, 0, 0, 0);
   return agx_vdm_draw_size(0, draw);
}

static inline unsigned
libagx_remap_adj_count(unsigned count, enum mesa_prim prim)
{
   if (prim == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
      /* Spec gives formula for # of primitives in a tri strip adj */
      unsigned c4 = count >= 4 ? count - 4 : 0;
      return 3 * (c4 / 2);
   } else if (prim == MESA_PRIM_LINE_STRIP_ADJACENCY) {
      return 2 * (count >= 3 ? count - 3 : 0);
   } else {
      /* Adjacency lists just drop half the vertices. */
      return count / 2;
   }
}
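/*
 * Worked examples for libagx_remap_adj_count (illustrative only):
 *
 *    MESA_PRIM_TRIANGLE_STRIP_ADJACENCY, count = 10:
 *       c4 = 10 - 4 = 6, so 3 * (6 / 2) = 9 remapped vertices (3 triangles).
 *
 *    MESA_PRIM_LINE_STRIP_ADJACENCY, count = 7:
 *       2 * (7 - 3) = 8 remapped vertices (4 lines).
 *
 *    MESA_PRIM_TRIANGLES_ADJACENCY, count = 12:
 *       12 / 2 = 6 remapped vertices (2 triangles).
 */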