/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_DEVICE_H
#define TU_DEVICE_H

#include "tu_common.h"

#include "vk_device_memory.h"

#include "tu_autotune.h"
#include "tu_cs.h"
#include "tu_pass.h"
#include "tu_perfetto.h"
#include "tu_suballoc.h"
#include "tu_util.h"

#include "common/freedreno_rd_output.h"

#include "util/vma.h"
#include "util/u_vector.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 4096

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_COPY_MS_HALF,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit
    * platforms.
    */
   alignas(8) VkDeviceSize used;
};
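/* A minimal sketch (not actual driver code) of how an allocation could be
 * charged against `used` above. tu_heap_charge() is a hypothetical helper;
 * p_atomic_add_return()/p_atomic_add() are assumed to come from Mesa's
 * util/u_atomic.h, and the 8-byte alignment above is what keeps these
 * atomics cheap on 32-bit CPUs.
 *
 *    static inline VkResult
 *    tu_heap_charge(struct tu_memory_heap *heap, VkDeviceSize size)
 *    {
 *       uint64_t used = p_atomic_add_return(&heap->used, size);
 *       if (used > heap->size) {
 *          p_atomic_add(&heap->used, -(int64_t) size);  // roll back
 *          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 *       }
 *       return VK_SUCCESS;
 *    }
 */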
enum tu_kgsl_dma_type {
   TU_KGSL_DMA_TYPE_ION_LEGACY,
   TU_KGSL_DMA_TYPE_ION,
   TU_KGSL_DMA_TYPE_DMAHEAP,
};

extern uint64_t os_page_size;

struct tu_physical_device {
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   char fd_path[20];
   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   int kgsl_dma_fd;
   enum tu_kgsl_dma_type kgsl_dma_type;

   uint32_t gmem_size;
   uint64_t gmem_base;

   uint32_t usable_gmem_size_gmem;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;
   uint32_t ccu_depth_offset_bypass;
   uint32_t vpc_attr_buf_offset_gmem;
   uint32_t vpc_attr_buf_size_gmem;
   uint32_t vpc_attr_buf_offset_bypass;
   uint32_t vpc_attr_buf_size_bypass;

   /* Number of usable descriptor sets; this excludes any reserved set. */
   uint32_t usable_sets;
   /* Index of the reserved descriptor set; may be -1 if unset. */
   int32_t reserved_set_idx;

   bool has_set_iova;
   uint64_t va_start;
   uint64_t va_size;

   bool has_cached_coherent_memory;
   bool has_cached_non_coherent_memory;
   uintptr_t level1_dcache_size;

   struct fdl_ubwc_config ubwc_config;

   bool has_preemption;

   struct {
      uint32_t type_count;
      VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
   } memory;

   struct fd_dev_id dev_id;
   struct fd_dev_info dev_info;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* with 0 being the highest priority */
   uint32_t submitqueue_priority_count;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];

   uint32_t device_count;
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)

struct tu_knl;

struct tu_instance {
   struct vk_instance vk;

   const struct tu_knl *knl;

   uint32_t instance_idx;
   uint32_t api_version;

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   bool dont_care_as_load;

   /* Conservative LRZ (default true) invalidates LRZ on draws with
    * blend and depth-write enabled, because this can lead to incorrect
    * rendering. Driconf can be used to disable conservative LRZ for
    * games which do not have the problematic sequence of draws *and*
    * suffer a performance loss with conservative LRZ.
    */
   bool conservative_lrz;

   /* Whether to internally reserve a descriptor set for descriptor set
    * dynamic offsets. Disabling this frees up one more descriptor set,
    * at the cost of being unable to use the feature. As the feature is
    * part of core Vulkan, it is enabled by default.
    */
   bool reserve_descriptor_set;

   /* Allow out-of-bounds UBO access by disabling the lowering of UBO loads
    * for indirect access. That lowering relies on the UBO bounds specified
    * in the shader, rather than on the bound UBO size, which isn't known
    * until draw time.
    *
    * See: https://github.com/doitsujin/dxvk/issues/3861
    */
   bool allow_oob_indirect_ubo_loads;

   /* DXVK and VKD3D-Proton use customBorderColorWithoutFormat and create
    * most D24S8 images with USAGE_SAMPLED, in which case we would have to
    * disable UBWC for correctness. However, games don't use border colors
    * for depth-stencil images, so we elect to ignore this edge case and
    * force UBWC to be enabled.
    */
   bool disable_d24s8_border_color_workaround;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)

/* This struct defines the layout of the global_bo */
struct tu6_global {
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, starts on a 32 byte
    * boundary.
    */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   alignas(16) uint32_t cs_indirect_xyz[12];

   uint32_t vsc_state[32];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   volatile uint32_t userspace_fence;
   uint32_t _pad5;

   /* note: a larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) \
   ((cmd)->device->global_bo->iova + gb_offset(member))
#define global_iova_arr(cmd, member, idx) \
   (global_iova(cmd, member) + sizeof_field(struct tu6_global, member[0]) * (idx))
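/* Hypothetical usage sketch for the helpers above: they resolve a
 * tu6_global member to its GPU address, i.e. global_bo->iova plus the
 * member's offsetof(), so addresses stay correct if the layout changes.
 * `cmd` stands for any struct tu_cmd_buffer.
 *
 *    uint64_t fence_iova  = global_iova(cmd, userspace_fence);
 *    uint64_t flush2_iova = global_iova_arr(cmd, flush_base, 2);
 */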
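/* Illustrative sketch (assumptions, not driver code) of how the CPU side
 * could check the VSC overflow flags that the GPU writes into tu6_global,
 * read through the device's global_bo_map pointer once a submission has
 * completed:
 *
 *    const struct tu6_global *global = dev->global_bo_map;
 *    if (global->vsc_draw_overflow || global->vsc_prim_overflow) {
 *       // A bin stream overran its buffer: grow vsc_draw_strm_pitch /
 *       // vsc_prim_strm_pitch rather than trusting truncated
 *       // visibility data.
 *    }
 */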
struct tu_pvtmem_bo {
   mtx_t mtx;
   struct tu_bo *bo;
   uint32_t per_fiber_size, per_sp_size;
};

struct tu_virtio_device;
struct tu_queue;

struct tu_device {
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   uint32_t device_idx;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_pvtmem_bo fiber_pvtmem_bo, wave_pvtmem_bo;

   struct tu_bo *global_bo;
   struct tu6_global *global_bo_map;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state. Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management for small
    * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* KGSL requires a small chunk of GPU memory to retrieve raw GPU time on
    * each submission.
    */
   struct tu_suballocator kgsl_profiling_suballoc;
   mtx_t kgsl_profiling_mutex;

   /* The blob driver seems to always use 8K factor and 128K param sizes;
    * copy them.
    */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   struct tu_shader *empty_tcs, *empty_tes, *empty_gs, *empty_fs,
                    *empty_fs_fdm;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   mtx_t vma_mutex;
   struct util_vma_heap vma;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *submit_bo_list;
   /* map bo handles to bo list index: */
   uint32_t submit_bo_count, submit_bo_list_size;

   /* bo list for dumping: */
   struct util_dynarray dump_bo_list;

   mtx_t bo_mutex;

   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* Tracking of name -> size allocated for TU_DEBUG_BOS */
   struct hash_table *bo_sizes;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different
    * BO handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device.
    */
   struct util_sparse_array bo_map;
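   /* A rough sketch (hypothetical; field names illustrative) of the import
    * path described above: look the GEM handle up in bo_map and, when the
    * kernel handed back a handle we already track (self-import), take a
    * reference on the existing wrapper instead of creating a second one.
    *
    *    struct tu_bo *bo = tu_device_lookup_bo(device, gem_handle);
    *    if (bo->gem_handle) {
    *       // Self-import: same handle, one underlying BO; bump refcount.
    *       p_atomic_inc(&bo->refcnt);
    *    } else {
    *       // First reference: initialize the tu_bo entry in place.
    *    }
    */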
   /* We cannot immediately free a BO's VMA when freeing the BO; the kernel
    * only truly frees a BO once it stops being busy, so we have to free our
    * VMA only after the kernel does it.
    */
   struct u_vector zombie_vmas;

   struct tu_cs sub_cs;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   struct tu_cs_entry cmdbuf_start_a725_quirk_entry;

   struct tu_cs_entry bin_preamble_entry;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed.
    */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

   struct tu_cs *dbg_cmdbuf_stomp_cs;
   struct tu_cs *dbg_renderpass_stomp_cs;

#ifdef TU_HAS_VIRTIO
   struct tu_virtio_device *vdev;
#endif

   uint32_t submit_count;

   /* Address space and global fault count for this local_fd with the DRM
    * backend.
    */
   uint64_t fault_count;

   struct u_trace_context trace_context;

#ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
#endif

   bool use_z24uint_s8uint;
   bool use_lrz;

   struct fd_rd_output rd_output;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

struct tu_device_memory {
   struct vk_device_memory vk;

   struct tu_bo *bo;

   /* for dedicated allocations */
   struct tu_image *image;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

struct tu_attachment_info {
   struct tu_image_view *attachment;
};

struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether using GMEM is even possible with this configuration */
   bool possible;

   /* Whether binning should be used for gmem rendering using this
    * framebuffer.
    */
   bool binning;

   /* Whether binning could be used for gmem rendering using this
    * framebuffer.
    */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};

struct tu_framebuffer {
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)

uint64_t
tu_get_system_heap_size(struct tu_physical_device *physical_device);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

void
tu_physical_device_get_global_priority_properties(const struct tu_physical_device *pdevice,
                                                  VkQueueFamilyGlobalPriorityPropertiesKHR *props);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}

struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
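/* Hypothetical usage sketch: fetch a scratch BO for a transient buffer, use
 * it within the command buffer, then simply stop referencing it. The device
 * owns and recycles scratch BOs, so the caller must not free one, and only
 * one scratch BO may be in use at a time (see above).
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, 4096, &scratch);
 *    if (result != VK_SUCCESS)
 *       return result;
 *    // Emit packets writing to and reading from scratch->iova here.
 */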
void
tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                             const VkRenderingInfo *pRenderingInfo);

void
tu_copy_buffer(struct u_trace_context *utctx, void *cmdstream,
               void *ts_from, uint64_t from_offset_B,
               void *ts_to, uint64_t to_offset_B,
               uint64_t size_B);

VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf,
                            struct tu_cs **cs,
                            struct u_trace **trace_copy);

/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data {
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and clean up all associated
 * resources afterwards.
 */
struct tu_u_trace_submission_data {
   uint32_t submission_id;
   /* We have to know when timestamps are available; this queue and fence
    * indicate that.
    */
   struct tu_queue *queue;
   uint32_t fence;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;

   /* GPU time is reset on GPU power cycles, so the GPU time offset may
    * change between submissions.
    */
   uint64_t gpu_ts_offset;

   /* KGSL needs GPU memory to write submission timestamps into */
   struct tu_suballoc_bo kgsl_timestamp_bo;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name);
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);

void tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo);
void tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo);

#endif /* TU_DEVICE_H */