/*
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "vulkan/util/vk_util.h"
#include "util/blob.h"
#include "nir/nir_serialize.h"

static const bool dump_stats = false;
static const bool dump_stats_verbose = false;

/* Shared for nir/variants */
#define V3DV_MAX_PIPELINE_CACHE_ENTRIES 4096

static uint32_t
sha1_hash_func(const void *sha1)
{
   return _mesa_hash_data(sha1, 20);
}

static bool
sha1_compare_func(const void *sha1_a, const void *sha1_b)
{
   return memcmp(sha1_a, sha1_b, 20) == 0;
}

struct serialized_nir {
   unsigned char sha1_key[20];
   size_t size;
   char data[0];
};

static void
cache_dump_stats(struct v3dv_pipeline_cache *cache)
{
   if (!dump_stats_verbose)
      return;

   fprintf(stderr, " NIR cache entries: %d\n", cache->nir_stats.count);
   fprintf(stderr, " NIR cache miss count: %d\n", cache->nir_stats.miss);
   fprintf(stderr, " NIR cache hit count: %d\n", cache->nir_stats.hit);

   fprintf(stderr, " variant cache entries: %d\n", cache->variant_stats.count);
   fprintf(stderr, " variant cache miss count: %d\n", cache->variant_stats.miss);
   fprintf(stderr, " variant cache hit count: %d\n", cache->variant_stats.hit);
}
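/* Serializes the given NIR shader and stores it in the cache, keyed by
 * sha1_key. The serialized copy is ralloc'ed off the nir_cache hash table,
 * so it is freed together with the cache.
 */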
void
v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline,
                               struct v3dv_pipeline_cache *cache,
                               nir_shader *nir,
                               unsigned char sha1_key[20])
{
   if (!cache || !cache->nir_cache)
      return;

   if (cache->nir_stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES)
      return;

   pthread_mutex_lock(&cache->mutex);
   struct hash_entry *entry =
      _mesa_hash_table_search(cache->nir_cache, sha1_key);
   pthread_mutex_unlock(&cache->mutex);
   if (entry)
      return;

   struct blob blob;
   blob_init(&blob);

   nir_serialize(&blob, nir, false);
   if (blob.out_of_memory) {
      blob_finish(&blob);
      return;
   }

   pthread_mutex_lock(&cache->mutex);
   /* Because ralloc isn't thread-safe, we have to do all this inside the
    * lock. We could unlock for the big memcpy but it's probably not worth
    * the hassle.
    */
   entry = _mesa_hash_table_search(cache->nir_cache, sha1_key);
   if (entry) {
      blob_finish(&blob);
      pthread_mutex_unlock(&cache->mutex);
      return;
   }

   struct serialized_nir *snir =
      ralloc_size(cache->nir_cache, sizeof(*snir) + blob.size);
   memcpy(snir->sha1_key, sha1_key, 20);
   snir->size = blob.size;
   memcpy(snir->data, blob.data, blob.size);

   blob_finish(&blob);

   cache->nir_stats.count++;
   if (unlikely(dump_stats)) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, snir->sha1_key);
      fprintf(stderr, "pipeline cache %p, new nir entry %s\n", cache, sha1buf);
      cache_dump_stats(cache);
   }

   _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);

   pthread_mutex_unlock(&cache->mutex);
}

nir_shader *
v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
                                   struct v3dv_pipeline_cache *cache,
                                   const nir_shader_compiler_options *nir_options,
                                   unsigned char sha1_key[20])
{
   if (!cache || !cache->nir_cache)
      return NULL;

   if (unlikely(dump_stats)) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, sha1_key);

      fprintf(stderr, "pipeline cache %p, search for nir %s\n", cache, sha1buf);
   }

   const struct serialized_nir *snir = NULL;

   pthread_mutex_lock(&cache->mutex);
   struct hash_entry *entry =
      _mesa_hash_table_search(cache->nir_cache, sha1_key);
   if (entry)
      snir = entry->data;
   pthread_mutex_unlock(&cache->mutex);

   if (snir) {
      struct blob_reader blob;
      blob_reader_init(&blob, snir->data, snir->size);

      /* We use a NULL context because we want the p_stage to keep the
       * reference to the nir, as we keep open the possibility of providing
       * a shader variant after cache creation.
       */
      nir_shader *nir = nir_deserialize(NULL, nir_options, &blob);
      if (blob.overrun) {
         ralloc_free(nir);
      } else {
         cache->nir_stats.hit++;
         cache_dump_stats(cache);
         return nir;
      }
   }

   cache->nir_stats.miss++;
   cache_dump_stats(cache);

   return NULL;
}

void
v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache,
                         struct v3dv_device *device,
                         bool cache_enabled)
{
   cache->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
   cache->device = device;
   pthread_mutex_init(&cache->mutex, NULL);

   if (cache_enabled) {
      cache->nir_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
                                                 sha1_compare_func);
      cache->nir_stats.miss = 0;
      cache->nir_stats.hit = 0;
      cache->nir_stats.count = 0;

      cache->variant_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
                                                     sha1_compare_func);
      cache->variant_stats.miss = 0;
      cache->variant_stats.hit = 0;
      cache->variant_stats.count = 0;
   } else {
      cache->nir_cache = NULL;
      cache->variant_cache = NULL;
   }
}
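/* Looks up a compiled shader variant by its SHA-1 key. On a hit, a
 * reference is taken on the variant before it is returned, so the caller
 * owns that reference.
 */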
struct v3dv_shader_variant *
v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
                                       struct v3dv_pipeline_cache *cache,
                                       unsigned char sha1_key[20])
{
   if (!cache || !cache->variant_cache)
      return NULL;

   if (unlikely(dump_stats)) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, sha1_key);

      fprintf(stderr, "pipeline cache %p, search variant with key %s\n", cache, sha1buf);
   }

   pthread_mutex_lock(&cache->mutex);

   struct hash_entry *entry =
      _mesa_hash_table_search(cache->variant_cache, sha1_key);

   if (entry) {
      struct v3dv_shader_variant *variant =
         (struct v3dv_shader_variant *) entry->data;

      cache->variant_stats.hit++;
      if (unlikely(dump_stats)) {
         fprintf(stderr, "\tcache hit: %p\n", variant);
         cache_dump_stats(cache);
      }

      if (variant)
         v3dv_shader_variant_ref(variant);

      pthread_mutex_unlock(&cache->mutex);
      return variant;
   }

   cache->variant_stats.miss++;
   if (unlikely(dump_stats)) {
      fprintf(stderr, "\tcache miss\n");
      cache_dump_stats(cache);
   }

   pthread_mutex_unlock(&cache->mutex);
   return NULL;
}

void
v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
                                   struct v3dv_pipeline_cache *cache,
                                   struct v3dv_shader_variant *variant)
{
   if (!cache || !cache->variant_cache)
      return;

   if (cache->variant_stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES)
      return;

   pthread_mutex_lock(&cache->mutex);
   struct hash_entry *entry =
      _mesa_hash_table_search(cache->variant_cache, variant->variant_sha1);

   if (entry) {
      pthread_mutex_unlock(&cache->mutex);
      return;
   }

   v3dv_shader_variant_ref(variant);
   _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
   cache->variant_stats.count++;
   if (unlikely(dump_stats)) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, variant->variant_sha1);

      fprintf(stderr, "pipeline cache %p, new variant entry with key %s\n\t%p\n",
              cache, sha1buf, variant);
      cache_dump_stats(cache);
   }

   pthread_mutex_unlock(&cache->mutex);
}

static struct serialized_nir *
serialized_nir_create_from_blob(struct v3dv_pipeline_cache *cache,
                                struct blob_reader *blob)
{
   const unsigned char *sha1_key = blob_read_bytes(blob, 20);
   uint32_t snir_size = blob_read_uint32(blob);
   const char *snir_data = blob_read_bytes(blob, snir_size);
   if (blob->overrun)
      return NULL;

   struct serialized_nir *snir =
      ralloc_size(cache->nir_cache, sizeof(*snir) + snir_size);
   memcpy(snir->sha1_key, sha1_key, 20);
   snir->size = snir_size;
   memcpy(snir->data, snir_data, snir_size);

   return snir;
}
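/* Reads one shader variant back from a blob produced by
 * shader_variant_write_to_blob(). The prog_data (including its uniform
 * list) is re-allocated here because v3dv_shader_variant_create() takes
 * ownership of it, just like it does with the data returned by the v3d
 * compiler.
 */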
static struct v3dv_shader_variant *
shader_variant_create_from_blob(struct v3dv_device *device,
                                struct blob_reader *blob)
{
   VkResult result;

   gl_shader_stage stage = blob_read_uint32(blob);
   bool is_coord = blob_read_uint8(blob);

   uint32_t v3d_key_size = blob_read_uint32(blob);
   const struct v3d_key *v3d_key = blob_read_bytes(blob, v3d_key_size);

   const unsigned char *variant_sha1 = blob_read_bytes(blob, 20);

   uint32_t prog_data_size = blob_read_uint32(blob);
   /* FIXME: as we include the stage perhaps we can avoid prog_data_size? */
   assert(prog_data_size == v3d_prog_data_size(stage));

   const void *prog_data = blob_read_bytes(blob, prog_data_size);
   if (blob->overrun)
      return NULL;

   uint32_t ulist_count = blob_read_uint32(blob);
   uint32_t contents_size = sizeof(enum quniform_contents) * ulist_count;
   const void *contents_data = blob_read_bytes(blob, contents_size);
   if (blob->overrun)
      return NULL;

   uint32_t ulist_data_size = sizeof(uint32_t) * ulist_count;
   const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size);
   if (blob->overrun)
      return NULL;

   uint32_t qpu_insts_size = blob_read_uint32(blob);
   const uint64_t *qpu_insts = blob_read_bytes(blob, qpu_insts_size);
   if (blob->overrun)
      return NULL;

   /* shader_variant_create expects a newly created prog_data for its own
    * use, as that is what the v3d compiler returns. So we also allocate one
    * (including the uniform list) and fill it with the data that we just
    * read from the blob.
    */
   struct v3d_prog_data *new_prog_data = rzalloc_size(NULL, prog_data_size);
   memcpy(new_prog_data, prog_data, prog_data_size);
   struct v3d_uniform_list *ulist = &new_prog_data->uniforms;
   ulist->count = ulist_count;
   ulist->contents = ralloc_array(new_prog_data, enum quniform_contents, ulist->count);
   memcpy(ulist->contents, contents_data, contents_size);
   ulist->data = ralloc_array(new_prog_data, uint32_t, ulist->count);
   memcpy(ulist->data, ulist_data_data, ulist_data_size);

   return v3dv_shader_variant_create(device, stage, is_coord,
                                     variant_sha1,
                                     v3d_key, v3d_key_size,
                                     new_prog_data, prog_data_size,
                                     qpu_insts, qpu_insts_size,
                                     &result);
}
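/* Loads a cache blob previously produced by v3dv_GetPipelineCacheData().
 * The expected layout is:
 *
 *    vk_pipeline_cache_header
 *    uint32_t nir_count
 *    nir_count x { sha1_key[20], uint32_t size, serialized NIR }
 *    uint32_t count
 *    count x { shader variant, as written by shader_variant_write_to_blob() }
 *
 * Data with a mismatched header (size, version, vendor/device id or UUID)
 * is silently ignored.
 */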
static void
pipeline_cache_load(struct v3dv_pipeline_cache *cache,
                    size_t size,
                    const void *data)
{
   struct v3dv_device *device = cache->device;
   struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
   struct vk_pipeline_cache_header header;

   if (cache->variant_cache == NULL)
      return;

   if (size < sizeof(header))
      return;

   struct blob_reader blob;
   blob_reader_init(&blob, data, size);

   blob_copy_bytes(&blob, &header, sizeof(header));
   if (header.header_size < sizeof(header))
      return;
   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
      return;
   if (header.vendor_id != v3dv_physical_device_vendor_id(pdevice))
      return;
   if (header.device_id != v3dv_physical_device_device_id(pdevice))
      return;
   if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0)
      return;

   uint32_t nir_count = blob_read_uint32(&blob);
   if (blob.overrun)
      return;

   for (uint32_t i = 0; i < nir_count; i++) {
      struct serialized_nir *snir =
         serialized_nir_create_from_blob(cache, &blob);
      if (!snir)
         break;

      _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);
      cache->nir_stats.count++;
   }

   uint32_t count = blob_read_uint32(&blob);
   if (blob.overrun)
      return;

   for (uint32_t i = 0; i < count; i++) {
      struct v3dv_shader_variant *variant =
         shader_variant_create_from_blob(device, &blob);
      if (!variant)
         break;

      _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
      cache->variant_stats.count++;
   }

   if (unlikely(dump_stats)) {
      fprintf(stderr, "pipeline cache %p, loaded %i nir shaders and "
              "%i variant entries\n", cache, nir_count, count);
      cache_dump_stats(cache);
   }
}

VkResult
v3dv_CreatePipelineCache(VkDevice _device,
                         const VkPipelineCacheCreateInfo *pCreateInfo,
                         const VkAllocationCallbacks *pAllocator,
                         VkPipelineCache *pPipelineCache)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_pipeline_cache *cache;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
   assert(pCreateInfo->flags == 0);

   cache = vk_alloc2(&device->alloc, pAllocator,
                     sizeof(*cache), 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (cache == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   v3dv_pipeline_cache_init(cache, device,
                            device->instance->pipeline_cache_enabled);

   if (pCreateInfo->initialDataSize > 0) {
      pipeline_cache_load(cache,
                          pCreateInfo->initialDataSize,
                          pCreateInfo->pInitialData);
   }

   *pPipelineCache = v3dv_pipeline_cache_to_handle(cache);

   return VK_SUCCESS;
}

void
v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache)
{
   pthread_mutex_destroy(&cache->mutex);

   if (cache->nir_cache) {
      hash_table_foreach(cache->nir_cache, entry)
         ralloc_free(entry->data);

      _mesa_hash_table_destroy(cache->nir_cache, NULL);
   }

   if (cache->variant_cache) {
      hash_table_foreach(cache->variant_cache, entry) {
         struct v3dv_shader_variant *variant = entry->data;
         if (variant)
            v3dv_shader_variant_unref(cache->device, variant);
      }

      _mesa_hash_table_destroy(cache->variant_cache, NULL);
   }
}

void
v3dv_DestroyPipelineCache(VkDevice _device,
                          VkPipelineCache _cache,
                          const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

   if (!cache)
      return;

   v3dv_pipeline_cache_finish(cache);

   vk_free2(&device->alloc, pAllocator, cache);
}

VkResult
v3dv_MergePipelineCaches(VkDevice device,
                         VkPipelineCache dstCache,
                         uint32_t srcCacheCount,
                         const VkPipelineCache *pSrcCaches)
{
   V3DV_FROM_HANDLE(v3dv_pipeline_cache, dst, dstCache);

   if (!dst->variant_cache || !dst->nir_cache)
      return VK_SUCCESS;

   for (uint32_t i = 0; i < srcCacheCount; i++) {
      V3DV_FROM_HANDLE(v3dv_pipeline_cache, src, pSrcCaches[i]);
      if (!src->variant_cache || !src->nir_cache)
         continue;

      hash_table_foreach(src->nir_cache, entry) {
         struct serialized_nir *src_snir = entry->data;
         assert(src_snir);

         if (_mesa_hash_table_search(dst->nir_cache, src_snir->sha1_key))
            continue;

         /* FIXME: we are using serialized nir shaders because they are
          * convenient to create and store in the cache, but that requires
          * copying the serialized NIR here (and in some other places).
          * Perhaps it would make sense to handle the NIR shaders with
          * shared, reference-counted structures, like the variants.
          */
         struct serialized_nir *snir_dst =
            ralloc_size(dst->nir_cache, sizeof(*snir_dst) + src_snir->size);
         memcpy(snir_dst->sha1_key, src_snir->sha1_key, 20);
         snir_dst->size = src_snir->size;
         memcpy(snir_dst->data, src_snir->data, src_snir->size);

         _mesa_hash_table_insert(dst->nir_cache, snir_dst->sha1_key, snir_dst);
         dst->nir_stats.count++;
         if (unlikely(dump_stats)) {
            char sha1buf[41];
            _mesa_sha1_format(sha1buf, snir_dst->sha1_key);

            fprintf(stderr, "pipeline cache %p, added nir entry %s "
                    "from pipeline cache %p\n", dst, sha1buf, src);
            cache_dump_stats(dst);
         }
      }

      hash_table_foreach(src->variant_cache, entry) {
         struct v3dv_shader_variant *variant = entry->data;
         assert(variant);

         if (_mesa_hash_table_search(dst->variant_cache, variant->variant_sha1))
            continue;

         v3dv_shader_variant_ref(variant);
         _mesa_hash_table_insert(dst->variant_cache, variant->variant_sha1, variant);
         dst->variant_stats.count++;
         if (unlikely(dump_stats)) {
            char sha1buf[41];
            _mesa_sha1_format(sha1buf, variant->variant_sha1);

            fprintf(stderr, "pipeline cache %p, added variant entry %s "
                    "from pipeline cache %p\n", dst, sha1buf, src);
            cache_dump_stats(dst);
         }
      }
   }

   return VK_SUCCESS;
}
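/* Serializes a single shader variant. This must stay in sync with
 * shader_variant_create_from_blob(), which reads the fields back in the
 * same order.
 */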
static bool
shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
                             struct blob *blob)
{
   blob_write_uint32(blob, variant->stage);
   blob_write_uint8(blob, variant->is_coord);

   blob_write_uint32(blob, variant->v3d_key_size);
   blob_write_bytes(blob, &variant->key, variant->v3d_key_size);

   blob_write_bytes(blob, variant->variant_sha1, sizeof(variant->variant_sha1));

   blob_write_uint32(blob, variant->prog_data_size);
   blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size);

   struct v3d_uniform_list *ulist = &variant->prog_data.base->uniforms;
   blob_write_uint32(blob, ulist->count);
   blob_write_bytes(blob, ulist->contents,
                    sizeof(enum quniform_contents) * ulist->count);
   blob_write_bytes(blob, ulist->data, sizeof(uint32_t) * ulist->count);

   blob_write_uint32(blob, variant->qpu_insts_size);
   assert(variant->assembly_bo->map);
   blob_write_bytes(blob, variant->assembly_bo->map, variant->qpu_insts_size);

   return !blob->out_of_memory;
}

VkResult
v3dv_GetPipelineCacheData(VkDevice _device,
                          VkPipelineCache _cache,
                          size_t *pDataSize,
                          void *pData)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

   struct blob blob;
   if (pData) {
      blob_init_fixed(&blob, pData, *pDataSize);
   } else {
      blob_init_fixed(&blob, NULL, SIZE_MAX);
   }

   struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
   VkResult result = VK_SUCCESS;

   pthread_mutex_lock(&cache->mutex);

   struct vk_pipeline_cache_header header = {
      .header_size = sizeof(struct vk_pipeline_cache_header),
      .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
      .vendor_id = v3dv_physical_device_vendor_id(pdevice),
      .device_id = v3dv_physical_device_device_id(pdevice),
   };
   memcpy(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
   blob_write_bytes(&blob, &header, sizeof(header));

   uint32_t nir_count = 0;
   intptr_t nir_count_offset = blob_reserve_uint32(&blob);
   if (nir_count_offset < 0) {
      *pDataSize = 0;
      blob_finish(&blob);
      pthread_mutex_unlock(&cache->mutex);
      return VK_INCOMPLETE;
   }

   if (cache->nir_cache) {
      hash_table_foreach(cache->nir_cache, entry) {
         const struct serialized_nir *snir = entry->data;

         size_t save_size = blob.size;

         blob_write_bytes(&blob, snir->sha1_key, 20);
         blob_write_uint32(&blob, snir->size);
         blob_write_bytes(&blob, snir->data, snir->size);

         if (blob.out_of_memory) {
            blob.size = save_size;
            result = VK_INCOMPLETE;
            break;
         }

         nir_count++;
      }
   }
   blob_overwrite_uint32(&blob, nir_count_offset, nir_count);

   uint32_t count = 0;
   intptr_t count_offset = blob_reserve_uint32(&blob);
   if (count_offset < 0) {
      *pDataSize = 0;
      blob_finish(&blob);
      pthread_mutex_unlock(&cache->mutex);
      return VK_INCOMPLETE;
   }

   if (cache->variant_cache) {
      hash_table_foreach(cache->variant_cache, entry) {
         struct v3dv_shader_variant *variant = entry->data;

         size_t save_size = blob.size;
         if (!shader_variant_write_to_blob(variant, &blob)) {
            /* If it fails, reset to the previous size and bail */
            blob.size = save_size;
            result = VK_INCOMPLETE;
            break;
         }

         count++;
      }
   }
   blob_overwrite_uint32(&blob, count_offset, count);

   *pDataSize = blob.size;

   blob_finish(&blob);

   if (unlikely(dump_stats)) {
      assert(count <= cache->variant_stats.count);
      fprintf(stderr, "GetPipelineCacheData: serializing cache %p, "
              "%i nir shader entries "
              "%i variant entries, %u DataSize\n",
              cache, nir_count, count, (uint32_t) *pDataSize);
   }

   pthread_mutex_unlock(&cache->mutex);

   return result;
}