/*
 * Copyright © 2021 Collabora Ltd.
 *
 * Derived from tu_shader.c which is:
 * Copyright © 2019 Google LLC
 *
 * Also derived from anv_pipeline.c which is
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "genxml/gen_macros.h"

#include "panvk_private.h"

#include "nir_builder.h"
#include "nir_deref.h"
#include "nir_lower_blend.h"
#include "nir_conversion_builder.h"
#include "spirv/nir_spirv.h"
#include "util/mesa-sha1.h"
#include "vk_shader_module.h"

#include "pan_shader.h"
#include "util/pan_lower_framebuffer.h"

#include "vk_util.h"

static void
panvk_init_sysvals(struct panfrost_sysvals *sysvals,
                   gl_shader_stage stage)
{
   memset(sysvals, 0, sizeof(*sysvals));

#define SYSVAL_SLOT(name) \
   (assert(offsetof(struct panvk_sysvals, name) % 16 == 0), \
    offsetof(struct panvk_sysvals, name) / 16)

#define INIT_SYSVAL(name, SYSVAL) \
   sysvals->sysvals[SYSVAL_SLOT(name)] = PAN_SYSVAL_##SYSVAL

   if (gl_shader_stage_is_compute(stage)) {
      INIT_SYSVAL(num_work_groups, NUM_WORK_GROUPS);
      INIT_SYSVAL(local_group_size, LOCAL_GROUP_SIZE);
   } else {
      INIT_SYSVAL(viewport_scale, VIEWPORT_SCALE);
      INIT_SYSVAL(viewport_offset, VIEWPORT_OFFSET);
      INIT_SYSVAL(vertex_instance_offsets, VERTEX_INSTANCE_OFFSETS);
      INIT_SYSVAL(blend_constants, BLEND_CONSTANTS);
   }
   sysvals->sysval_count = SYSVAL_SLOT(dyn_ssbos);

#undef SYSVAL_SLOT
#undef INIT_SYSVAL
}

static bool
panvk_inline_blend_constants(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_blend_const_color_rgba)
      return false;

   const nir_const_value *constants = data;

   b->cursor = nir_after_instr(instr);
   nir_ssa_def *constant = nir_build_imm(b, 4, 32, constants);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, constant);
   nir_instr_remove(instr);
   return true;
}

static void
panvk_lower_blend(struct panfrost_device *pdev,
                  nir_shader *nir,
                  struct panfrost_compile_inputs *inputs,
                  struct pan_blend_state *blend_state,
                  bool static_blend_constants)
{
   nir_lower_blend_options options = {
      .logicop_enable = blend_state->logicop_enable,
      .logicop_func = blend_state->logicop_func,
   };

   bool lower_blend = false;

   for (unsigned rt = 0; rt < blend_state->rt_count; rt++) {
      struct pan_blend_rt_state *rt_state = &blend_state->rts[rt];

      if (!panvk_per_arch(blend_needs_lowering)(pdev, blend_state, rt))
         continue;

      enum pipe_format fmt = rt_state->format;

      options.format[rt] = fmt;
      options.rt[rt].colormask = rt_state->equation.color_mask;

      if (!rt_state->equation.blend_enable) {
         static const nir_lower_blend_channel replace = {
            .func = BLEND_FUNC_ADD,
            .src_factor = BLEND_FACTOR_ZERO,
            .invert_src_factor = true,
            .dst_factor = BLEND_FACTOR_ZERO,
            .invert_dst_factor = false,
         };

         options.rt[rt].rgb = replace;
         options.rt[rt].alpha = replace;
      } else {
         options.rt[rt].rgb.func = rt_state->equation.rgb_func;
         options.rt[rt].rgb.src_factor = rt_state->equation.rgb_src_factor;
         options.rt[rt].rgb.invert_src_factor = rt_state->equation.rgb_invert_src_factor;
         options.rt[rt].rgb.dst_factor = rt_state->equation.rgb_dst_factor;
         options.rt[rt].rgb.invert_dst_factor = rt_state->equation.rgb_invert_dst_factor;
         options.rt[rt].alpha.func = rt_state->equation.alpha_func;
         options.rt[rt].alpha.src_factor = rt_state->equation.alpha_src_factor;
         options.rt[rt].alpha.invert_src_factor = rt_state->equation.alpha_invert_src_factor;
         options.rt[rt].alpha.dst_factor = rt_state->equation.alpha_dst_factor;
         options.rt[rt].alpha.invert_dst_factor = rt_state->equation.alpha_invert_dst_factor;
      }

      /* Update the equation to force a color replacement */
      rt_state->equation.color_mask = 0xf;
      rt_state->equation.rgb_func = BLEND_FUNC_ADD;
      rt_state->equation.rgb_src_factor = BLEND_FACTOR_ZERO;
      rt_state->equation.rgb_invert_src_factor = true;
      rt_state->equation.rgb_dst_factor = BLEND_FACTOR_ZERO;
      rt_state->equation.rgb_invert_dst_factor = false;
      rt_state->equation.alpha_func = BLEND_FUNC_ADD;
      rt_state->equation.alpha_src_factor = BLEND_FACTOR_ZERO;
      rt_state->equation.alpha_invert_src_factor = true;
      rt_state->equation.alpha_dst_factor = BLEND_FACTOR_ZERO;
      rt_state->equation.alpha_invert_dst_factor = false;
      lower_blend = true;

      inputs->bifrost.static_rt_conv = true;
      inputs->bifrost.rt_conv[rt] =
         GENX(pan_blend_get_internal_desc)(pdev, fmt, rt, 32, false) >> 32;
   }

   if (lower_blend) {
      NIR_PASS_V(nir, nir_lower_blend, &options);

      if (static_blend_constants) {
         const nir_const_value constants[4] = {
            { .f32 = CLAMP(blend_state->constants[0], 0.0f, 1.0f) },
            { .f32 = CLAMP(blend_state->constants[1], 0.0f, 1.0f) },
            { .f32 = CLAMP(blend_state->constants[2], 0.0f, 1.0f) },
            { .f32 = CLAMP(blend_state->constants[3], 0.0f, 1.0f) },
         };
         NIR_PASS_V(nir, nir_shader_instructions_pass,
                    panvk_inline_blend_constants,
                    nir_metadata_block_index |
                    nir_metadata_dominance,
                    (void *)constants);
      }
   }
}

static bool
panvk_lower_load_push_constant(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_push_constant)
      return false;

   b->cursor = nir_before_instr(instr);
   nir_ssa_def *ubo_load =
      nir_load_ubo(b, nir_dest_num_components(intr->dest),
                   nir_dest_bit_size(intr->dest),
                   nir_imm_int(b, PANVK_PUSH_CONST_UBO_INDEX),
                   intr->src[0].ssa,
                   .align_mul = nir_dest_bit_size(intr->dest) / 8,
                   .align_offset = 0,
                   .range_base = nir_intrinsic_base(intr),
                   .range = nir_intrinsic_range(intr));
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, ubo_load);
   nir_instr_remove(instr);
   return true;
}

static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length,
   *align = comp_size * (length == 3 ? 4 : length);
}

struct panvk_shader *
panvk_per_arch(shader_create)(struct panvk_device *dev,
                              gl_shader_stage stage,
                              const VkPipelineShaderStageCreateInfo *stage_info,
                              const struct panvk_pipeline_layout *layout,
                              unsigned sysval_ubo,
                              struct pan_blend_state *blend_state,
                              bool static_blend_constants,
                              const VkAllocationCallbacks *alloc)
{
   VK_FROM_HANDLE(vk_shader_module, module, stage_info->module);
   struct panfrost_device *pdev = &dev->physical_device->pdev;
   struct panvk_shader *shader;

   shader = vk_zalloc2(&dev->vk.alloc, alloc, sizeof(*shader), 8,
                       VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!shader)
      return NULL;

   util_dynarray_init(&shader->binary, NULL);

   /* TODO these are made-up */
   const struct spirv_to_nir_options spirv_options = {
      .caps = {
         .variable_pointers = true,
      },
      .ubo_addr_format = nir_address_format_32bit_index_offset,
      .ssbo_addr_format = dev->vk.enabled_features.robustBufferAccess ?
                          nir_address_format_64bit_bounded_global :
                          nir_address_format_64bit_global_32bit_offset,
   };

   nir_shader *nir;
   VkResult result = vk_shader_module_to_nir(&dev->vk, module, stage,
                                             stage_info->pName,
                                             stage_info->pSpecializationInfo,
                                             &spirv_options,
                                             GENX(pan_shader_get_compiler_options)(),
                                             NULL, &nir);
   if (result != VK_SUCCESS) {
      vk_free2(&dev->vk.alloc, alloc, shader);
      return NULL;
   }

   NIR_PASS_V(nir, nir_lower_io_to_temporaries,
              nir_shader_get_entrypoint(nir), true, true);

   struct panfrost_sysvals fixed_sysvals;
   panvk_init_sysvals(&fixed_sysvals, stage);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .no_ubo_to_push = true,
      .no_idvs = true, /* TODO */
      .fixed_sysval_ubo = sysval_ubo,
      .fixed_sysval_layout = &fixed_sysvals,
   };

   NIR_PASS_V(nir, nir_lower_indirect_derefs,
              nir_var_shader_in | nir_var_shader_out,
              UINT32_MAX);

   NIR_PASS_V(nir, nir_opt_copy_prop_vars);
   NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
   NIR_PASS_V(nir, nir_opt_trivial_continues);

   /* Do texture lowering here.  Yes, it's a duplication of the texture
    * lowering in bifrost_compile.  However, we need to lower texture stuff
    * now, before we call panvk_per_arch(nir_lower_descriptors)() because some
    * of the texture lowering generates nir_texop_txs which we handle as part
    * of descriptor lowering.
    *
    * TODO: We really should be doing this in common code, not dpulicated in
    * panvk.  In order to do that, we need to rework the panfrost compile
    * flow to look more like the Intel flow:
    *
    *  1. Compile SPIR-V to NIR and maybe do a tiny bit of lowering that needs
    *     to be done really early.
    *
    *  2. bi_preprocess_nir: Does common lowering and runs the optimization
    *     loop.  Nothing here should be API-specific.
    *
    *  3. Do additional lowering in panvk
    *
    *  4. bi_postprocess_nir: Does final lowering and runs the optimization
    *     loop again.  This can happen as part of the final compile.
    *
    * This would give us a better place to do panvk-specific lowering.
    */
   nir_lower_tex_options lower_tex_options = {
      .lower_txs_lod = true,
      .lower_txp = ~0,
      .lower_tg4_broadcom_swizzle = true,
      .lower_txd = true,
      .lower_invalid_implicit_lod = true,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   NIR_PASS_V(nir, panvk_per_arch(nir_lower_descriptors),
              dev, layout, &shader->has_img_access);

   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
              nir_address_format_32bit_index_offset);
   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
              spirv_options.ssbo_addr_format);
   NIR_PASS_V(nir, nir_lower_explicit_io,
              nir_var_mem_push_const,
              nir_address_format_32bit_offset);

   if (gl_shader_stage_uses_workgroup(stage)) {
      if (!nir->info.shared_memory_explicit_layout) {
         NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
                    nir_var_mem_shared,
                    shared_type_info);
      }

      NIR_PASS_V(nir, nir_lower_explicit_io,
                 nir_var_mem_shared,
                 nir_address_format_32bit_offset);
   }

   NIR_PASS_V(nir, nir_shader_instructions_pass,
              panvk_lower_load_push_constant,
              nir_metadata_block_index |
              nir_metadata_dominance,
              (void *)layout);

   NIR_PASS_V(nir, nir_lower_system_values);
   NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_lower_var_copies);

   /* We have to run nir_lower_blend() after we've gotten rid of copies (it
    * requires load/store) and before we assign output locations.
    */
   if (stage == MESA_SHADER_FRAGMENT) {
      /* This is required for nir_lower_blend */
      NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, true);
      panvk_lower_blend(pdev, nir, &inputs, blend_state, static_blend_constants);
   }

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, stage);

   /* Needed to turn shader_temp into function_temp since the backend only
    * handles the latter for now.
    */
   NIR_PASS_V(nir, nir_lower_global_vars_to_local);

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
   if (unlikely(dev->physical_device->instance->debug_flags & PANVK_DEBUG_NIR)) {
      fprintf(stderr, "translated nir:\n");
      nir_print_shader(nir, stderr);
   }

   GENX(pan_shader_compile)(nir, &inputs, &shader->binary, &shader->info);

   /* System values shouldn't have changed */
   assert(memcmp(&shader->info.sysvals, &fixed_sysvals,
                 sizeof(fixed_sysvals)) == 0);

   /* Patch the descriptor count */
   shader->info.ubo_count = PANVK_NUM_BUILTIN_UBOS +
                            layout->num_ubos + layout->num_dyn_ubos;
   shader->info.sampler_count = layout->num_samplers;
   shader->info.texture_count = layout->num_textures;
   if (shader->has_img_access)
      shader->info.attribute_count += layout->num_imgs;

   shader->sysval_ubo = sysval_ubo;
   shader->local_size.x = nir->info.workgroup_size[0];
   shader->local_size.y = nir->info.workgroup_size[1];
   shader->local_size.z = nir->info.workgroup_size[2];

   ralloc_free(nir);

   return shader;
}