/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

/* This file benchmarks buffer clears and copies done with CP DMA, SDMA, and
 * compute shaders, and prints the fastest method found for each size.
 */

#include "si_pipe.h"
#include "si_query.h"

/* Smallest and largest buffer sizes that are benchmarked, in bytes. */
#define MIN_SIZE   512
#define MAX_SIZE   (128 * 1024 * 1024)
/* Each benchmarked size is the previous size shifted left by this amount. */
#define SIZE_SHIFT 1
/* Number of clear/copy operations timed by a single query per measurement. */
#define NUM_RUNS   128

/**
 * Convert an amount of data and a duration into a transfer rate.
 *
 * \param num_bytes  number of bytes transferred
 * \param ns         time the transfer took, in nanoseconds
 * \return the rate in MB/s (1 MB = 1024 * 1024 bytes), or 0 if \p ns is 0
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
   /* A zero duration would divide by zero and yield a non-finite rate,
    * which callers cannot safely store in an integer score.
    */
   if (ns == 0)
      return 0.0;

   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}

/**
 * Benchmark buffer clears and copies and print the results to stdout.
 *
 * A context is created on \p sscreen and every method (CP DMA with each cache
 * policy, SDMA, and compute shaders with varying dwords per thread, cache
 * policy and waves-per-SH limit) is timed for every placement and every size
 * from MIN_SIZE to MAX_SIZE. The rates are printed as a comma-separated
 * table, followed by generated C source for get_best_clear_for_<name>() and
 * get_best_copy_for_<name>() that returns the best measured method for each
 * size range.
 *
 * This function does not return: it terminates the process with exit().
 *
 * \param sscreen  screen on which the benchmark context is created
 */
void si_test_dma_perf(struct si_screen *sscreen)
{
   struct pipe_screen *screen = &sscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   if (!ctx) {
      /* Nothing can be measured without a context. */
      fprintf(stderr, "si_test_dma_perf: failed to create a context\n");
      exit(1);
   }
   struct si_context *sctx = (struct si_context *)ctx;
   const uint32_t clear_value = 0x12345678;
   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
   static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};

   /* Method numbering: 0..2 = CP DMA (one per cache policy), 3 = SDMA,
    * 4.. = compute, ordered by waves_per_sh, then cache policy, then
    * dwords per thread.
    */
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

   static const char *method_str[] = {
      "CP MC   ",
      "CP L2   ",
      "CP L2   ",
      "SDMA    ",
   };
   static const char *placement_str[] = {
      /* Clear */
      "fill->VRAM",
      "fill->GTT ",
      /* Copy */
      "VRAM->VRAM",
      "VRAM->GTT ",
      "GTT ->VRAM",
   };

   /* Print the table header: one column per benchmarked size. */
   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
   printf("Heap       ,Method  ,L2p,Wa,");
   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
      if (size >= 1024)
         printf("%6uKB,", size / 1024);
      else
         printf(" %6uB,", size);
   }
   printf("\n");

   /* results[log2(size)][placement][method][] */
   struct si_result {
      bool is_valid;
      bool is_cp;
      bool is_sdma;
      bool is_cs;
      unsigned cache_policy;
      unsigned dwords_per_thread;
      unsigned waves_per_sh;
      unsigned score;
      unsigned index; /* index in results[x][y][index] */
   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

   /* Run benchmarks. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      /* The first two placements are clears, the rest are copies. */
      bool is_copy = placement >= 2;

      printf("-----------,--------,---,--,");
      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
         printf("--------,");
      printf("\n");

      for (unsigned method = 0; method < NUM_METHODS; method++) {
         /* Decode the method index into its parameters. */
         bool test_cp = method <= 2;
         bool test_sdma = method == 3;
         bool test_cs = method >= 4;
         unsigned cs_method = method - 4;
         unsigned cs_waves_per_sh =
            test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
         cs_method %= 3 * NUM_SHADERS;
         unsigned cache_policy =
            test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
         unsigned cs_dwords_per_thread =
            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

         if (test_sdma && !sctx->sdma_cs)
            continue;

         if (sctx->chip_class == GFX6) {
            /* GFX6 doesn't support CP DMA operations through L2. */
            if (test_cp && cache_policy != L2_BYPASS)
               continue;
            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
            if (test_cs && cs_waves_per_sh % 16 != 0)
               continue;
         }

         /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
          * chips before gfx9.
          */
         if (test_cs && cache_policy && sctx->chip_class < GFX9)
            continue;

         /* Print the row label: placement, method, cache policy, wave limit. */
         printf("%s ,", placement_str[placement]);
         if (test_cs) {
            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
                   cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
         } else {
            printf("%s,%3s,", method_str[method],
                   method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
         }
         if (test_cs && cs_waves_per_sh)
            printf("%2u,", cs_waves_per_sh);
         else
            printf("  ,");

         void *compute_shader = NULL;
         if (test_cs) {
            compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
                                              cache_policy == L2_STREAM, is_copy);
         }

         /* Rate measured for the previous size of this method. */
         double score = 0;
         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Don't test bigger sizes if it's too slow. Print 0. */
            if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
               printf("%7.0f ,", 0.0);
               continue;
            }

            enum pipe_resource_usage dst_usage, src_usage;
            struct pipe_resource *dst, *src;
            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
            unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;

            if (test_sdma) {
               if (sctx->chip_class == GFX6)
                  query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
               else
                  query_type = SI_QUERY_TIME_ELAPSED_SDMA;
            }

            /* Placements 0, 2 and 4 have a VRAM destination; 1 and 3 use GTT. */
            if (placement == 0 || placement == 2 || placement == 4)
               dst_usage = PIPE_USAGE_DEFAULT;
            else
               dst_usage = PIPE_USAGE_STREAM;

            /* Placements 2 and 3 have a VRAM source. */
            if (placement == 2 || placement == 3)
               src_usage = PIPE_USAGE_DEFAULT;
            else
               src_usage = PIPE_USAGE_STREAM;

            dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
            src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;

            /* Wait for idle before testing, so that other processes don't mess up the results. */
            sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                           SI_CONTEXT_FLUSH_AND_INV_CB |
                           SI_CONTEXT_FLUSH_AND_INV_DB;
            sctx->emit_cache_flush(sctx);

            /* A single time-elapsed query covers all NUM_RUNS operations. */
            struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
            ctx->begin_query(ctx, q);

            /* Run tests. */
            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               if (test_cp) {
                  /* CP DMA */
                  if (is_copy) {
                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
                                           cache_policy);
                  } else {
                     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
                                            SI_COHERENCY_NONE, cache_policy);
                  }
               } else if (test_sdma) {
                  /* SDMA */
                  if (is_copy) {
                     si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
                  } else {
                     si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
                  }
               } else {
                  /* Compute */
                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
                   * writes the 2nd contiguous block of data, etc.
                   */
                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;

                  unsigned num_dwords = size / 4;
                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                  struct pipe_grid_info info = {};
                  info.block[0] = MIN2(64, num_instructions);
                  info.block[1] = 1;
                  info.block[2] = 1;
                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                  info.grid[1] = 1;
                  info.grid[2] = 1;

                  struct pipe_shader_buffer sb[2] = {};
                  sb[0].buffer = dst;
                  sb[0].buffer_size = size;

                  if (is_copy) {
                     sb[1].buffer = src;
                     sb[1].buffer_size = size;
                  } else {
                     /* Clears pass the clear value through the compute user data. */
                     for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value;
                  }

                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
                  ctx->bind_compute_state(ctx, compute_shader);
                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                  ctx->launch_grid(ctx, &info);

                  ctx->bind_compute_state(ctx, NULL);
                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */
               }

               /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
               if (!test_sdma) {
                  sctx->flags |= SI_CONTEXT_INV_VCACHE |
                                 (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
                                 SI_CONTEXT_CS_PARTIAL_FLUSH;
                  sctx->emit_cache_flush(sctx);
               }
            }

            ctx->end_query(ctx, q);
            ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);

            pipe_resource_reference(&dst, NULL);
            pipe_resource_reference(&src, NULL);

            /* Get results. */

            union pipe_query_result result;

            ctx->get_query_result(ctx, q, true, &result);
            ctx->destroy_query(ctx, q);

            /* The query timed NUM_RUNS operations; rate is for a single one. */
            score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
            printf("%7.0f ,", score);
            fflush(stdout);

            struct si_result *r = &results[util_logbase2(size)][placement][method];
            r->is_valid = true;
            r->is_cp = test_cp;
            r->is_sdma = test_sdma;
            r->is_cs = test_cs;
            r->cache_policy = cache_policy;
            r->dwords_per_thread = cs_dwords_per_thread;
            r->waves_per_sh = cs_waves_per_sh;
            r->score = score;
            r->index = method;
         }
         puts("");

         if (compute_shader)
            ctx->delete_compute_state(ctx, compute_shader);
      }
   }

   /* Emit the generated source, starting with the clear function. */
   puts("");
   puts("static struct si_method");
   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
          "cached)\n",
          sctx->screen->info.name);
   puts("{");
   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");

   /* Analyze results and find the best methods. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      if (placement == 0)
         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
      else if (placement == 1)
         puts("   } else { /* GTT */");
      else if (placement == 2) {
         /* Close the GTT branch and the clear function, then start the
          * copy function.
          */
         puts("   }");
         puts("}");
         puts("");
         puts("static struct si_method");
         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                sctx->screen->info.name);
         printf("                     uint64_t size64, bool async, bool cached)\n");
         puts("{");
         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
      } else if (placement == 3)
         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
      else
         puts("   } else { /* GTT -> VRAM */");

      /* Modes: 0 = async, 1 = gfx ring cached, 2 = gfx ring uncached. */
      for (unsigned mode = 0; mode < 3; mode++) {
         bool async = mode == 0;
         bool cached = mode == 1;

         /* The three modes form one if / else if / else chain, so each
          * later mode must close the previous mode's block.
          */
         if (async)
            puts("      if (async) { /* SDMA or async compute */");
         else if (cached)
            puts("      } else if (cached) { /* gfx ring */");
         else
            puts("      } else { /* gfx ring - uncached */");

         /* The list of best chosen methods. */
         struct si_result *methods[32];
         unsigned method_max_size[32];
         unsigned num_methods = 0;

         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Find the best method. */
            struct si_result *best = NULL;

            for (unsigned i = 0; i < NUM_METHODS; i++) {
               struct si_result *r = &results[util_logbase2(size)][placement][i];

               if (!r->is_valid)
                  continue;

               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                * on GTT, which we can get due to BO evictions.
                */
               if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
                   r->cache_policy == L2_BYPASS)
                  continue;

               if (async) {
                  /* The following constraints for compute IBs try to limit
                   * resource usage so as not to decrease the performance
                   * of gfx IBs too much.
                   */

                  /* Don't use CP DMA on asynchronous rings, because
                   * the engine is shared with gfx IBs.
                   */
                  if (r->is_cp)
                     continue;

                  /* Don't use L2 caching on asynchronous rings to minimize
                   * L2 usage.
                   */
                  if (r->cache_policy == L2_LRU)
                     continue;

                  /* Asynchronous compute recommends waves_per_sh != 0
                   * to limit CU usage. */
                  if (r->is_cs && r->waves_per_sh == 0)
                     continue;
               } else {
                  /* SDMA is always asynchronous */
                  if (r->is_sdma)
                     continue;

                  if (cached && r->cache_policy == L2_BYPASS)
                     continue;
                  if (!cached && r->cache_policy == L2_LRU)
                     continue;
               }

               if (!best) {
                  best = r;
                  continue;
               }

               /* Assume some measurement error. Earlier methods occupy fewer
                * resources, so the next method is always more greedy, and we
                * don't want to select it due to a measurement error.
                */
               double min_improvement = 1.03;

               if (best->score * min_improvement < r->score)
                  best = r;
            }

            if (num_methods > 0) {
               unsigned prev_index = num_methods - 1;
               struct si_result *prev = methods[prev_index];
               struct si_result *prev_this_size =
                  &results[util_logbase2(size)][placement][prev->index];

               /* If the best one is also the best for the previous size,
                * just bump the size for the previous one.
                *
                * If there is no best, it means all methods were too slow
                * for this size and were not tested. Use the best one for
                * the previous size.
                */
               if (!best ||
                   /* If it's the same method as for the previous size: */
                   (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
                    prev->dwords_per_thread == best->dwords_per_thread &&
                    prev->waves_per_sh == best->waves_per_sh) ||
                   /* If the method for the previous size is also the best
                    * for this size: */
                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
                  method_max_size[prev_index] = size;
                  continue;
               }
            }

            /* Add it to the list. */
            assert(num_methods < ARRAY_SIZE(methods));
            methods[num_methods] = best;
            method_max_size[num_methods] = size;
            num_methods++;
         }

         /* Print one "return" statement per chosen method. */
         for (unsigned i = 0; i < num_methods; i++) {
            struct si_result *best = methods[i];
            unsigned size = method_max_size[i];

            /* The size threshold is between the current benchmarked
             * size and the next benchmarked size. */
            if (i < num_methods - 1)
               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
            else if (i > 0)
               printf("         else                   ");
            else
               printf("         ");
            printf("return ");

            assert(best);
            const char *cache_policy_str =
               best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
               best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";

            if (best->is_cp) {
               printf("CP_DMA(%s);\n", cache_policy_str);
            }
            if (best->is_sdma)
               printf("SDMA;\n");
            if (best->is_cs) {
               printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
                      best->dwords_per_thread, best->waves_per_sh);
            }
         }
      }
      /* Close the last mode's block of this placement. */
      puts("      }");
   }
   /* Close the last placement branch and the copy function. */
   puts("   }");
   puts("}");

   ctx->destroy(ctx);
   exit(0);
}