/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <algorithm>
#include <cfloat>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "app.h"
#include "stats.h"
#include "utils.h"

namespace gpuinfo {

// Textures are drastically different from buffers in terms of data layout.
// While buffers are a contiguous range of memory, textures are opaque objects
// defined by the vendor, and nearby data points are not necessarily
// neighboring in memory. Likewise, data points are accessed in
// multi-dimensional patches instead of simple lines. This makes the stride
// method for figuring out the cache line size inapplicable. To work around
// this, this experiment runs an increasing number of threads accessing
// different data points in the texture and measures latency. If the cache
// line is big enough to contain all requested data for that number of
// threads, latency stays low. Once there are more threads, and hence more
// data, than a single cache line can serve, a second line must be fetched,
// increasing latency in a measurable way.
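//
// For example (illustrative numbers only, not measured values): with 16 B
// texels (rgba, float), if latency stays flat up to 16 concurrent threads and
// jumps at 17, a single cache line is assumed to serve
// 16 threads * 16 B = 256 B of texture data.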
void tex_cacheline_concurr(const App& app) {
  if (!app.enabled("tex_cacheline_concurr")) {
    std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
    return;
  }

  const uint32_t TEXEL_WIDTH = 4;
  const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;

  const double COMPENSATE =
      app.get_config("tex_cacheline_concurr", "compensate");
  const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold");

  for (int dim = 0; dim < 3; ++dim) {
    std::cout << std::endl;
    std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
              << ") ------" << std::endl;

    uint32_t NITER;

    const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width
                                  : dim == 1 ? app.max_tex_height
                                             : app.max_tex_depth;

    const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE);

    auto bench = [&](uint32_t nthread) {
      std::vector<int64_t> sizes_whd = {
          app.max_tex_width, app.max_tex_height, app.max_tex_depth};
      auto sizes_nchw = whd_to_nchw(sizes_whd);
      vTensor in_tensor =
          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

      StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);

      vkapi::PipelineBarrier pipeline_barrier{};

      auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);

      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {nthread, 1, 1},
            {nthread, 1, 1},
            {SV(NITER)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });
      return time;
    };

    ensure_min_niter(1000, NITER, [&]() { return bench(1); });

    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
    uint32_t nthread = 1;
    for (; nthread <= MAX_NTHREAD; ++nthread) {
      double time = bench(nthread);
      std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
                << std::endl;

      if (dj.push(time)) {
        auto max_concurrency = nthread - 1;
        std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
                  << max_concurrency * TEXEL_SIZE << std::endl;
        break;
      }
    }
    if (nthread >= MAX_NTHREAD) {
      std::cout << "Unable to conclude an optimal texture cacheline "
                   "concurrency for dim "
                << dim << std::endl;
    }
  }

  // TODO: Use concurrency information to obtain the cache line size for
  // textures as done in https://fburl.com/98xiou3g
}

void tex_bandwidth(const App& app) {
  if (!app.enabled("tex_bandwidth")) {
    std::cout << "Skipped Texture Bandwidth" << std::endl;
    return;
  }

  for (int dim = 0; dim < 3; dim++) {
    std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
              << std::endl;
    const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width
                            : dim == 1 ? app.max_tex_height
                                       : app.max_tex_depth;

    // rgba, float
    const uint32_t VEC_WIDTH = 4;
    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
    const uint32_t NVEC = MAX_SIZE;

    const uint32_t RANGE = NVEC * VEC_SIZE;

    // Cache lines flushed
    const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush");

    // Number of loop unrolls. Changing this value requires an equal change in
    // tex_bandwidth.yaml.
    const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll");

    // Number of iterations. Increasing this value reduces noise in exchange
    // for higher latency.
    const uint32_t NITER = app.get_config("tex_bandwidth", "niter");

    // Number of memory reads per thread
    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;

    // Number of threads needed to read all texels
    const uint32_t NTHREAD = NVEC;

    // Occupy all threads
    const uint32_t local_x = app.nthread_logic;

    // Ensure that global is a multiple of local, and distribute across all
    // SMs.
    const uint32_t global_x =
        (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;

    auto shader_name = "tex_bandwidth_" + std::to_string(dim);

    std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
    if (dim == 1) {
      sizes_whd = {1, MAX_SIZE, 1};
    } else if (dim == 2) {
      sizes_whd = {1, 1, MAX_SIZE};
    }
    auto sizes_nchw = whd_to_nchw(sizes_whd);
    vTensor in_tensor =
        api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

    auto bench = [&](uint32_t access_size, uint32_t dim) {
      // Number of texels that fit in this iteration
      const uint32_t ntexel_access = access_size / VEC_SIZE;

      // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
      // This lets us limit accesses to a specific set of unique addresses,
      // depending on the access size we want to measure.
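      // For example (illustrative): with access_size = 128 B and 16 B texels,
      // ntexel_access = 8 and addr_mask = 0b111; wrapping a texel index i to
      // (i & addr_mask) in the shader confines all reads to 8 unique texels,
      // i.e. 128 B of unique data.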
      const uint32_t addr_mask = ntexel_access - 1;

      // This is to distribute the accesses to unique addresses across the
      // workgroups, once the size of the access exceeds the workgroup width.
      const uint32_t workgroup_width = local_x * NITER * NUNROLL;

      StagingBuffer out_buf(
          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
      vkapi::PipelineBarrier pipeline_barrier{};

      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {global_x, 1, 1},
            {local_x, 1, 1},
            {SV(NITER),
             SV(ntexel_access),
             SV(local_x),
             SV(addr_mask),
             SV(workgroup_width)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });

      const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
      double gbps = SIZE_TRANS * 1e-3 / time;
      std::cout << "Texture bandwidth accessing \t" << access_size
                << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
                << "\tus)" << std::endl;
      return gbps;
    };

    double max_bandwidth = 0;
    double min_bandwidth = DBL_MAX;
    for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
         access_size *= 2) {
      double gbps = bench(access_size, dim);
      max_bandwidth = std::max(gbps, max_bandwidth);
      min_bandwidth = std::min(gbps, min_bandwidth);
    }

    std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
              << std::endl;
    std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
              << std::endl;
  }
}

} // namespace gpuinfo
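
// A minimal usage sketch (hypothetical; assumes an already-configured App
// instance `app`, which the tool's driver normally provides):
//
//   gpuinfo::tex_cacheline_concurr(app); // per-dim optimal concurrency (B)
//   gpuinfo::tex_bandwidth(app);         // per-dim min/max bandwidth (GB/s)
//
// The reported bandwidth follows directly from the code above: SIZE_TRANS
// bytes moved in `time` microseconds gives SIZE_TRANS * 1e-3 / time GB/s.
// For example (illustrative numbers), 67,108,864 B read in 5,000 us is about
// 13.4 GB/s.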