// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef GEMMLOWP_META_SINGLE_THREAD_GEMM_H_
#define GEMMLOWP_META_SINGLE_THREAD_GEMM_H_

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>

#include "base.h"

namespace gemmlowp {
namespace meta {

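// Top-level entry point, defined at the bottom of this file. It resolves the
// runtime leftover sizes (params.m % kernel_m, etc.) into compile-time
// template arguments and invokes Executor::ExecuteDispatch3D with them.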
template <typename Executor, typename Params, int kernel_m, int kernel_n,
          int kernel_k>
void Gemm(const Params& params);

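// Executor that packs the whole RHS into scratch memory up front and then
// streams the LHS through it one kernel_m-row chunk at a time, writing the
// corresponding strip of the result after each chunk.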
class GemmExecutorPackRHS {
 public:
  template <typename P>
  static int EstimateScratchSize(const P& params, int kernel_m, int kernel_n,
                                 int kernel_k) {
    const int lhs_scratch =
        StreamUtil<typename P::InType, typename P::LeftStream>::Scratch(
            params.left_stream, kernel_m, kernel_k);
    const int rhs_chunks = ((params.n + kernel_n - 1) / kernel_n);
    const int rhs_scratch =
        rhs_chunks *
        StreamUtil<typename P::InType, typename P::RightStream>::Scratch(
            params.right_stream, kernel_n, kernel_k);
    return AlignTo<64 * 1024>(lhs_scratch + rhs_scratch);
  }

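  // Runs the GEMM for one fully specialized shape. The F/L suffixes in the
  // typedefs below stand for Full and Leftover tiles: e.g. KernelFL
  // multiplies a full-height LHS tile by the leftover (n % kernel_n wide)
  // RHS tile.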
  template <typename P, int m, int n, int k, int m_leftovers, int n_leftovers,
            int k_leftovers>
  static void ExecuteDispatch3D(const P& params) {
    // Shorthand typedefs for streams and multiply kernels.
    typedef typename P::InType InType;
    typedef typename P::OutType OutType;

    typedef Stream<typename P::InType, m, k, k_leftovers,
                   typename P::LeftStream>
        LeftStreamF;
    typedef Stream<typename P::InType, m_leftovers, k, k_leftovers,
                   typename P::LeftStream>
        LeftStreamL;

    typedef Stream<typename P::InType, n, k, k_leftovers,
                   typename P::RightStream>
        RightStreamF;
    typedef Stream<typename P::InType, n_leftovers, k, k_leftovers,
                   typename P::RightStream>
        RightStreamL;

    typedef Stream<typename P::OutType, m, n, 0, typename P::OutputStream>
        OutputStreamFF;
    typedef Stream<typename P::OutType, m_leftovers, n, 0,
                   typename P::OutputStream>
        OutputStreamLF;

    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m, n, k>
        KernelFF;
    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m,
                      n_leftovers, k>
        KernelFL;
    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m_leftovers,
                      n, k>
        KernelLF;
    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m_leftovers,
                      n_leftovers, k>
        KernelLL;

#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "GemmExecutor(" << typeid(P).name() << "): " << m << "x" << n
              << "x" << k << " -- " << m_leftovers << "x" << n_leftovers << "x"
              << k_leftovers << " -- " << params.m << "x" << params.n << "x"
              << params.k << std::endl;
    LeftStreamF::Debug(params.left_stream);
    LeftStreamL::Debug(params.left_stream);

    RightStreamF::Debug(params.right_stream);
    RightStreamL::Debug(params.right_stream);

    OutputStreamFF::Debug(params.fused_kernel.output_stream);
    OutputStreamLF::Debug(params.fused_kernel.output_stream);

    KernelFF::Debug(params.fused_kernel);
    KernelFL::Debug(params.fused_kernel);
    KernelLF::Debug(params.fused_kernel);
    KernelLL::Debug(params.fused_kernel);
#endif
#endif

    int lhs_chunks = params.m / m;
    int rhs_chunks = params.n / n;

    // Scratch memory for packed LHS & RHS chunks.

    std::uint8_t* packed_lhs = params.scratch;
    std::uint8_t* packed_rhs =
        params.scratch + LeftStreamF::Scratch(params.left_stream);

    // Pack full RHS first.

    std::uint8_t* packed_rhs_chunk = packed_rhs;
    const int packed_rhs_chunk_size =
        RightStreamF::PackedStride(params.right_stream);

    {
      const std::uint8_t* rhs_chunk =
          reinterpret_cast<const std::uint8_t*>(params.rhs);
      const int rhs_chunk_size =
          RightStreamF::UnpackedStride(params.right_stream);

      for (int i = 0; i < rhs_chunks; ++i) {
        RightStreamF::Pack(reinterpret_cast<const InType*>(rhs_chunk),
                           params.right_stream,
                           reinterpret_cast<InType*>(packed_rhs_chunk));

        rhs_chunk += rhs_chunk_size;
        packed_rhs_chunk += packed_rhs_chunk_size;
      }

      RightStreamL::Pack(reinterpret_cast<const InType*>(rhs_chunk),
                         params.right_stream,
                         reinterpret_cast<InType*>(packed_rhs_chunk));
    }

    // Multiply RHS by LHS one LHS chunk at a time.

    const std::uint8_t* lhs_chunk =
        reinterpret_cast<const std::uint8_t*>(params.lhs);
    std::uint8_t* result_strip = reinterpret_cast<std::uint8_t*>(params.result);
    std::uint8_t* result_chunk = result_strip;

    {
      const int lhs_chunk_size =
          LeftStreamF::UnpackedStride(params.left_stream);
      const int result_strip_size =
          OutputStreamFF::UnpackedStride(params.fused_kernel.output_stream);
      const int result_chunk_size =
          OutputStreamFF::UnpackedAdvance(params.fused_kernel.output_stream);

      for (int i = 0; i < lhs_chunks; ++i) {
        LeftStreamF::Pack(reinterpret_cast<const InType*>(lhs_chunk),
                          params.left_stream,
                          reinterpret_cast<InType*>(packed_lhs));

        result_chunk = result_strip;
        packed_rhs_chunk = packed_rhs;

        for (int j = 0; j < rhs_chunks; ++j) {
          KernelFF::Multiply(reinterpret_cast<const InType*>(packed_lhs),
                             reinterpret_cast<const InType*>(packed_rhs_chunk),
                             params.fused_kernel,
                             reinterpret_cast<OutType*>(result_chunk));

          result_chunk += result_chunk_size;
          packed_rhs_chunk += packed_rhs_chunk_size;
        }

        KernelFL::Multiply(reinterpret_cast<const InType*>(packed_lhs),
                           reinterpret_cast<const InType*>(packed_rhs_chunk),
                           params.fused_kernel,
                           reinterpret_cast<OutType*>(result_chunk));

        lhs_chunk += lhs_chunk_size;
        result_strip += result_strip_size;
      }
    }

    // Leftover LHS chunk.
    if (m_leftovers > 0) {  // static if
      const int result_chunk_size =
          OutputStreamLF::UnpackedAdvance(params.fused_kernel.output_stream);

      LeftStreamL::Pack(reinterpret_cast<const InType*>(lhs_chunk),
                        params.left_stream,
                        reinterpret_cast<InType*>(packed_lhs));

      result_chunk = result_strip;
      packed_rhs_chunk = packed_rhs;

      for (int i = 0; i < rhs_chunks; ++i) {
        KernelLF::Multiply(reinterpret_cast<const InType*>(packed_lhs),
                           reinterpret_cast<const InType*>(packed_rhs_chunk),
                           params.fused_kernel,
                           reinterpret_cast<OutType*>(result_chunk));

        result_chunk += result_chunk_size;
        packed_rhs_chunk += packed_rhs_chunk_size;
      }

      KernelLL::Multiply(reinterpret_cast<const InType*>(packed_lhs),
                         reinterpret_cast<const InType*>(packed_rhs_chunk),
                         params.fused_kernel,
                         reinterpret_cast<OutType*>(result_chunk));
    }
  }
};

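// Mirror image of GemmExecutorPackRHS: packs the whole LHS up front and then
// streams the RHS through it one kernel_n-column chunk at a time.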
class GemmExecutorPackLHS {
 public:
  template <typename P>
  static int EstimateScratchSize(const P& params, int kernel_m, int kernel_n,
                                 int kernel_k) {
    const int lhs_chunks = ((params.m + kernel_m - 1) / kernel_m);
    const int lhs_scratch =
        lhs_chunks *
        StreamUtil<typename P::InType, typename P::LeftStream>::Scratch(
            params.left_stream, kernel_m, kernel_k);
    const int rhs_scratch =
        StreamUtil<typename P::InType, typename P::RightStream>::Scratch(
            params.right_stream, kernel_n, kernel_k);
    return AlignTo<64 * 1024>(lhs_scratch + rhs_scratch);
  }

  template <typename P, int m, int n, int k, int m_leftovers, int n_leftovers,
            int k_leftovers>
  static void ExecuteDispatch3D(const P& params) {
    // Shorthand typedefs for streams and multiply kernels.
    typedef typename P::InType InType;
    typedef typename P::OutType OutType;

    typedef Stream<typename P::InType, m, k, k_leftovers,
                   typename P::LeftStream>
        LeftStreamF;
    typedef Stream<typename P::InType, m_leftovers, k, k_leftovers,
                   typename P::LeftStream>
        LeftStreamL;

    typedef Stream<typename P::InType, n, k, k_leftovers,
                   typename P::RightStream>
        RightStreamF;
    typedef Stream<typename P::InType, n_leftovers, k, k_leftovers,
                   typename P::RightStream>
        RightStreamL;

    typedef Stream<typename P::OutType, m, n, 0, typename P::OutputStream>
        OutputStreamFF;
    typedef Stream<typename P::OutType, m, n_leftovers, 0,
                   typename P::OutputStream>
        OutputStreamFL;

    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m, n, k>
        KernelFF;
    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m,
                      n_leftovers, k>
        KernelFL;
    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m_leftovers,
                      n, k>
        KernelLF;
    typedef MulKernel<typename P::InType, typename P::OutType,
                      typename P::Kernel, typename P::OutputStream, m_leftovers,
                      n_leftovers, k>
        KernelLL;
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "GemmExecutor(" << typeid(P).name() << "): " << m << "x" << n
              << "x" << k << " -- " << m_leftovers << "x" << n_leftovers << "x"
              << k_leftovers << " -- " << params.m << "x" << params.n << "x"
              << params.k << std::endl;
    LeftStreamF::Debug(params.left_stream);
    LeftStreamL::Debug(params.left_stream);

    RightStreamF::Debug(params.right_stream);
    RightStreamL::Debug(params.right_stream);

    OutputStreamFF::Debug(params.fused_kernel.output_stream);
    OutputStreamFL::Debug(params.fused_kernel.output_stream);

    KernelFF::Debug(params.fused_kernel);
    KernelFL::Debug(params.fused_kernel);
    KernelLF::Debug(params.fused_kernel);
    KernelLL::Debug(params.fused_kernel);
#endif
#endif

    int lhs_chunks = params.m / m;
    int rhs_chunks = params.n / n;

    // Scratch memory for packed LHS & RHS chunks.
    std::uint8_t* packed_rhs = params.scratch;
    std::uint8_t* packed_lhs =
        params.scratch + RightStreamF::Scratch(params.right_stream);

    // Pack full LHS first.

    std::uint8_t* packed_lhs_chunk = packed_lhs;
    const int packed_lhs_chunk_size =
        LeftStreamF::PackedStride(params.left_stream);

    {
      const std::uint8_t* lhs_chunk =
          reinterpret_cast<const std::uint8_t*>(params.lhs);
      const int lhs_chunk_size =
          LeftStreamF::UnpackedStride(params.left_stream);

      for (int i = 0; i < lhs_chunks; ++i) {
        LeftStreamF::Pack(reinterpret_cast<const InType*>(lhs_chunk),
                          params.left_stream,
                          reinterpret_cast<InType*>(packed_lhs_chunk));

        lhs_chunk += lhs_chunk_size;
        packed_lhs_chunk += packed_lhs_chunk_size;
      }

      LeftStreamL::Pack(reinterpret_cast<const InType*>(lhs_chunk),
                        params.left_stream,
                        reinterpret_cast<InType*>(packed_lhs_chunk));
    }

    // Multiply RHS by LHS one RHS chunk at a time.

    const std::uint8_t* rhs_chunk =
        reinterpret_cast<const std::uint8_t*>(params.rhs);
    std::uint8_t* result_strip = reinterpret_cast<std::uint8_t*>(params.result);
    std::uint8_t* result_chunk = result_strip;

    {
      const int rhs_chunk_size =
          RightStreamF::UnpackedStride(params.right_stream);
      const int result_strip_size =
          OutputStreamFF::UnpackedAdvance(params.fused_kernel.output_stream);
      const int result_chunk_size =
          OutputStreamFF::UnpackedStride(params.fused_kernel.output_stream);

      for (int i = 0; i < rhs_chunks; ++i) {
        RightStreamF::Pack(reinterpret_cast<const InType*>(rhs_chunk),
                           params.right_stream,
                           reinterpret_cast<InType*>(packed_rhs));

        result_chunk = result_strip;
        packed_lhs_chunk = packed_lhs;

        for (int j = 0; j < lhs_chunks; ++j) {
          KernelFF::Multiply(reinterpret_cast<const InType*>(packed_lhs_chunk),
                             reinterpret_cast<const InType*>(packed_rhs),
                             params.fused_kernel,
                             reinterpret_cast<OutType*>(result_chunk));

          result_chunk += result_chunk_size;
          packed_lhs_chunk += packed_lhs_chunk_size;
        }

        KernelLF::Multiply(reinterpret_cast<const InType*>(packed_lhs_chunk),
                           reinterpret_cast<const InType*>(packed_rhs),
                           params.fused_kernel,
                           reinterpret_cast<OutType*>(result_chunk));

        rhs_chunk += rhs_chunk_size;
        result_strip += result_strip_size;
      }
    }

    // Leftover RHS chunk.
    if (n_leftovers > 0) {  // static if
      const int result_chunk_size =
          OutputStreamFL::UnpackedStride(params.fused_kernel.output_stream);

      RightStreamL::Pack(reinterpret_cast<const InType*>(rhs_chunk),
                         params.right_stream,
                         reinterpret_cast<InType*>(packed_rhs));

      result_chunk = result_strip;
      packed_lhs_chunk = packed_lhs;

      for (int i = 0; i < lhs_chunks; ++i) {
        KernelFL::Multiply(reinterpret_cast<const InType*>(packed_lhs_chunk),
                           reinterpret_cast<const InType*>(packed_rhs),
                           params.fused_kernel,
                           reinterpret_cast<OutType*>(result_chunk));

        result_chunk += result_chunk_size;
        packed_lhs_chunk += packed_lhs_chunk_size;
      }

      KernelLL::Multiply(reinterpret_cast<const InType*>(packed_lhs_chunk),
                         reinterpret_cast<const InType*>(packed_rhs),
                         params.fused_kernel,
                         reinterpret_cast<OutType*>(result_chunk));
    }
  }
};

namespace internal {

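// Returns the number of tasks a dimension has to be split into so that the
// packed data touched by one task (constant_memory plus one per_chunk_memory
// block per chunk of that task) fits within cache_size bytes.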
inline int CalculateCacheFriendlyTasksCount(int cache_size, int constant_memory,
                                            int per_chunk_memory, int total_dim,
                                            int chunk_dim) {
  assert(constant_memory + per_chunk_memory < cache_size);
  const int available_cache = cache_size - constant_memory;
  const int available_chunks = available_cache / per_chunk_memory;
  const int chunks_count = (total_dim + chunk_dim - 1) / chunk_dim;
  return (chunks_count + available_chunks - 1) / available_chunks;
}

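// Rewrites the m/n sizes and the lhs/rhs/result pointers of *task_params so
// that it describes the (m_offset, n_offset) sub-problem of params.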
template <typename Params>
inline void UpdateCacheFriendlyTask(int m_offset, int m, int n_offset, int n,
                                    const Params& params, Params* task_params) {
  task_params->m = m;
  task_params->lhs =
      StreamUtil<typename Params::InType, typename Params::LeftStream>::Offset(
          params.left_stream, params.lhs, m_offset, 0);

  task_params->n = n;
  task_params->rhs =
      StreamUtil<typename Params::InType, typename Params::RightStream>::Offset(
          params.right_stream, params.rhs, n_offset, 0);

  task_params->result =
      StreamUtil<typename Params::OutType, typename Params::OutputStream>::
          Offset(params.fused_kernel.output_stream, params.result, m_offset,
                 n_offset);
}

}  // namespace internal

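// Cache-friendly wrapper around GemmExecutorPackRHS: when the packed operands
// would exceed cache_size, the N dimension is split into smaller tasks that
// are executed one after another.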
template <int cache_size = 256 * 1024>
class GemmExecutorPackRHSCacheFriendly {
 public:
  template <typename P>
  static int EstimateScratchSize(const P& params, int kernel_m, int kernel_n,
                                 int kernel_k) {
    return cache_size;
  }

  template <typename P, int m, int n, int k, int m_leftovers, int n_leftovers,
            int k_leftovers>
  static void ExecuteDispatch3D(const P& params) {
    typedef Stream<typename P::InType, m, k, k_leftovers,
                   typename P::LeftStream>
        LeftStream;

    typedef Stream<typename P::InType, n, k, k_leftovers,
                   typename P::RightStream>
        RightStream;

    const int lhs_scratch = LeftStream::Scratch(params.left_stream);
    const int rhs_scratch = RightStream::Scratch(params.right_stream);

    const int cache_friendly_tasks_count =
        internal::CalculateCacheFriendlyTasksCount(cache_size, lhs_scratch,
                                                   rhs_scratch, params.n, n);

    if (cache_friendly_tasks_count == 1) {
      GemmExecutorPackRHS::ExecuteDispatch3D<P, m, n, k, m_leftovers,
                                             n_leftovers, k_leftovers>(params);
      return;
    }

    const int cache_friendly_dim = params.n / cache_friendly_tasks_count;

    P task_params = params;
    for (int i = 0; i < cache_friendly_tasks_count - 1; ++i) {
      internal::UpdateCacheFriendlyTask(0, params.m, i * cache_friendly_dim,
                                        cache_friendly_dim, params,
                                        &task_params);
      Gemm<GemmExecutorPackRHS, P, m, n, k>(task_params);
    }
    const int dim_sum = (cache_friendly_tasks_count - 1) * cache_friendly_dim;
    internal::UpdateCacheFriendlyTask(0, params.m, dim_sum, params.n - dim_sum,
                                      params, &task_params);
    Gemm<GemmExecutorPackRHS, P, m, n, k>(task_params);
  }
};

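// Cache-friendly wrapper around GemmExecutorPackLHS; splits the M dimension
// instead of N.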
template <int cache_size = 256 * 1024>
class GemmExecutorPackLHSCacheFriendly {
 public:
  template <typename P>
  static int EstimateScratchSize(const P& params, int kernel_m, int kernel_n,
                                 int kernel_k) {
    return cache_size;
  }

  template <typename P, int m, int n, int k, int m_leftovers, int n_leftovers,
            int k_leftovers>
  static void ExecuteDispatch3D(const P& params) {
    typedef Stream<typename P::InType, m, k, k_leftovers,
                   typename P::LeftStream>
        LeftStream;

    typedef Stream<typename P::InType, n, k, k_leftovers,
                   typename P::RightStream>
        RightStream;

    const int lhs_scratch = LeftStream::Scratch(params.left_stream);
    const int rhs_scratch = RightStream::Scratch(params.right_stream);

    const int cache_friendly_tasks_count =
        internal::CalculateCacheFriendlyTasksCount(cache_size, rhs_scratch,
                                                   lhs_scratch, params.m, m);

    if (cache_friendly_tasks_count == 1) {
      GemmExecutorPackLHS::ExecuteDispatch3D<P, m, n, k, m_leftovers,
                                             n_leftovers, k_leftovers>(params);
      return;
    }

    const int cache_friendly_dim = params.m / cache_friendly_tasks_count;

    P task_params = params;
    for (int i = 0; i < cache_friendly_tasks_count - 1; ++i) {
      internal::UpdateCacheFriendlyTask(i * cache_friendly_dim,
                                        cache_friendly_dim, 0, params.n, params,
                                        &task_params);
      Gemm<GemmExecutorPackLHS, P, m, n, k>(task_params);
    }
    const int dim_sum = (cache_friendly_tasks_count - 1) * cache_friendly_dim;
    internal::UpdateCacheFriendlyTask(dim_sum, params.m - dim_sum, 0, params.n,
                                      params, &task_params);
    Gemm<GemmExecutorPackLHS, P, m, n, k>(task_params);
  }
};

namespace internal {

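// Three-stage compile-time dispatch. Stage 1 matches the M leftover, stage 2
// the N leftover and stage 3 the K leftover; each stage counts down from
// dim - 1 to 0, comparing the runtime value against its template parameter,
// so that ExecuteDispatch3D is ultimately instantiated with all leftover
// sizes known at compile time.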
// Stage 3.

template <typename E, typename P, int dim_m, int dim_n, int dim_k, int fixed_m,
          int fixed_n, int variable_k>
struct Dispatch3DStage3 {
  static void Execute(const P& params, int k) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "Dispatch(3): " << dim_m << "x" << dim_n << "x" << dim_k
              << " : " << fixed_m << "x" << fixed_n << "x" << variable_k
              << std::endl
              << std::flush;
#endif
#endif
    if (k == variable_k) {
      E::template ExecuteDispatch3D<P, dim_m, dim_n, dim_k, fixed_m, fixed_n,
                                    variable_k>(params);
    } else {
      Dispatch3DStage3<E, P, dim_m, dim_n, dim_k, fixed_m, fixed_n,
                       variable_k - 1>::Execute(params, k);
    }
  }
};

template <typename E, typename P, int dim_m, int dim_n, int dim_k, int fixed_m,
          int fixed_n>
struct Dispatch3DStage3<E, P, dim_m, dim_n, dim_k, fixed_m, fixed_n, 0> {
  static void Execute(const P& params, int k) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "Dispatch(3): " << dim_m << "x" << dim_n << "x" << dim_k
              << " : " << fixed_m << "x" << fixed_n << "x" << 0 << std::endl
              << std::flush;
#endif
#endif
    if (k == 0) {
      E::template ExecuteDispatch3D<P, dim_m, dim_n, dim_k, fixed_m, fixed_n,
                                    0>(params);
    } else {
      std::cerr << "FATAL: dispatch3DStage3 failed: ran out of cases."
                << std::endl
                << std::flush;
      std::exit(1);
    }
  }
};

// Stage 2.

template <typename E, typename P, int dim_m, int dim_n, int dim_k, int fixed_m,
          int variable_n>
struct Dispatch3DStage2 {
  static void Execute(const P& params, int n, int k) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "Dispatch(2): " << dim_m << "x" << dim_n << "x" << dim_k
              << " : " << fixed_m << "x" << variable_n << std::endl
              << std::flush;
#endif
#endif
    if (n == variable_n) {
      Dispatch3DStage3<E, P, dim_m, dim_n, dim_k, fixed_m, variable_n,
                       dim_k - 1>::Execute(params, k);
    } else {
      Dispatch3DStage2<E, P, dim_m, dim_n, dim_k, fixed_m,
                       variable_n - 1>::Execute(params, n, k);
    }
  }
};

template <typename E, typename P, int dim_m, int dim_n, int dim_k, int fixed_m>
struct Dispatch3DStage2<E, P, dim_m, dim_n, dim_k, fixed_m, 0> {
  static void Execute(const P& params, int n, int k) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "Dispatch(2): " << dim_m << "x" << dim_n << "x" << dim_k
              << " : " << fixed_m << "x" << 0 << std::endl
              << std::flush;
#endif
#endif
    if (n == 0) {
      Dispatch3DStage3<E, P, dim_m, dim_n, dim_k, fixed_m, 0,
                       dim_k - 1>::Execute(params, k);
    } else {
      std::cerr << "FATAL: dispatch3DStage2 failed: ran out of cases."
                << std::endl
                << std::flush;
      std::exit(1);
    }
  }
};

// Stage 1.

template <typename E, typename P, int dim_m, int dim_n, int dim_k,
          int variable_m>
struct Dispatch3DStage1 {
  static void Execute(const P& params, int m, int n, int k) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "Dispatch(1): " << dim_m << "x" << dim_n << "x" << dim_k
              << " : " << variable_m << std::endl
              << std::flush;
#endif
#endif
    if (m == variable_m) {
      Dispatch3DStage2<E, P, dim_m, dim_n, dim_k, variable_m,
                       dim_n - 1>::Execute(params, n, k);
    } else {
      Dispatch3DStage1<E, P, dim_m, dim_n, dim_k, variable_m - 1>::Execute(
          params, m, n, k);
    }
  }
};

template <typename E, typename P, int dim_m, int dim_n, int dim_k>
struct Dispatch3DStage1<E, P, dim_m, dim_n, dim_k, 0> {
  static void Execute(const P& params, int m, int n, int k) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
    std::cout << "Dispatch(1): " << dim_m << "x" << dim_n << "x" << dim_k
              << " : " << 0 << std::endl
              << std::flush;
#endif
#endif
    if (m == 0) {
      Dispatch3DStage2<E, P, dim_m, dim_n, dim_k, 0, dim_n - 1>::Execute(params,
                                                                         n, k);
    } else {
      std::cerr << "FATAL: dispatch3DStage1 failed: ran out of cases."
                << std::endl
                << std::flush;
      std::exit(1);
    }
  }
};

}  // namespace internal

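// See the forward declaration at the top of this file: translates the runtime
// leftover sizes into the arguments expected by the dispatch chain above,
// which selects the matching Executor::ExecuteDispatch3D instantiation.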
template <typename Executor, typename Params, int kernel_m, int kernel_n,
          int kernel_k>
inline void Gemm(const Params& params) {
  internal::Dispatch3DStage1<Executor, Params, kernel_m, kernel_n, kernel_k,
                             kernel_m - 1>::Execute(params, params.m % kernel_m,
                                                    params.n % kernel_n,
                                                    params.k % kernel_k);
}

}  // namespace meta
}  // namespace gemmlowp

#endif  // GEMMLOWP_META_SINGLE_THREAD_GEMM_H_