// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

/** \class TensorExecutor
  * \ingroup CXX11_Tensor_Module
  *
  * \brief The tensor executor class.
  *
  * This class is responsible for launching the evaluation of the expression on
  * the specified computing device.
  */
namespace internal {

// Default strategy: the expression is evaluated with a single cpu thread.
template<typename Expression, typename Device, bool Vectorizable>
class TensorExecutor
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const Device& device = Device())
  {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};


template<typename Expression>
class TensorExecutor<Expression, DefaultDevice, true>
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
  {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
      // unroll the loop at the expense of inlining.
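      // Illustrative walkthrough (added commentary, assuming a hypothetical PacketSize
      // of 4 and size of 35): UnrolledSize below is (35 / 16) * 16 == 32, so the
      // unrolled loop covers indices [0, 32) in two passes of four packets each;
      // VectorizedSize is also (35 / 4) * 4 == 32, so the intermediate packet loop
      // does nothing in this case, and the trailing scalar loop handles the remaining
      // indices 32, 33 and 34.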
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};


// Multicore strategy: the index space is partitioned and each partition is executed on a single core
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    return size;
  }
};

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    Index i = first;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index last_chunk_offset = last - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
      // unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase block size by more than 25%.
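    // Illustrative example (added commentary, assuming PacketSize == 4): a block of
    // size 49 is below the 16 * PacketSize == 64 threshold above. Rounding it up to a
    // multiple of 4 * PacketSize == 16 would give 64 (roughly +31%), whereas rounding
    // up to PacketSize below gives 52 (roughly +6%).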
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
  {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
                         [&evaluator](Index first, Index last) {
                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
                         });
#else
      size_t num_threads = device.numThreads();
      if (num_threads > 1) {
        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
            size, evaluator.costPerCoeff(Vectorizable), num_threads);
      }
      if (num_threads == 1) {
        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
      } else {
        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
        const Index numblocks = size / blocksize;

        Barrier barrier(numblocks);
        for (Index i = 0; i < numblocks; ++i) {
          device.enqueue_with_barrier(
              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
              &evaluator, i * blocksize, (i + 1) * blocksize);
        }
        if (numblocks * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(
              &evaluator, numblocks * blocksize, size);
        }
        barrier.Wait();
      }
#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
    }
    evaluator.cleanup();
  }
};
#endif  // EIGEN_USE_THREADS

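// Illustrative usage sketch (added commentary, not part of the original file): these
// executors are not usually invoked directly. A device assignment such as
//   result.device(my_device) = a + b;
// builds a TensorAssignOp expression and dispatches roughly as
//   internal::TensorExecutor<const Assign, Device>::run(assign, my_device);
// where the Vectorizable flag is normally supplied by a default template argument
// declared elsewhere. The names `result`, `a`, `b`, `my_device` and `assign` are
// hypothetical.
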
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};


#if defined(__CUDACC__)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;

    // Use the vector path
    for (Index i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (Index i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, Index size) {

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif  // __CUDACC__
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
 public:
  static inline void run(const Expression &expr, const SyclDevice &device) {
    // call TensorSYCL module
    TensorSycl::run(expr, device);
  }
};

#endif

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H