// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

/**
 * \class TensorExecutor
 * \ingroup CXX11_Tensor_Module
 *
 * \brief The tensor executor class.
 *
 * This class is responsible for launching the evaluation of the expression on
 * the specified computing device.
 *
 * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
 *                      instructions)
 * @tparam Tiling       can use block based tensor evaluation
 *                      (see TensorBlock.h)
 */
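// Illustrative usage sketch (not part of this header): TensorExecutor is not
// called directly. Assigning to a tensor (or to a tensor .device(...)
// expression) builds a TensorAssignOp and hands it to the executor that
// matches the selected device. For example, with the default device:
//
//   Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
//   a.setRandom();
//   b.setRandom();
//   c = a + b;  // evaluated by TensorExecutor<..., DefaultDevice, ...>::run()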
namespace internal {

/**
 * Evaluating TensorBroadcastingOp via the coefficient or packet path is
 * extremely expensive. If an expression has at least one broadcast op in it
 * and it supports block based evaluation, we always prefer the block path,
 * even for small tensors. For all other tileable ops, the block evaluation
 * overhead for small tensors (those that fit into L1) is too large, and we
 * fall back to vectorized evaluation. (See the illustrative example after the
 * trait specializations below.)
 */

// TODO(ezhulenev): Add specializations for all other types of Tensor ops.

template<typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template<typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template<typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
        ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template<typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
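// Illustrative example (hypothetical expression types Lhs, Rhs, X, functor Op
// and broadcast spec Bcast): for an assignment whose right hand side contains
// a broadcast,
//
//   const TensorAssignOp<Lhs, const TensorCwiseBinaryOp<
//       Op, const TensorBroadcastingOp<Bcast, X>, Rhs> >
//
// the trait recurses through the assign and binary ops and yields
// value == true; for the same expression without the broadcast it falls back
// to the primary template and yields value == false.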

// -------------------------------------------------------------------------- //

/**
 * Default strategy: the expression is evaluated sequentially with a single
 * CPU thread, without vectorization or block-based evaluation.
 */
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
  // violation. If this template is instantiated with a non-default device, it
  // means that this header file was included without defining
  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};

/**
 * The default async execution strategy is not implemented. Currently it is
 * only available for ThreadPoolDevice (see the definition below).
 */
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};

/**
 * Process all the data with a single CPU thread, using vectorized instructions.
 */
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong opportunity to unroll the loop, but don't
      // insist on it: if the evaluated expression is expensive, the compiler
      // should not unroll the loop at the expense of inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
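      // For example (illustrative numbers): with size == 1000 and
      // PacketSize == 4, UnrolledSize == 992 and VectorizedSize == 1000, so
      // the unrolled loop below covers [0, 992), the vectorized loop covers
      // [992, 1000), and the scalar tail loop is empty.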
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};

/**
 * Process all the data with a single CPU thread, using blocks of data. By
 * sizing each block to fit in the L1 cache we get better cache performance.
 */
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static const int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                         const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);

    // TODO(ezhulenev): Do not use tiling for small tensors?
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query expression tree for desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};

/**
 * Multicore strategy: the index space is partitioned and each partition is
 * executed on a single core.
 *
 * (1) TensorExecutor will submit work to the ThreadPoolDevice-managed thread
 *     pool, and will block the calling thread until all tasks are finished.
 *
 * (2) TensorAsyncExecutor is a non-blocking version that submits work to the
 *     ThreadPoolDevice-managed thread pool and returns immediately. It calls
 *     the 'done' callback after all tasks are finished.
 */
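// Illustrative usage sketch (assuming EIGEN_USE_THREADS is defined before
// including the Tensor module, and float tensors a, b, c as in the sketch
// near the top of this file; the async overload shown is the TensorAsyncDevice
// path added alongside TensorAsyncExecutor and may not exist in older
// versions):
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice dev(&pool, /*num_cores=*/4);
//
//   c.device(dev) = a + b;                       // blocking TensorExecutor
//   c.device(dev, []() { /* done */ }) = a + b;  // TensorAsyncExecutor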
#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};

// Computes block evaluation parameters and allocates a temporary memory
// buffer for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query expression tree for desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update target block size based on cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
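  // For example (illustrative numbers): with float scalars, block_size == 1000
  // and align == 64, the payload is 4000 bytes and divup(4000, 64) == 63, so
  // aligned_blocksize == 63 * 64 == 4032 bytes.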

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}

template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};

template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong opportunity to unroll the loop, but don't
      // insist on it: if the evaluated expression is expensive, the compiler
      // should not unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block size to packet size and account for unrolling in run above.
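    // The masks below round up to a multiple of a power-of-two N via
    // (size + N - 1) & ~(N - 1); e.g. (illustrative) with PacketSize == 8 and
    // size == 1000, rounding up to 4 * PacketSize == 32 gives 1024.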
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // For smaller sizes, aligning to 4 * PacketSize would increase the block
    // size by more than 25%, so align only to PacketSize.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                         const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};

template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};

template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {

    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
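// Illustrative usage sketch (assuming EIGEN_USE_GPU and a CUDA/HIP compiler,
// and float tensors a, b, c as in the sketch near the top of this file;
// GpuStreamDevice/GpuDevice are the stream-backed device types used by this
// path):
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   c.device(gpu_device) = a + b;  // launches EigenMetaKernel defined below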

#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {

    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
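    // For example (illustrative numbers): block_size == 1024 and a device
    // with 20 multiprocessors x 2048 threads each gives max_blocks == 40;
    // for size == 1 << 20, divup(size, block_size) == 1024, so num_blocks
    // == 40 and each thread strides over several elements in the kernel.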

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};
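// Illustrative usage sketch (assuming EIGEN_USE_SYCL and a SYCL compiler, and
// float tensors a, b, c as in the sketch near the top of this file;
// QueueInterface/SyclDevice are the device types used by this backend, and the
// exact QueueInterface constructor may differ across Eigen versions):
//
//   Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   c.device(sycl_device) = a + b;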

#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H