// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

/**
 * \class TensorExecutor
 * \ingroup CXX11_Tensor_Module
 *
 * \brief The tensor executor class.
 *
 * This class is responsible for launching the evaluation of the expression on
 * the specified computing device.
 *
 * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
 *                      instructions)
 * @tparam Tiling       can use block based tensor evaluation
 *                      (see TensorBlock.h)
 */
namespace internal {

/**
 * Evaluating TensorBroadcastingOp via the coefficient or packet path is
 * extremely expensive. If an expression has at least one broadcast op in it,
 * and it supports block based evaluation, we always prefer it, even for small
 * tensors. For all other tileable ops, the block evaluation overhead for small
 * tensors (fitting into L1) is too large, and we fall back on vectorized
 * evaluation.
 */

// TODO(ezhulenev): Add specializations for all other types of Tensor ops.

template<typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template<typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template<typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template<typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};

// -------------------------------------------------------------------------- //

/**
 * Default strategy: the expression is evaluated sequentially with a single CPU
 * thread, without vectorization and block evaluation.
 */
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
  // violation. If this template is instantiated with a non-default device, it
  // means that this header file was included without defining
  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
" 92 "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or " 93 "EIGEN_USE_SYCL before including Eigen headers."); 94 95 EIGEN_DEVICE_FUNC 96 static EIGEN_STRONG_INLINE void run(const Expression& expr, 97 const Device& device = Device()) { 98 TensorEvaluator<Expression, Device> evaluator(expr, device); 99 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); 100 if (needs_assign) { 101 const StorageIndex size = array_prod(evaluator.dimensions()); 102 for (StorageIndex i = 0; i < size; ++i) { 103 evaluator.evalScalar(i); 104 } 105 } 106 evaluator.cleanup(); 107 } 108 }; 109 110 /** 111 * Default async execution strategy is not implemented. Currently it's only 112 * available for ThreadPoolDevice (see definition below). 113 */ 114 template <typename Expression, typename Device, typename DoneCallback, 115 bool Vectorizable, TiledEvaluation Tiling> 116 class TensorAsyncExecutor {}; 117 118 /** 119 * Process all the data with a single cpu thread, using vectorized instructions. 120 */ 121 template <typename Expression> 122 class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true, 123 /*Tiling=*/TiledEvaluation::Off> { 124 public: 125 typedef typename Expression::Index StorageIndex; 126 127 EIGEN_DEVICE_FUNC 128 static EIGEN_STRONG_INLINE void run( 129 const Expression& expr, const DefaultDevice& device = DefaultDevice()) { 130 TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); 131 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); 132 if (needs_assign) { 133 const StorageIndex size = array_prod(evaluator.dimensions()); 134 const int PacketSize = unpacket_traits<typename TensorEvaluator< 135 Expression, DefaultDevice>::PacketReturnType>::size; 136 137 // Give compiler a strong possibility to unroll the loop. But don't insist 138 // on unrolling, because if the function is expensive compiler should not 139 // unroll the loop at the expense of inlining. 140 const StorageIndex UnrolledSize = 141 (size / (4 * PacketSize)) * 4 * PacketSize; 142 for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { 143 for (StorageIndex j = 0; j < 4; j++) { 144 evaluator.evalPacket(i + j * PacketSize); 145 } 146 } 147 const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; 148 for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { 149 evaluator.evalPacket(i); 150 } 151 for (StorageIndex i = VectorizedSize; i < size; ++i) { 152 evaluator.evalScalar(i); 153 } 154 } 155 evaluator.cleanup(); 156 } 157 }; 158 159 /** 160 * Process all the data with a single cpu thread, using blocks of data. By 161 * sizing a block to fit L1 cache we get better cache performance. 
 */
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static const int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);

    // TODO(ezhulenev): Do not use tiling for small tensors?
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query expression tree for desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};

/**
 * Multicore strategy: the index space is partitioned and each partition is
 * executed on a single core.
 *
 * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
 *     pool, and will block the caller thread until all tasks are finished.
 *
 * (2) TensorAsyncExecutor is a non-blocking version that will submit work to
 *     the ThreadPoolDevice managed thread pool, and will return immediately.
 *     It will call the 'done' callback after all tasks are finished.
 */
#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};

// Computes block evaluation parameters, and allocates a temporary memory
// buffer for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query expression tree for desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
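  // Rough intuition (the numbers below are purely illustrative):
  // taskSize(1, cost_per_coeff) estimates what fraction of a reasonably sized
  // parallel task a single coefficient represents, so its inverse is the
  // number of coefficients that amount to one task. If taskSize(1, cost) were
  // 1e-4, the requested block size would be ~10000 coefficients; a more
  // expensive expression yields a larger taskSize and hence smaller blocks.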
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}

template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};

template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong opportunity to unroll the loop. But don't
      // insist on unrolling, because if the function is expensive the compiler
      // should not unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase block size by more than 25%.
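    // For example (PacketSize == 8 assumed purely for illustration):
    // size == 100 rounds up to 104 here, whereas rounding to a multiple of
    // 4 * PacketSize would give 128, a 28% increase.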
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
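      // With a single block there is nothing to parallelize, so evaluating it
      // inline on the calling thread skips the parallelFor dispatch overhead.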
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};

template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};

template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};

#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {

    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls
    // with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H