/external/tensorflow/tensorflow/compiler/xla/service/gpu/ |
D | parallel_loop_emitter.cc |
      75  const int unroll_factor =    in EmitLinearBaseAndThreadIdx() local
      76  launch_config_.unroll_factor > 1 ? launch_config_.unroll_factor : 1;    in EmitLinearBaseAndThreadIdx()
      83  ? launch_dimensions_.total_nb_threads() * unroll_factor    in EmitLinearBaseAndThreadIdx()
     131  if (!launch_config_.logical_order && launch_config_.unroll_factor > 1) {    in EmitLinearBaseAndThreadIdx()
     134  llvm::ConstantInt::get(index_type, launch_config_.unroll_factor),    in EmitLinearBaseAndThreadIdx()
     155  const int unroll_factor = launch_config_.unroll_factor;    in EmitLogicalIndexAndSetExitBasicBlock() local
     159  for (int i = 0; i < unroll_factor; ++i) {    in EmitLogicalIndexAndSetExitBasicBlock()
     226  launch_config_.unroll_factor);    in EmitIndexAndSetExitBasicBlock()
     232  << launch_config_.unroll_factor;    in EmitIndexAndSetExitBasicBlock()
     259  llvm::ConstantInt::get(index_type, launch_config_.unroll_factor),    in EmitIndexAndSetExitBasicBlock()
     [all …]
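The emitter above lowers an unrolled element-wise loop to LLVM IR: the flat thread id is scaled by the unroll factor and a short inner loop covers the extra elements per thread. A minimal CUDA sketch of the kernel shape this corresponds to; the names elementwise_unrolled and op are illustrative, not from XLA, which emits this structure as IR rather than CUDA C++:

    template <int kUnrollFactor, typename T, typename Op>
    __global__ void elementwise_unrolled(const T* in, T* out, long long n, Op op) {
      // Linear base index = flat thread id scaled by the unroll factor, so each
      // thread owns kUnrollFactor consecutive elements (compare lines 75-83).
      long long base =
          (static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x) * kUnrollFactor;
    #pragma unroll
      for (int i = 0; i < kUnrollFactor; ++i) {   // compare line 159
        long long idx = base + i;
        if (idx < n) out[idx] = op(in[idx]);
      }
    }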
|
D | launch_dimensions.cc |
      64  shape.dimensions().back() / dim_config.unroll_factor;    in ThreadsPerBlockRowVectorized()
      66  shape.dimensions().back() % dim_config.unroll_factor == 0 &&    in ThreadsPerBlockRowVectorized()
      87  CHECK_EQ(num_elements % dim_config.unroll_factor, 0);    in CalculateLaunchDimensions()
      88  num_elements = num_elements / dim_config.unroll_factor;    in CalculateLaunchDimensions()
     114  dim_config.unroll_factor,    in CalculateLaunchDimensions()
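The arithmetic on lines 87-88 divides the element count by the unroll factor before sizing the grid, since each thread now covers unroll_factor elements. A host-side sketch of that calculation, assuming a hypothetical helper name and a 256-thread block size (neither is XLA's actual default):

    #include <cassert>
    #include <utility>

    // Returns {block_count, threads_per_block}; name and defaults are assumptions.
    std::pair<long long, int> compute_launch_dims_sketch(long long num_elements,
                                                         int unroll_factor,
                                                         int threads_per_block = 256) {
      assert(num_elements % unroll_factor == 0);   // compare line 87
      num_elements /= unroll_factor;               // compare line 88
      long long block_count =
          (num_elements + threads_per_block - 1) / threads_per_block;
      return {block_count, threads_per_block};
    }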
|
D | launch_dimensions.h |
      86  int unroll_factor = 1;    member
     104  "unroll_factor=", unroll_factor, ", few_waves=", few_waves,    in ToString()
|
D | ir_emitter_unnested.cc |
     730  int unroll_factor = 1;    in EmitPadToStatic() local
     737  {unroll_factor}));    in EmitPadToStatic()
     845  {unroll_factor})    in EmitPadToStatic()
     856  int unroll_factor = 1;    in EmitSliceToDynamic() local
     863  {unroll_factor}));    in EmitSliceToDynamic()
     968  {unroll_factor})    in EmitSliceToDynamic()
    1561  (launch_dimensions.launch_bound() * launch_config.unroll_factor)) {    in EnableLogicalIndexGenerationForOutput()
    1685  int unroll_factor;    in EmitLoopFusion() local
    1687  unroll_factor = ComputeMaxUnrollFactor(fusion, hlo_module_config_);    in EmitLoopFusion()
    1689  unroll_factor = 1;    in EmitLoopFusion()
    [all …]
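EmitLoopFusion either asks ComputeMaxUnrollFactor for a factor or falls back to 1 (lines 1685-1689). A rough sketch of the kind of heuristic such a helper can use, trying larger factors first and keeping the first one that divides the element count; the candidate list below is an assumption, not XLA's actual policy:

    #include <initializer_list>

    // Hypothetical heuristic, not the real ComputeMaxUnrollFactor.
    int max_unroll_factor_sketch(long long num_elements) {
      for (int factor : {4, 2}) {
        if (num_elements % factor == 0) return factor;
      }
      return 1;  // No unrolling if nothing divides evenly (compare line 1689).
    }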
|
/external/pytorch/aten/src/ATen/native/cuda/ |
D | RreluWithNoise.cu |
      19  template <typename scalar_t, int unroll_factor, typename F>
      40  int grid_stride = blockDim.x * gridDim.x * unroll_factor;    in rrelu_with_noise_cuda_kernel()
      48  static_assert(sizeof(rand)/sizeof(rand.x) == unroll_factor, "");    in rrelu_with_noise_cuda_kernel()
      51  for (int ii = 0; ii < unroll_factor; ii++) {    in rrelu_with_noise_cuda_kernel()
      83  const int unroll_factor = std::is_same<scalar_t, double>::value ? 2 : 4;    in _rrelu_with_noise_cuda_train() local
      84  auto execution_policy = calc_execution_policy(numel, unroll_factor);    in _rrelu_with_noise_cuda_train()
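Here the unroll factor is tied to the vector width of the RNG draw (line 83: 2 for double, 4 otherwise, matching double2/float4 draws), and the grid stride advances by blockDim.x * gridDim.x * unroll_factor (line 40). A simplified CUDA sketch of that pattern for the float case; the seeding scheme and the transform functor are assumptions, not the PyTorch implementation:

    #include <curand_kernel.h>

    // Illustrative only: fixed unroll factor of 4 to match one float4 draw.
    template <typename transform_t>
    __global__ void grid_stride_rng_sketch(float* out, long long numel,
                                           unsigned long long seed,
                                           transform_t transform) {
      constexpr int unroll_factor = 4;  // == sizeof(float4) / sizeof(float), compare line 48
      int tid = blockIdx.x * blockDim.x + threadIdx.x;
      curandStatePhilox4_32_10_t state;
      curand_init(seed, tid, /*offset=*/0, &state);

      long long grid_stride =
          static_cast<long long>(blockDim.x) * gridDim.x * unroll_factor;  // compare line 40
      for (long long base = static_cast<long long>(tid) * unroll_factor;
           base < numel; base += grid_stride) {
        float4 rand = curand_uniform4(&state);     // one draw feeds 4 outputs
        float r[unroll_factor] = {rand.x, rand.y, rand.z, rand.w};
    #pragma unroll
        for (int ii = 0; ii < unroll_factor; ++ii) {  // compare line 51
          long long idx = base + ii;
          if (idx < numel) out[idx] = transform(r[ii]);
        }
      }
    }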
|
D | DistributionTemplates.h |
      49  …_t, dim3, dim3> calc_execution_policy(const int64_t total_elements, const uint32_t unroll_factor) {    in calc_execution_policy() argument
      59  …uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll_factor) + 1) * max_generato…    in calc_execution_policy()
      64  template<typename accscalar_t, int unroll_factor, typename dist_t, typename transform_t>
      78  int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) *    in C10_LAUNCH_BOUNDS_2()
      79  blockDim.x * gridDim.x * unroll_factor;    in C10_LAUNCH_BOUNDS_2()
      80  …index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) {    in C10_LAUNCH_BOUNDS_2()
      83  for (int ii = 0; ii < unroll_factor; ii++) {    in C10_LAUNCH_BOUNDS_2()
     119  const int unroll_factor = sizeof(dist_func_return_t) / sizeof(accscalar_t);    in distribution_nullary_kernel() local
     120  TORCH_CHECK(unroll_factor >= 1, "unroll_factor must be >= 1.");    in distribution_nullary_kernel()
     126  auto execution_policy = calc_execution_policy(numel, unroll_factor);    in distribution_nullary_kernel()
     [all …]
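The key invariant behind lines 59 and 78-80 is that every thread executes the same number of RNG draws: the element count is rounded up to a multiple of threads * unroll_factor, and the same iteration count determines how far to advance the Philox counter after the launch. A host-side sketch of that arithmetic with illustrative names; the truncated max_generato… constant is represented here by a generic draws_per_iteration parameter:

    #include <cstdint>

    struct ExecutionPolicySketch {
      uint64_t counter_offset;  // how far the generator state advances per launch
      uint32_t grid_x;
      uint32_t block_size;
    };

    // Mirrors the rounding on lines 59 and 78-79; not the real calc_execution_policy.
    ExecutionPolicySketch calc_execution_policy_sketch(int64_t numel,
                                                       uint32_t unroll_factor,
                                                       uint32_t block_size,
                                                       uint32_t grid_x,
                                                       uint64_t draws_per_iteration) {
      // Every thread runs this many grid-stride iterations over the rounded-up range.
      uint64_t iterations =
          (static_cast<uint64_t>(numel) - 1) /
              (static_cast<uint64_t>(block_size) * grid_x * unroll_factor) + 1;
      return {iterations * draws_per_iteration, grid_x, block_size};
    }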
|
D | layer_norm_kernel.cu |
     568  constexpr int unroll_factor = 8;    in GammaBetaBackwardCUDAKernel_32x32() local
     578  for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor);    in GammaBetaBackwardCUDAKernel_32x32()
     580  int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;    in GammaBetaBackwardCUDAKernel_32x32()
     582  if (laneId < unroll_factor) {    in GammaBetaBackwardCUDAKernel_32x32()
     589  for (int ii = 0; ii < unroll_factor; ++ii) {    in GammaBetaBackwardCUDAKernel_32x32()
     600  int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;    in GammaBetaBackwardCUDAKernel_32x32()
     601  for (int ii = 0; ii < unroll_factor; ii++) {    in GammaBetaBackwardCUDAKernel_32x32()
     668  constexpr int unroll_factor = 8;    in GammaBetaBackwardCUDAKernel() local
     677  for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); bcounter++){    in GammaBetaBackwardCUDAKernel()
     678  int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;    in GammaBetaBackwardCUDAKernel()
     [all …]
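These kernels walk the M rows in blocks of blockDim.y * unroll_factor rows, with each thread handling unroll_factor consecutive rows per step (lines 578-601), followed by a bounds-checked remainder pass. A simplified sketch of that indexing for a dgamma-style reduction; the real kernels also reduce dbeta, fold in mean/rstd, and reduce across threadIdx.y in shared memory rather than with atomics:

    __global__ void gamma_grad_blocked_sketch(const float* dY, const float* X,
                                              float* dgamma, int M, int N) {
      constexpr int unroll_factor = 8;           // compare lines 568 / 668
      int col = blockIdx.x * blockDim.x + threadIdx.x;
      if (col >= N) return;
      float sum = 0.f;
      int bcounter = 0;
      // Full blocks: each step covers blockDim.y * unroll_factor rows.
      for (; bcounter < static_cast<int>(M / (blockDim.y * unroll_factor)); ++bcounter) {
        int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;   // line 580
    #pragma unroll
        for (int ii = 0; ii < unroll_factor; ++ii) {
          int row = offset + ii;
          sum += dY[row * N + col] * X[row * N + col];
        }
      }
      // Remainder rows that do not fill a whole block (bounds-checked).
      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;     // line 600
      for (int ii = 0; ii < unroll_factor; ++ii) {
        int row = offset + ii;
        if (row < M) sum += dY[row * N + col] * X[row * N + col];
      }
      atomicAdd(&dgamma[col], sum);  // stand-in for the shared-memory reduction
    }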
|
D | CUDALoops.cuh |
     274  constexpr int unroll_factor = sizeof(arg0_t) >= 4 ? 2 : 4;    in gpu_kernel_impl_nocast() local
     275  launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) {    in gpu_kernel_impl_nocast()
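Line 274 derives the unroll factor from the element size: types of 4 bytes or more unroll by 2, narrower types by 4, keeping the bytes processed per thread roughly constant. A tiny self-contained restatement of that heuristic; the helper name is illustrative:

    template <typename arg0_t>
    constexpr int pick_unroll_factor() {
      return sizeof(arg0_t) >= 4 ? 2 : 4;   // same expression as line 274
    }

    static_assert(pick_unroll_factor<float>() == 2, "4-byte types unroll by 2");
    static_assert(pick_unroll_factor<signed char>() == 4, "1-byte types unroll by 4");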
|
/external/eigen/Eigen/src/Core/arch/AltiVec/ |
D | MatrixProductMMA.h |
     163  if (unroll_factor > iter) { \
     171  if (unroll_factor > iter) { \
     215  if (unroll_factor > iter) { \
     224  if (unroll_factor > iter) { \
     233  if (unroll_factor > iter) { \
     240  if (unroll_factor > iter) { \
     246  template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapp…
     278  row += unroll_factor*accCols;    in gemm_unrolled_MMA_iteration()
     382  if (unroll_factor > iter) { \
     397  if (unroll_factor > iter) { \
     [all …]
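In these Eigen macros unroll_factor is a template parameter, so each `if (unroll_factor > iter)` guard is a compile-time constant and untaken tiles generate no code; line 278 then advances the row index by the number of row tiles consumed. A scalar sketch of that guard pattern; the float math below stands in for the real packet/MMA accumulator updates and is not Eigen's API:

    // Illustrative stand-in for the unrolling macros above; not Eigen's tile math.
    template <int unroll_factor>
    void unrolled_accumulate_sketch(float* acc, const float* lhs, const float* rhs,
                                    long& row, long accCols) {
      // Each guard is resolved at compile time for a given unroll_factor.
      if (unroll_factor > 0) { acc[0] += lhs[row + 0 * accCols] * rhs[0]; }
      if (unroll_factor > 1) { acc[1] += lhs[row + 1 * accCols] * rhs[1]; }
      if (unroll_factor > 2) { acc[2] += lhs[row + 2 * accCols] * rhs[2]; }
      if (unroll_factor > 3) { acc[3] += lhs[row + 3 * accCols] * rhs[3]; }
      row += unroll_factor * accCols;   // compare line 278
    }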
|
D | MatrixProduct.h |
    1456  if (unroll_factor > iter) { \
    1464  if (unroll_factor > iter) { \
    1515  if (unroll_factor > iter) { \
    1524  if (unroll_factor > iter) { \
    1533  if (unroll_factor > iter) { \
    1540  if (unroll_factor > iter) { \
    1552  if (unroll_factor > iter) { \
    1560  template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, …
    1593  row += unroll_factor*accCols;
    1596  template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, …
    [all …]
|
/external/deqp-deps/SPIRV-Tools/source/opt/ |
D | loop_unroller.h |
      26  LoopUnroller(bool fully_unroll, int unroll_factor)    in LoopUnroller() argument
      27  : Pass(), fully_unroll_(fully_unroll), unroll_factor_(unroll_factor) {}    in LoopUnroller()
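The pass constructor takes a fully_unroll flag and a partial unroll_factor (lines 26-27). A hedged sketch of how that pair of parameters typically interacts; the class below is illustrative and is not the SPIRV-Tools pass itself:

    // Hypothetical configuration shape mirroring the constructor above.
    class LoopUnrollerSketch {
     public:
      // fully_unroll == true ignores the factor and peels the entire loop;
      // otherwise the body is duplicated unroll_factor times per iteration.
      LoopUnrollerSketch(bool fully_unroll, int unroll_factor)
          : fully_unroll_(fully_unroll), unroll_factor_(unroll_factor) {}

      int effective_factor(int trip_count) const {
        return fully_unroll_ ? trip_count : unroll_factor_;
      }

     private:
      bool fully_unroll_;
      int unroll_factor_;
    };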
|
D | loop_descriptor.h | 342 size_t unroll_factor);
|
/external/swiftshader/third_party/SPIRV-Tools/source/opt/ |
D | loop_unroller.h |
      26  LoopUnroller(bool fully_unroll, int unroll_factor)    in LoopUnroller() argument
      27  : Pass(), fully_unroll_(fully_unroll), unroll_factor_(unroll_factor) {}    in LoopUnroller()
|
D | loop_descriptor.h | 342 size_t unroll_factor);
|
/external/angle/third_party/spirv-tools/src/source/opt/ |
D | loop_unroller.h |
      26  LoopUnroller(bool fully_unroll, int unroll_factor)    in LoopUnroller() argument
      27  : Pass(), fully_unroll_(fully_unroll), unroll_factor_(unroll_factor) {}    in LoopUnroller()
|
D | loop_descriptor.h | 342 size_t unroll_factor);
|
/external/pytorch/test/ |
D | test_jit.py |
   11178  unroll_factor = 8
   11179  FileCheck().check("prim::Loop").check_count("aten::sub", unroll_factor) \
   11217  unroll_factor = 8
   11218  … FileCheck().check("prim::Loop").check("prim::Loop").check_count('aten::sub', unroll_factor) \
|