Searched refs:BlockReduce (Results 1 – 8 of 8) sorted by relevance
/external/pytorch/aten/src/ATen/native/cuda/ |
D | block_reduce.cuh | 124 BlockReduce(T val, const ReduceOp& op, const T& identity_element, T* shared) { in BlockReduce() function
|
D | DistanceKernel.cu | 116 agg = cuda_utils::BlockReduce(agg, DistReduceOp<scalar_t, F>{}, agg_init, agg_smem); in pdist_kernel_cuda_impl() 207 agg = cuda_utils::BlockReduce(agg, DistReduceOp<scalar_t, F>{}, agg_init, agg_smem); in cdist_kernel_cuda_impl()
|
D | TensorModeKernel.cuh | 93 return cuda_utils::BlockReduce(local, reduceOp, init, smem); in reduceBlockWithNThreadLocalReductions()
|
D | group_norm_kernel.cu | 59 val = cuda_utils::BlockReduce( in RowwiseMomentsCUDAKernel()
|
D | SoftMax.cu | 444 T result = cuda_utils::BlockReduce<T, Reduction<T>>(value, op, defaultVal, smem_cache); in blockReduceWarp()
|
D | layer_norm_kernel.cu | 77 val = cuda_utils::BlockReduce( in RowwiseMomentsCUDAKernel()
|
D | Normalization.cuh | 124 …sum = cuda_utils::BlockReduce<scalar_t, SumReduceOp<scalar_t>, cuda_utils::Block2D>(sum, reduce_op… in reduce()
|
/external/tensorflow/tensorflow/core/kernels/ |
D | reduction_gpu_kernels.cu.h | 183 typedef gpuprim::BlockReduce<value_type, num_threads> BlockReduce; 185 __shared__ typename BlockReduce::TempStorage temp_storage; 194 sum = BlockReduce(temp_storage).Reduce(sum, op, num_elements_to_reduce);
|