Home
last modified time | relevance | path

Searched refs:BlockReduceSum (Results 1 – 9 of 9) sorted by relevance

/external/pytorch/aten/src/ATen/native/cuda/
Dblock_reduce.cuh71 __inline__ __device__ T BlockReduceSum(T val, T* shared) { in BlockReduceSum() function
Dgroup_norm_kernel.cu132 sum1 = cuda_utils::BlockReduceSum<T_ACC>(sum1, ds_shared); in Compute1dBackwardFusedParamsCUDAKernel()
133 sum2 = cuda_utils::BlockReduceSum<T_ACC>(sum2, db_shared); in Compute1dBackwardFusedParamsCUDAKernel()
299 sum1 = cuda_utils::BlockReduceSum<T_ACC>(sum1, ds_shared); in ComputeInternalGradientsCUDAKernel()
300 sum2 = cuda_utils::BlockReduceSum<T_ACC>(sum2, db_shared); in ComputeInternalGradientsCUDAKernel()
342 sum1 = cuda_utils::BlockReduceSum<T_ACC>(sum1, ds_shared); in ComputeBackwardFusedParamsCUDAKernel()
343 sum2 = cuda_utils::BlockReduceSum<T_ACC>(sum2, db_shared); in ComputeBackwardFusedParamsCUDAKernel()
DMultiLabelMarginCriterion.cu117 accscalar_t total_sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1()
189 accscalar_t total_sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1()
DMultinomialKernel.cu64 sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1()
223 sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1()
DNLLLoss2d.cu118 auto acc_weight_ = cuda_utils::BlockReduceSum(acc_weight, acc_weight_smem); in C10_LAUNCH_BOUNDS_1()
119 auto input_sum_ = cuda_utils::BlockReduceSum(input_sum, input_sum_smem); in C10_LAUNCH_BOUNDS_1()
DForeachReduceOp.cu309 ? at::native::cuda_utils::BlockReduceSum(val, s_vals) in operator ()()
343 ? at::native::cuda_utils::BlockReduceSum<out_opmath_t>(val, vals) in lpnorm_cleanup()
DEmbedding.cu225 v = cuda_utils::BlockReduceSum(v, sdata); in renorm_kernel()
Dlayer_norm_kernel.cu352 stats_x1 = cuda_utils::BlockReduceSum(stats_x1, buf); in compute_gI()
353 stats_x2 = cuda_utils::BlockReduceSum(stats_x2, buf); in compute_gI()
459 stats_x1 = cuda_utils::BlockReduceSum(stats_x1, reduce_buf); in layer_norm_grad_input_kernel_vectorized()
460 stats_x2 = cuda_utils::BlockReduceSum(stats_x2, reduce_buf); in layer_norm_grad_input_kernel_vectorized()
DDepthwiseConv2d.cu350 acc_t tval = cuda_utils::BlockReduceSum(grad, buf); in conv_depthwise2d_grad_weight_kernel()