Searched refs:BlockReduceSum (Results 1 – 9 of 9) sorted by relevance
/external/pytorch/aten/src/ATen/native/cuda/ |
D | block_reduce.cuh | 71 __inline__ __device__ T BlockReduceSum(T val, T* shared) { in BlockReduceSum() function
|
D | group_norm_kernel.cu | 132 sum1 = cuda_utils::BlockReduceSum<T_ACC>(sum1, ds_shared); in Compute1dBackwardFusedParamsCUDAKernel() 133 sum2 = cuda_utils::BlockReduceSum<T_ACC>(sum2, db_shared); in Compute1dBackwardFusedParamsCUDAKernel() 299 sum1 = cuda_utils::BlockReduceSum<T_ACC>(sum1, ds_shared); in ComputeInternalGradientsCUDAKernel() 300 sum2 = cuda_utils::BlockReduceSum<T_ACC>(sum2, db_shared); in ComputeInternalGradientsCUDAKernel() 342 sum1 = cuda_utils::BlockReduceSum<T_ACC>(sum1, ds_shared); in ComputeBackwardFusedParamsCUDAKernel() 343 sum2 = cuda_utils::BlockReduceSum<T_ACC>(sum2, db_shared); in ComputeBackwardFusedParamsCUDAKernel()
|
D | MultiLabelMarginCriterion.cu | 117 accscalar_t total_sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1() 189 accscalar_t total_sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1()
|
D | MultinomialKernel.cu | 64 sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1() 223 sum = cuda_utils::BlockReduceSum(sum, smem); in C10_LAUNCH_BOUNDS_1()
|
D | NLLLoss2d.cu | 118 auto acc_weight_ = cuda_utils::BlockReduceSum(acc_weight, acc_weight_smem); in C10_LAUNCH_BOUNDS_1() 119 auto input_sum_ = cuda_utils::BlockReduceSum(input_sum, input_sum_smem); in C10_LAUNCH_BOUNDS_1()
|
D | ForeachReduceOp.cu | 309 ? at::native::cuda_utils::BlockReduceSum(val, s_vals) in operator ()() 343 ? at::native::cuda_utils::BlockReduceSum<out_opmath_t>(val, vals) in lpnorm_cleanup()
|
D | Embedding.cu | 225 v = cuda_utils::BlockReduceSum(v, sdata); in renorm_kernel()
|
D | layer_norm_kernel.cu | 352 stats_x1 = cuda_utils::BlockReduceSum(stats_x1, buf); in compute_gI() 353 stats_x2 = cuda_utils::BlockReduceSum(stats_x2, buf); in compute_gI() 459 stats_x1 = cuda_utils::BlockReduceSum(stats_x1, reduce_buf); in layer_norm_grad_input_kernel_vectorized() 460 stats_x2 = cuda_utils::BlockReduceSum(stats_x2, reduce_buf); in layer_norm_grad_input_kernel_vectorized()
|
D | DepthwiseConv2d.cu | 350 acc_t tval = cuda_utils::BlockReduceSum(grad, buf); in conv_depthwise2d_grad_weight_kernel()
|