/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ |
D | cumsum_impl.cu | 120 …RightMoveSum<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride… in CumSum() 121 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumSum() 122 …CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, d… in CumSum() 125 …LeftMoveSum<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride,… in CumSum() 126 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumSum() 127 …CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, st… in CumSum() 131 …CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2,… in CumSum() 134 …CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride… in CumSum()
|
D | cumprod_impl.cu | 120 …RightMoveProd<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, strid… in CumProd() 121 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumProd() 122 …CumProdKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, … in CumProd() 125 …LeftMoveProd<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride… in CumProd() 126 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumProd() 127 …CumProdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, s… in CumProd() 131 …CumProdKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2… in CumProd() 134 …CumProdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, strid… in CumProd()
|
D | bce_with_logits_loss_impl.cu | 98 FillAndBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>( in CalBCEWithLogitsLoss() 101 …FillWithoutBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, pos_weig… in CalBCEWithLogitsLoss() 104 …BCEWithLogitsLossMain<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, predict… in CalBCEWithLogitsLoss() 107 …FillAndBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, shape_size, … in CalBCEWithLogitsLoss() 110 … FillWithoutBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, weight, in CalBCEWithLogitsLoss() 113 …Mul<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, shape_broadcasted, output… in CalBCEWithLogitsLoss()
|
D | prelu_grad_impl.cu | 56 size_t thread_num = static_cast<size_t>(GET_BLOCKS(size) * GET_THREADS); in CalPReLUGrad() 58 …InitDwArrayData<<<GET_BLOCKS(dw_array_size), GET_THREADS, 0, cuda_stream>>>(dw_array_size, dw_arra… in CalPReLUGrad() 59 …CalPReLUGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, weight_size, per_chann… in CalPReLUGrad() 61 …ComputeDwData<<<GET_BLOCKS(weight_size), GET_THREADS, 0, cuda_stream>>>(weight_size, thread_num, d… in CalPReLUGrad()
|
D | random_choice_with_mask_impl.cu | 235 …InitArray<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, ceil_power2, input, mask… in CalRandomChoiceWithMask() 239 Copy<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(mask_buff, tmp_buff, ceil_power2); in CalRandomChoiceWithMask() 243 Copy<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(Tnum_buff, tmp_buff, BLOCKNUM); in CalRandomChoiceWithMask() 248 …Reshape2Index<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input_shape_size, d1… in CalRandomChoiceWithMask() 251 Sort<<<1, GET_THREADS, 0, stream>>>(ceil_power2, rank_buff); in CalRandomChoiceWithMask() 253 SrandInit<<<GET_BLOCKS(ceil_power2), GET_THREADS, 0, stream>>>(ceil_power2, globalState, seedc); in CalRandomChoiceWithMask() 254 Shuffle<<<1, GET_THREADS, 0, stream>>>(ceil_power2, globalState, rank_buff); in CalRandomChoiceWithMask() 256 …MoveToOutput<<<GET_BLOCKS(count), GET_THREADS, 0, stream>>>(input_shape_size, count, input, output… in CalRandomChoiceWithMask()
|
D | gelu_impl.cu | 56 GeluKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Gelu() 62 GeluKernel<half2><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>( in Gelu() 65 …GeluKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Gelu() 116 GeluGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr, dx_addr); in GeluGradKernel() 122 GeluGradKernel<half2><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>( in GeluGradKernel() 126 …GeluGradKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr, dx_… in GeluGradKernel()
|
D | unary_op_impl.cu | 410 ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Exponential() 415 Expm1Kernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Expm1() 420 LogarithmKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Logarithm() 425 Log1pKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Log1p() 430 ErfKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Erf() 435 ErfcKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Erfc() 440 NegativeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Negative() 445 ReciprocalKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Reciprocal() 450 SquareKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Square() 455 PowKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Pow() [all …]
|
D | square_sum_all_impl.cu | 87 InitOutput<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, ws_addr_0); in SquareSumAll() 88 InitOutput<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, ws_addr_1); in SquareSumAll() 90 …SquareSumAllKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr_0, input_ad… in SquareSumAll() 92 …AssignKernel<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, output_addr_0, output_addr_1, ws_a… in SquareSumAll()
|
D | softplus_impl.cu | 38 SoftplusKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Softplus() 43 …SoftplusKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_a… in Softplus() 66 …SoftplusGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr, dx_ad… in SoftplusGrad() 71 …SoftplusGradKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr,… in SoftplusGrad()
|
D | ctcloss_impl.cu | 282 LogBInitKernel<<<GET_BLOCKS(log_prob_size), GET_THREADS, 0, stream>>>(log_beta_b, log_prob_size); in CalculateBwdVar() 283 CalculateBwdVarKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>( in CalculateBwdVar() 293 LogBInitKernel<<<GET_BLOCKS(log_prob_size), GET_THREADS, 0, stream>>>(log_alpha_b, log_prob_size); in CalculateFwdVar() 294 CalculateFwdVarKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>( in CalculateFwdVar() 302 …InnerSoftMaxKernel<<<GET_BLOCKS(batch * max_time), GET_THREADS, 0, stream>>>(probs, softmax_probs,… in InnerSoftMax() 325 GenLabelWithBlankKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>( in GenLabelWithBlank() 331 …GenLabelValuePCRKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(label_value_sp, label_value_… in GenLabelValuePCR() 374 LabelValueInitKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(label_value_sp, size, blank); in GenLabelValue() 375 …GenLabelValueKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(label_value_sp, label_indices, l… in GenLabelValue() 377 …RecalculateLengthKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(label_value_sp, label_squen… in GenLabelValue() [all …]
|
D | unary_op_grad_impl.cu | 127 SqrtGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in SqrtGrad() 133 RsqrtGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in RsqrtGrad() 139 AsinGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AsinGrad() 145 ACosGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in ACosGrad() 151 AtanGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AtanGrad() 157 AsinhGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AsinhGrad() 163 AcoshGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AcoshGrad() 169 …ReciprocalGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, coun… in ReciprocalGrad()
|
D | nms_with_mask_impl.cu | 177 MaskInit<<<GET_BLOCKS(total_val), GET_THREADS, 0, cuda_stream>>>(total_val, row_mask); in CalPreprocess() 179 …PopulateOutput<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(input, output, index_buff, num, b… in CalPreprocess() 180 …Preprocess<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, sel_idx, sel_boxes, output, box_… in CalPreprocess() 187 int thread = std::min(ceil_p_2, GET_THREADS); in CalSort() 196 …NmsPass<<<GET_BLOCKS(row_mask_size), GET_THREADS, 0, cuda_stream>>>(num, IOU_value, output, sel_bo… in CalNms() 198 ReducePass<<<1, GET_THREADS, 0, cuda_stream>>>(num, sel_boxes, row_mask); in CalNms()
|
D | multinomial_impl.cu | 34 …CheckZeroKernel<<<GET_BLOCKS(distributions), GET_THREADS, 0, cuda_stream>>>(distributions, categor… in CheckZero() 50 CheckNonNegKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CheckNonNeg() 68 …NormInputKernel<<<GET_BLOCKS(count1), GET_THREADS, 0, cuda_stream>>>(input, distributions, categor… in NormInput() 117 …MultinomialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(RNG_seed, input, num_sample,… in Multinomial()
|
D | local_response_norm_impl.cu | 71 …ComputeScaleNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(input, depth_radius, b… in CalLocalResponseNormNHWC() 73 …LocalResponseNormNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(input, scale, bet… in CalLocalResponseNormNHWC() 83 …ComputeScaleNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(x, depth_radius, bias,… in CalLocalResponseNormGradNHWC() 85 …LocalResponseNormGradNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(dy, x, y, sca… in CalLocalResponseNormGradNHWC()
|
D | batchnorm_fold2_impl.cu | 108 BatchNormFold2Kernel<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>( in BatchNormFold2Forward() 122 …BatchNormFold2GradMul<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>(dout, x, tmp_x, num_… in BatchNormFold2GradReduce() 123 …BatchNormFold2GradReduce1<<<GET_BLOCKS(N * C), GET_THREADS, 0, cuda_stream>>>(dout, tmp, tmp_x, tm… in BatchNormFold2GradReduce() 124 …BatchNormFold2GradReduce2<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>(tmp, d_beta, tmp2, reduc… in BatchNormFold2GradReduce() 135 BatchNormFold2GradNotFreeze<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>( in CalBatchNormFold2GradNotFreeze() 149 …BatchNormFold2GradFreeze<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>(d_beta, running_mean, run… in CalBatchNormFold2GradFreeze() 164 …DxMul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(N, C, H * W, batch_std, running_… in CalBatchNormFold2GradNotFreezeDxMul()
|
D | loss_with_reduction_impl.cu | 103 PartialSum<<<GET_BLOCKS(stride), GET_THREADS, 0, stream>>>(array, stride); in Sum() 120 CopyEqual<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(tmp_loss, output, size); in Reduce() 159 …KLDivLossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, reduction, input_x… in KLDivLoss() 166 PartialSum<<<GET_BLOCKS(stride), GET_THREADS, 0, stream>>>(tmp_loss, stride); in KLDivLoss() 202 …KLDivLossGradKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, reduction, inp… in KLDivLossGrad() 240 …BinaryCrossEntropyLossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, reduc… in BinaryCrossEntropyLoss() 247 PartialSum<<<GET_BLOCKS(stride), GET_THREADS, 0, stream>>>(tmp_loss, stride); in BinaryCrossEntropyLoss() 299 …BinaryCrossEntropyLossGradKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, r… in BinaryCrossEntropyLossGrad() 326 …NLLLossKernel<<<GET_BLOCKS(n), GET_THREADS, 0, stream>>>(n, c, input, target, weight, tmp_target_w… in NLLLoss() 333 …NLLLossKernel<<<GET_BLOCKS(n), GET_THREADS, 0, stream>>>(n, c, input, target, weight, tmp_target_w… in NLLLoss() [all …]
|
D | fake_learned_scale_quant_perlayer_impl.cu | 74 …FakeLearnedScaleQuantPerLayer<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(output, size, inp… in CalFakeLearnedScaleQuantPerLayer() 82 FakeLearnedScaleQuantPerLayerGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(grad_input, in CalFakeLearnedScaleQuantPerLayerGrad() 94 …LSQNudgePerLayer<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, size, input_alpha, inpu… in CalLSQNudgePerLayer()
|
D | float_status_impl.cu | 110 FloatStatus<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalFloatStatus() 115 IsNan<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalIsNan() 120 IsInf<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalIsInf() 125 IsFinite<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalIsFinite()
|
D | correction_mul_impl.cu | 51 …CorrectionMul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(weight, gamma, running_s… in CalCorrectionMul() 61 …Mul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(N * C * H * W, d_out, weight, tmp); in CalCorrectionMulGrad() 62 Reduce<<<GET_BLOCKS(N), GET_THREADS, 0, cuda_stream>>>(N, C * H * W, tmp, running_std, d_gamma); in CalCorrectionMulGrad()
|
D | fake_learned_scale_quant_perchannel_impl.cu | 87 …FakeLearnedScaleQuantPerChannel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(output, size, i… in CalFakeLearnedScaleQuantPerChannel() 95 FakeLearnedScaleQuantPerChannelGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(grad_input, in CalFakeLearnedScaleQuantPerChannelGrad() 109 …LSQNudgePerChannel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, size, input_alpha, in… in CalLSQNudgePerChannel()
|
D | uniform_candidate_sampler_impl.cu | 29 …AssignToOutput<<<GET_BLOCKS(true_size), GET_THREADS, 0, cuda_stream>>>(true_size, prob_val, true_e… in CalUniformCandidateSampler() 30 AssignToOutput<<<GET_BLOCKS(num_sampled), GET_THREADS, 0, cuda_stream>>>(num_sampled, prob_val, in CalUniformCandidateSampler()
|
D | batchnorm_fold_impl.cu | 54 …UpdateRunningStd<<<GET_BLOCKS(channel_size), GET_THREADS, 0, cuda_stream>>>(channel_size, epsilon,… in CalUpdateRunningStd() 63 …UpdateBatchStd<<<GET_BLOCKS(channel_size), GET_THREADS, 0, cuda_stream>>>(channel_size, batch_std); in CalUpdateBatchStd() 73 CalDx<<<GET_BLOCKS(batch_size * channel_size * height * width), GET_THREADS, 0, cuda_stream>>>( in CalBatchNormFoldGrad()
|
D | l2_loss.cu | 38 ClearOutputMem<<<GET_BLOCKS(1), GET_THREADS, 0, stream>>>(output); in L2Loss() 39 L2LossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input, output); in L2Loss()
|
D | fake_quant_perchannel_impl.cu | 71 NudgeMinMaxPerChannel<<<GET_BLOCKS(channel_num), GET_THREADS, 0, cuda_stream>>>( in CalNudgePerChannel() 114 …FakeQuantPerChannel<<<GET_BLOCKS(total_size), GET_THREADS, 0, cuda_stream>>>(input, output, total_… in CalFakeQuantPerChannel() 136 …FakeQuantPerChannelGrad<<<GET_BLOCKS(channel_num), GET_THREADS, 0, cuda_stream>>>(input, gradient,… in CalFakeQuantPerChannelGrad()
|
D | pad_impl.cu | 185 …Pad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, channels, old_height, old… in CalPad() 194 …PadNHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, old_height, old_width… in CalPadNHWC() 201 …PadGeneral<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input, output, input_shape, st… in CalPadGeneral() 209 …PadGradNHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, old_height, old_widt… in CalPadGradNHWC() 217 …PadGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, channels, old_height, ol… in CalPadGrad() 230 …Pad3d<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, channels, old_depth, ol… in CalPad3d() 245 …PadGrad3d<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, channels, old_depth, o… in CalPadGrad3d() 256 …PadNDHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, old_depth, old_heigh… in CalPadNDHWC() 266 …PadGradNDHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, old_depth, old_heig… in CalPadGradNDHWC()
|