Home
last modified time | relevance | path

Searched refs:GET_THREADS (Results 1 – 25 of 127) sorted by relevance

123456

/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/
Dcumsum_impl.cu120 …RightMoveSum<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride… in CumSum()
121 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumSum()
122 …CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, d… in CumSum()
125 …LeftMoveSum<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride,… in CumSum()
126 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumSum()
127 …CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, st… in CumSum()
131 …CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2,… in CumSum()
134 …CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride… in CumSum()
Dcumprod_impl.cu120 …RightMoveProd<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, strid… in CumProd()
121 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumProd()
122 …CumProdKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, … in CumProd()
125 …LeftMoveProd<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride… in CumProd()
126 Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); in CumProd()
127 …CumProdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, s… in CumProd()
131 …CumProdKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2… in CumProd()
134 …CumProdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, strid… in CumProd()
Dbce_with_logits_loss_impl.cu98 FillAndBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>( in CalBCEWithLogitsLoss()
101 …FillWithoutBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, pos_weig… in CalBCEWithLogitsLoss()
104 …BCEWithLogitsLossMain<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, predict… in CalBCEWithLogitsLoss()
107 …FillAndBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, shape_size, … in CalBCEWithLogitsLoss()
110 … FillWithoutBroadcast<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, weight, in CalBCEWithLogitsLoss()
113 …Mul<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, shape_broadcasted, output… in CalBCEWithLogitsLoss()
Dprelu_grad_impl.cu56 size_t thread_num = static_cast<size_t>(GET_BLOCKS(size) * GET_THREADS); in CalPReLUGrad()
58 …InitDwArrayData<<<GET_BLOCKS(dw_array_size), GET_THREADS, 0, cuda_stream>>>(dw_array_size, dw_arra… in CalPReLUGrad()
59 …CalPReLUGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, weight_size, per_chann… in CalPReLUGrad()
61 …ComputeDwData<<<GET_BLOCKS(weight_size), GET_THREADS, 0, cuda_stream>>>(weight_size, thread_num, d… in CalPReLUGrad()
Drandom_choice_with_mask_impl.cu235 …InitArray<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, ceil_power2, input, mask… in CalRandomChoiceWithMask()
239 Copy<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(mask_buff, tmp_buff, ceil_power2); in CalRandomChoiceWithMask()
243 Copy<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(Tnum_buff, tmp_buff, BLOCKNUM); in CalRandomChoiceWithMask()
248 …Reshape2Index<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input_shape_size, d1… in CalRandomChoiceWithMask()
251 Sort<<<1, GET_THREADS, 0, stream>>>(ceil_power2, rank_buff); in CalRandomChoiceWithMask()
253 SrandInit<<<GET_BLOCKS(ceil_power2), GET_THREADS, 0, stream>>>(ceil_power2, globalState, seedc); in CalRandomChoiceWithMask()
254 Shuffle<<<1, GET_THREADS, 0, stream>>>(ceil_power2, globalState, rank_buff); in CalRandomChoiceWithMask()
256 …MoveToOutput<<<GET_BLOCKS(count), GET_THREADS, 0, stream>>>(input_shape_size, count, input, output… in CalRandomChoiceWithMask()
Dgelu_impl.cu56 GeluKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Gelu()
62 GeluKernel<half2><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>( in Gelu()
65 …GeluKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Gelu()
116 GeluGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr, dx_addr); in GeluGradKernel()
122 GeluGradKernel<half2><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>( in GeluGradKernel()
126 …GeluGradKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr, dx_… in GeluGradKernel()
Dunary_op_impl.cu410 ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Exponential()
415 Expm1Kernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Expm1()
420 LogarithmKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Logarithm()
425 Log1pKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Log1p()
430 ErfKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Erf()
435 ErfcKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Erfc()
440 NegativeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Negative()
445 ReciprocalKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Reciprocal()
450 SquareKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Square()
455 PowKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); in Pow()
[all …]
Dsquare_sum_all_impl.cu87 InitOutput<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, ws_addr_0); in SquareSumAll()
88 InitOutput<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, ws_addr_1); in SquareSumAll()
90 …SquareSumAllKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr_0, input_ad… in SquareSumAll()
92 …AssignKernel<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, output_addr_0, output_addr_1, ws_a… in SquareSumAll()
Dsoftplus_impl.cu38 SoftplusKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Softplus()
43 …SoftplusKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_a… in Softplus()
66 …SoftplusGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr, dx_ad… in SoftplusGrad()
71 …SoftplusGradKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy_addr, x_addr,… in SoftplusGrad()
Dctcloss_impl.cu282 LogBInitKernel<<<GET_BLOCKS(log_prob_size), GET_THREADS, 0, stream>>>(log_beta_b, log_prob_size); in CalculateBwdVar()
283 CalculateBwdVarKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>( in CalculateBwdVar()
293 LogBInitKernel<<<GET_BLOCKS(log_prob_size), GET_THREADS, 0, stream>>>(log_alpha_b, log_prob_size); in CalculateFwdVar()
294 CalculateFwdVarKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>( in CalculateFwdVar()
302 …InnerSoftMaxKernel<<<GET_BLOCKS(batch * max_time), GET_THREADS, 0, stream>>>(probs, softmax_probs,… in InnerSoftMax()
325 GenLabelWithBlankKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>( in GenLabelWithBlank()
331 …GenLabelValuePCRKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(label_value_sp, label_value_… in GenLabelValuePCR()
374 LabelValueInitKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(label_value_sp, size, blank); in GenLabelValue()
375 …GenLabelValueKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(label_value_sp, label_indices, l… in GenLabelValue()
377 …RecalculateLengthKernel<<<GET_BLOCKS(batch), GET_THREADS, 0, stream>>>(label_value_sp, label_squen… in GenLabelValue()
[all …]
Dunary_op_grad_impl.cu127 SqrtGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in SqrtGrad()
133 RsqrtGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in RsqrtGrad()
139 AsinGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AsinGrad()
145 ACosGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in ACosGrad()
151 AtanGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AtanGrad()
157 AsinhGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AsinhGrad()
163 AcoshGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); in AcoshGrad()
169 …ReciprocalGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, coun… in ReciprocalGrad()
Dnms_with_mask_impl.cu177 MaskInit<<<GET_BLOCKS(total_val), GET_THREADS, 0, cuda_stream>>>(total_val, row_mask); in CalPreprocess()
179 …PopulateOutput<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(input, output, index_buff, num, b… in CalPreprocess()
180 …Preprocess<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, sel_idx, sel_boxes, output, box_… in CalPreprocess()
187 int thread = std::min(ceil_p_2, GET_THREADS); in CalSort()
196 …NmsPass<<<GET_BLOCKS(row_mask_size), GET_THREADS, 0, cuda_stream>>>(num, IOU_value, output, sel_bo… in CalNms()
198 ReducePass<<<1, GET_THREADS, 0, cuda_stream>>>(num, sel_boxes, row_mask); in CalNms()
Dmultinomial_impl.cu34 …CheckZeroKernel<<<GET_BLOCKS(distributions), GET_THREADS, 0, cuda_stream>>>(distributions, categor… in CheckZero()
50 CheckNonNegKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CheckNonNeg()
68 …NormInputKernel<<<GET_BLOCKS(count1), GET_THREADS, 0, cuda_stream>>>(input, distributions, categor… in NormInput()
117 …MultinomialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(RNG_seed, input, num_sample,… in Multinomial()
Dlocal_response_norm_impl.cu71 …ComputeScaleNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(input, depth_radius, b… in CalLocalResponseNormNHWC()
73 …LocalResponseNormNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(input, scale, bet… in CalLocalResponseNormNHWC()
83 …ComputeScaleNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(x, depth_radius, bias,… in CalLocalResponseNormGradNHWC()
85 …LocalResponseNormGradNHWC<<<GET_BLOCKS(num_elements), GET_THREADS, 0, cuda_stream>>>(dy, x, y, sca… in CalLocalResponseNormGradNHWC()
Dbatchnorm_fold2_impl.cu108 BatchNormFold2Kernel<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>( in BatchNormFold2Forward()
122 …BatchNormFold2GradMul<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>(dout, x, tmp_x, num_… in BatchNormFold2GradReduce()
123 …BatchNormFold2GradReduce1<<<GET_BLOCKS(N * C), GET_THREADS, 0, cuda_stream>>>(dout, tmp, tmp_x, tm… in BatchNormFold2GradReduce()
124 …BatchNormFold2GradReduce2<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>(tmp, d_beta, tmp2, reduc… in BatchNormFold2GradReduce()
135 BatchNormFold2GradNotFreeze<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>( in CalBatchNormFold2GradNotFreeze()
149 …BatchNormFold2GradFreeze<<<GET_BLOCKS(C), GET_THREADS, 0, cuda_stream>>>(d_beta, running_mean, run… in CalBatchNormFold2GradFreeze()
164 …DxMul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(N, C, H * W, batch_std, running_… in CalBatchNormFold2GradNotFreezeDxMul()
Dloss_with_reduction_impl.cu103 PartialSum<<<GET_BLOCKS(stride), GET_THREADS, 0, stream>>>(array, stride); in Sum()
120 CopyEqual<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(tmp_loss, output, size); in Reduce()
159 …KLDivLossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, reduction, input_x… in KLDivLoss()
166 PartialSum<<<GET_BLOCKS(stride), GET_THREADS, 0, stream>>>(tmp_loss, stride); in KLDivLoss()
202 …KLDivLossGradKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, reduction, inp… in KLDivLossGrad()
240 …BinaryCrossEntropyLossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, reduc… in BinaryCrossEntropyLoss()
247 PartialSum<<<GET_BLOCKS(stride), GET_THREADS, 0, stream>>>(tmp_loss, stride); in BinaryCrossEntropyLoss()
299 …BinaryCrossEntropyLossGradKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, r… in BinaryCrossEntropyLossGrad()
326 …NLLLossKernel<<<GET_BLOCKS(n), GET_THREADS, 0, stream>>>(n, c, input, target, weight, tmp_target_w… in NLLLoss()
333 …NLLLossKernel<<<GET_BLOCKS(n), GET_THREADS, 0, stream>>>(n, c, input, target, weight, tmp_target_w… in NLLLoss()
[all …]
Dfake_learned_scale_quant_perlayer_impl.cu74 …FakeLearnedScaleQuantPerLayer<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(output, size, inp… in CalFakeLearnedScaleQuantPerLayer()
82 FakeLearnedScaleQuantPerLayerGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(grad_input, in CalFakeLearnedScaleQuantPerLayerGrad()
94 …LSQNudgePerLayer<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, size, input_alpha, inpu… in CalLSQNudgePerLayer()
Dfloat_status_impl.cu110 FloatStatus<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalFloatStatus()
115 IsNan<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalIsNan()
120 IsInf<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalIsInf()
125 IsFinite<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, output); in CalIsFinite()
Dcorrection_mul_impl.cu51 …CorrectionMul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(weight, gamma, running_s… in CalCorrectionMul()
61 …Mul<<<GET_BLOCKS(N * C * H * W), GET_THREADS, 0, cuda_stream>>>(N * C * H * W, d_out, weight, tmp); in CalCorrectionMulGrad()
62 Reduce<<<GET_BLOCKS(N), GET_THREADS, 0, cuda_stream>>>(N, C * H * W, tmp, running_std, d_gamma); in CalCorrectionMulGrad()
Dfake_learned_scale_quant_perchannel_impl.cu87 …FakeLearnedScaleQuantPerChannel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(output, size, i… in CalFakeLearnedScaleQuantPerChannel()
95 FakeLearnedScaleQuantPerChannelGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(grad_input, in CalFakeLearnedScaleQuantPerChannelGrad()
109 …LSQNudgePerChannel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, size, input_alpha, in… in CalLSQNudgePerChannel()
Duniform_candidate_sampler_impl.cu29 …AssignToOutput<<<GET_BLOCKS(true_size), GET_THREADS, 0, cuda_stream>>>(true_size, prob_val, true_e… in CalUniformCandidateSampler()
30 AssignToOutput<<<GET_BLOCKS(num_sampled), GET_THREADS, 0, cuda_stream>>>(num_sampled, prob_val, in CalUniformCandidateSampler()
Dbatchnorm_fold_impl.cu54 …UpdateRunningStd<<<GET_BLOCKS(channel_size), GET_THREADS, 0, cuda_stream>>>(channel_size, epsilon,… in CalUpdateRunningStd()
63 …UpdateBatchStd<<<GET_BLOCKS(channel_size), GET_THREADS, 0, cuda_stream>>>(channel_size, batch_std); in CalUpdateBatchStd()
73 CalDx<<<GET_BLOCKS(batch_size * channel_size * height * width), GET_THREADS, 0, cuda_stream>>>( in CalBatchNormFoldGrad()
Dl2_loss.cu38 ClearOutputMem<<<GET_BLOCKS(1), GET_THREADS, 0, stream>>>(output); in L2Loss()
39 L2LossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input, output); in L2Loss()
Dfake_quant_perchannel_impl.cu71 NudgeMinMaxPerChannel<<<GET_BLOCKS(channel_num), GET_THREADS, 0, cuda_stream>>>( in CalNudgePerChannel()
114 …FakeQuantPerChannel<<<GET_BLOCKS(total_size), GET_THREADS, 0, cuda_stream>>>(input, output, total_… in CalFakeQuantPerChannel()
136 …FakeQuantPerChannelGrad<<<GET_BLOCKS(channel_num), GET_THREADS, 0, cuda_stream>>>(input, gradient,… in CalFakeQuantPerChannelGrad()
Dpad_impl.cu185 …Pad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, channels, old_height, old… in CalPad()
194 …PadNHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, old_height, old_width… in CalPadNHWC()
201 …PadGeneral<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input, output, input_shape, st… in CalPadGeneral()
209 …PadGradNHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, old_height, old_widt… in CalPadGradNHWC()
217 …PadGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, channels, old_height, ol… in CalPadGrad()
230 …Pad3d<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, channels, old_depth, ol… in CalPad3d()
245 …PadGrad3d<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, channels, old_depth, o… in CalPadGrad3d()
256 …PadNDHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input, num, old_depth, old_heigh… in CalPadNDHWC()
266 …PadGradNDHWC<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, num, old_depth, old_heig… in CalPadGradNDHWC()

123456