Home
last modified time | relevance | path

Searched refs:input_addr (Results 1 – 25 of 118) sorted by relevance

12345

/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/
Dcast_impl.cu25 __device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { in CastBase() argument
26 *output_addr = static_cast<T>((*input_addr)); in CastBase()
30 __device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) { in CastBase() argument
31 *output_addr = __half2ull_rz((*input_addr)); in CastBase()
34 __device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) { in CastBase() argument
35 *output_addr = __half2ll_rz((*input_addr)); in CastBase()
38 __device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) { in CastBase() argument
39 *output_addr = __half2uint_rz((*input_addr)); in CastBase()
42 __device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) { in CastBase() argument
43 *output_addr = __half2int_rz((*input_addr)); in CastBase()
[all …]
Dunsorted_segment_sum.cu22 T* input_addr, S* ids_addr, T* output_addr) { in UnsortedSegmentSum() argument
33 MsAtomicAdd(output_addr + output_index, input_addr[input_index]); in UnsortedSegmentSum()
39 T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) { in UnsortedSegmentSum() argument
42 output_dim0, output_dim1, input_addr, ids_addr, output_addr); in UnsortedSegmentSum()
47 … double* input_addr, int* ids_addr, double* output_addr, cudaStream_t stream);
49 … double* input_addr, int64_t* ids_addr, double* output_addr, cudaStream_t stream);
52 … float* input_addr, int* ids_addr, float* output_addr, cudaStream_t stream);
54 … float* input_addr, int64_t* ids_addr, float* output_addr, cudaStream_t stream);
57 … half* input_addr, int* ids_addr, half* output_addr, cudaStream_t stream);
59 … half* input_addr, int64_t* ids_addr, half* output_addr, cudaStream_t stream);
[all …]
Dconvert_gradient_impl.cu21 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientKernel() argument
29 output_addr[pointIdx] = input_addr[src_coordinate]; in ConvertGradientKernel()
35 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() argument
43 output_addr[src_coordinate] = input_addr[pointIdx]; in ConvertGradientBackKernel()
50 const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() argument
61 output_addr[src_coordinate] = input_addr[pointIdx]; in ConvertGradientBackKernel()
68 const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradient() argument
70 input_addr, output_addr); in ConvertGradient()
75 … const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradientBack() argument
77 … width, input_addr, output_addr); in ConvertGradientBack()
[all …]
Dmatrix_split_impl.cu20 … void MatrixSplitKernel(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, in MatrixSplitKernel() argument
27 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel()
33 T *input_addr, T *output_addr) { in MatrixSplitKernel() argument
41 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel()
44 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel()
55 void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, T *out… in MatrixSplit() argument
60 …<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, split_dim, dim, input_addr, output_addr); in MatrixSplit()
62 …ernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, split_dim, dim, res_dim, input_addr, in MatrixSplit()
68 … MatrixSplit<float>(const size_t size, const size_t split_dim, const size_t dim, float *input_addr,
Drelu_impl.cu22 __global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) { in CalReLUKernel() argument
24 output_addr[pos] = input_addr[pos] > static_cast<T>(0) ? input_addr[pos] : static_cast<T>(0); in CalReLUKernel()
29 void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in CalReLU() argument
30 CalReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in CalReLU()
33 template void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream);
34 template void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
35 template void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream);
36 template void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream);
37 template void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream…
38 template void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream…
[all …]
Dsoftplus_impl.cu21 __global__ void SoftplusKernel(const size_t size, const T *input_addr, T *output_addr) { in SoftplusKernel() argument
23 float x = input_addr[pos]; in SoftplusKernel()
29 __global__ void SoftplusKernel(const size_t size, const half *input_addr, half *output_addr) { in SoftplusKernel() argument
31 float x = __half2float(input_addr[pos]); in SoftplusKernel()
37 void Softplus(const size_t size, const T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Softplus() argument
38 SoftplusKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Softplus()
42 void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_strea… in Softplus() argument
43 …SoftplusKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_a… in Softplus()
74 template void Softplus(const size_t size, const float *input_addr, float *output_addr, cudaStream_t…
75 template void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t c…
Dgelu_impl.cu21 __global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) { in GeluKernel() argument
27 float x = input_addr[pos]; in GeluKernel()
34 __global__ void GeluKernel(size_t size, half *input_addr, half *output_addr) { in GeluKernel() argument
36 half x = input_addr[pos]; in GeluKernel()
43 __global__ void GeluKernel(size_t size, half2 *input_addr, half2 *output_addr) { in GeluKernel() argument
45 half2 x = input_addr[pos]; in GeluKernel()
55 void Gelu(size_t size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Gelu() argument
56 GeluKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Gelu()
60 void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream) { in Gelu() argument
63 size / 2, reinterpret_cast<half2 *>(input_addr), reinterpret_cast<half2 *>(output_addr)); in Gelu()
[all …]
Dmatrix_combine_impl.cu21 const size_t dst_width, T *input_addr, T *output_addr) { in MatrixCombineKernel() argument
28 output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx]; in MatrixCombineKernel()
34 … const size_t dst_width, const size_t res_width, const size_t batch, T *input_addr, in MatrixCombineKernel() argument
43 output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx]; in MatrixCombineKernel()
50 output_addr[dst_h * dst_width + dst_w] = input_addr[src_coordinate]; in MatrixCombineKernel()
57 … const size_t residual, const size_t res_width, const size_t batch, T *input_addr, T *output_addr, in MatrixCombine() argument
61 input_addr, output_addr); in MatrixCombine()
64 … res_width, batch, input_addr, output_addr); in MatrixCombine()
71 … const size_t batch, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/math/
Dunary_op_gpu_kernel.h90 T *input_addr = GetDeviceAddress<T>(inputs, 0); in Launch() local
95 …Exponential(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(s… in Launch()
99 …Expm1(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_… in Launch()
103 …Logarithm(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(str… in Launch()
107 …Log1p(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_… in Launch()
111 …Erf(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_pt… in Launch()
115 …Erfc(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_p… in Launch()
119 …Negative(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stre… in Launch()
123 …Reciprocal(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(st… in Launch()
127 …Square(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream… in Launch()
[all …]
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32/
Dactivation_fp32.cc54 auto input_addr = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData()); in DoActivation() local
56 MS_ASSERT(input_addr != nullptr); in DoActivation()
72 ret = Fp32Relu(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
74 ret = Fp32Relu6(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
76 ret = LRelu(input_addr + stride * task_id, count, output_addr + stride * task_id, alpha_); in DoActivation()
78 ret = Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
80 ret = Tanh(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
82 ret = Swish(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
84 ret = HSwish(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
86 ret = HSigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
[all …]
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/
Drolling_cpu_kernel.cc103 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() argument
106 if (max_value < input_addr[ids[x]]) { in MethodSwitch()
107 max_value = input_addr[ids[x]]; in MethodSwitch()
114 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() argument
117 if (min_value > input_addr[ids[x]]) { in MethodSwitch()
118 min_value = input_addr[ids[x]]; in MethodSwitch()
125 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() argument
128 sum += input_addr[ids[x]]; in MethodSwitch()
134 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() argument
137 sum += input_addr[ids[x]]; in MethodSwitch()
[all …]
Dtranspose_cpu_kernel.cc88 const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); in LaunchKernel() local
97 ParallelRun(input_addr, output_addr, output_shape, data_count); in LaunchKernel()
102 res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
104 res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
106 res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
108 res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
110 res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
112 res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
114 res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
116 res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
[all …]
Dreduce_cpu_kernel.cc102 auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); in Launch() local
107 *output_addr = input_addr[0]; in Launch()
109 reduce_func_(input_addr, i, output_addr); in Launch()
115 AccelerateLongVector(input_addr, output_addr, input_size); in Launch()
143 (void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i); in Launch()
159 auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) { in Launch()
163 output_addr[i] = input_addr[iter.GetPos()]; in Launch()
166 reduce_func_(input_addr, iter.GetPos(), &output_addr[i]); in Launch()
180 void ReduceCPUKernel<T>::AccelerateLongVector(T *input_addr, T *output_addr, size_t input_size) { in AccelerateLongVector() argument
182 *output_addr = input_addr[0]; in AccelerateLongVector()
[all …]
Dl2_normalize_cpu_kernel.cc51 void L2NormalizeCPUKernel<T>::CalcDenominator(const T *input_addr, const size_t reduce_size, const … in CalcDenominator() argument
75 …auto task = [this, &tran_base_iter, &input_addr, &denominator_addr, stride](size_t start, size_t e… in CalcDenominator()
81 denominator = input_addr[iter.GetPos()]; in CalcDenominator()
85 temp = input_addr[iter.GetPos()]; in CalcDenominator()
97 void L2NormalizeCPUKernel<T>::CalcOutput(const T *input_addr, const std::vector<size_t> reduce_shap… in CalcOutput() argument
105 T dividend = input_addr[iter.GetInputPosA()]; in CalcOutput()
132 auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); in Launch() local
144 L2NormalizeCPUKernel<T>::CalcDenominator(input_addr, reduce_size, dims, &denominator_addr); in Launch()
147 …L2NormalizeCPUKernel<T>::CalcOutput(input_addr, reduce_shape, output_size, output_addr, denominato… in Launch()
Dunsorted_segment_sum_cpu_kernel.cc56 void *input_addr = inputs[0]->addr; in Launch() local
66 ret = UnsortedSegmentSum(int, int, static_cast<const int *>(input_addr), SizeToInt(unit_num_), in Launch()
70 … ret = UnsortedSegmentSum(float, int, static_cast<const float *>(input_addr), SizeToInt(unit_num_), in Launch()
74 … ret = UnsortedSegmentSum(int, int64_t, static_cast<const int *>(input_addr), SizeToInt(unit_num_), in Launch()
78 …ret = UnsortedSegmentSum(float, int64_t, static_cast<const float *>(input_addr), SizeToInt(unit_nu… in Launch()
Dstridedslice_cpu_kernel.cc165 int StridedSliceCPUKernel::RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int star… in RunTaskOnOuter() argument
168 …const uint8_t *cur_in_ptr = input_addr + (start_pos * input_shape_[split_axis_] + begin_index) * i… in RunTaskOnOuter()
180 int StridedSliceCPUKernel::RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int … in RunTaskOnSplitAxis() argument
183 …const uint8_t *cur_in_ptr = input_addr + (start_pos * slice_param_.strides_[split_axis_] + begin_i… in RunTaskOnSplitAxis()
194 void StridedSliceCPUKernel::ParallelRun(const uint8_t *input_addr, uint8_t *output_addr, int thread… in ParallelRun() argument
208 std::bind(execute_func, this, input_addr, output_addr, thread_index * cal_num_per_thread_)); in ParallelRun()
223 auto input_addr = reinterpret_cast<uint8_t *>(inputs[0]->addr); in Launch() local
227 ParallelRun(input_addr, output_addr, thread_num); in Launch()
229 (void)DoStridedSlice(input_addr, output_addr, &slice_param_); in Launch()
Dbias_add_grad_cpu_kernel.cc42 const auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); in Launch() local
57 output_addr[c] += input_addr[offset + hw]; in Launch()
62 auto task = [this, input_addr, output_addr](size_t start, size_t end) { in Launch()
64 …ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr … in Launch()
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp16_grad/
Dactivation_fp16_grad.cc51 auto input_addr = reinterpret_cast<float16_t *>(in_tensors_.at(1)->MutableData()); in DoActivation() local
52 CHECK_NULL_RETURN(input_addr); in DoActivation()
64 error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
66 error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
68 error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, in DoActivation()
72 error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation()
74 error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
76 error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
78 error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
81 …EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_… in DoActivation()
[all …]
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32_grad/
Dactivation_grad.cc54 const auto input_addr = reinterpret_cast<float *>(in_tensors_.at(1)->MutableData()); in DoActivation() local
55 CHECK_NULL_RETURN(input_addr); in DoActivation()
67 error_code = ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
69 error_code = Relu6Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
71 …error_code = LReluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_… in DoActivation()
74 error_code = SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation()
76 error_code = TanhGrad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation()
78 error_code = HSwishGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
80 error_code = HSigmoidGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
82 …error_code = EluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_gr… in DoActivation()
[all …]
/third_party/mindspore/mindspore/ccsrc/runtime/device/gpu/distribution/
Dcollective_wrapper.cc37 ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllReduce() argument
39 …return NCCLWrapper::instance().AllReduce(input_addr, output_addr, count, data_type, reduce_type, s… in AllReduce()
42 ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllGather() argument
44 …return NCCLWrapper::instance().AllGather(input_addr, output_addr, count, data_type, stream, group); in AllGather()
47 ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t … in ReduceScatter() argument
49 …return NCCLWrapper::instance().ReduceScatter(input_addr, output_addr, count, data_type, reduce_typ… in ReduceScatter()
52 ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in Broadcast() argument
54 …return NCCLWrapper::instance().Broadcast(input_addr, output_addr, count, data_type, root, stream, … in Broadcast()
Dnccl_wrapper.cc49 ncclResult_t NCCLWrapper::AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllReduce() argument
54 return ncclAllReduce(input_addr, output_addr, count, data_type, reduce_type, group_comm, stream); in AllReduce()
57 ncclResult_t NCCLWrapper::AllGather(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllGather() argument
62 return ncclAllGather(input_addr, output_addr, count, data_type, group_comm, stream); in AllGather()
65 ncclResult_t NCCLWrapper::ReduceScatter(const void *input_addr, void *output_addr, size_t count, in ReduceScatter() argument
71 …return ncclReduceScatter(input_addr, output_addr, count, data_type, reduce_type, group_comm, strea… in ReduceScatter()
74 ncclResult_t NCCLWrapper::Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDa… in Broadcast() argument
79 return ncclBroadcast(input_addr, output_addr, count, data_type, root, group_comm, stream); in Broadcast()
Dnccl_wrapper.h38 …ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t dat…
40 …ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t dat…
42 …ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t…
44 …ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t dat…
Dcollective_wrapper.h39 extern "C" EXPORT_WRAPPER ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t …
42 extern "C" EXPORT_WRAPPER ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t …
45 extern "C" EXPORT_WRAPPER ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, siz…
48 extern "C" EXPORT_WRAPPER ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t …
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/
Dnccl_collective_gpu_kernel.h158 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchAllReduce() local
164 … (*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_, in LaunchAllReduce()
171 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchAllGather() local
178 …(*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, g… in LaunchAllGather()
184 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchReduceScatter() local
191 … (*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), in LaunchReduceScatter()
198 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchBroadcast() local
204 input_addr = GetDeviceAddress<T>(inputs, i); in LaunchBroadcast()
207 … (*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T), in LaunchBroadcast()
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/
Dcast_gpu_kernel.h42 S *input_addr = GetPossiblyNullDeviceAddress<S>(inputs, 0); in Launch() local
45 if (input_addr == nullptr && output_addr == nullptr) { in Launch()
47 } else if (input_addr != nullptr && output_addr != nullptr) { in Launch()
48 Cast(input_size_, input_addr, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr)); in Launch()

12345