/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/

D | cast_impl.cu

  template <typename S, typename T>
  __device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) {
    *output_addr = static_cast<T>((*input_addr));
  }

  __device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) {
    *output_addr = __half2ull_rz((*input_addr));
  }

  __device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) {
    *output_addr = __half2ll_rz((*input_addr));
  }

  __device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) {
    *output_addr = __half2uint_rz((*input_addr));
  }

  __device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) {
    *output_addr = __half2int_rz((*input_addr));
  }
  [all …]

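The `_rz` suffix on these conversion intrinsics selects round-toward-zero (truncation), which matches the `static_cast` semantics of the generic path. A minimal standalone sketch, not MindSpore code (the kernel and buffer names are made up), that exercises the same intrinsic:

  #include <cstdio>
  #include <cuda_fp16.h>
  #include <cuda_runtime.h>

  __global__ void HalfToIntRz(const half *in, int32_t *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
      out[i] = __half2int_rz(in[i]);  // -1.7 -> -1, 1.7 -> 1: truncation, not rounding
    }
  }

  int main() {
    const int n = 2;
    half h_in[n] = {__float2half(-1.7f), __float2half(1.7f)};
    half *d_in; int32_t *d_out; int32_t h_out[n];
    cudaMalloc(&d_in, n * sizeof(half));
    cudaMalloc(&d_out, n * sizeof(int32_t));
    cudaMemcpy(d_in, h_in, n * sizeof(half), cudaMemcpyHostToDevice);
    HalfToIntRz<<<1, 32>>>(d_in, d_out, n);
    cudaMemcpy(h_out, d_out, n * sizeof(int32_t), cudaMemcpyDeviceToHost);
    printf("%d %d\n", h_out[0], h_out[1]);  // expected: -1 1
    cudaFree(d_in); cudaFree(d_out);
    return 0;
  }
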
D | unsorted_segment_sum.cu

  template <typename T, typename S>
  __global__ void UnsortedSegmentSum(…, T* input_addr, S* ids_addr, T* output_addr) {
    …
    MsAtomicAdd(output_addr + output_index, input_addr[input_index]);
    …
  }

  template <typename T, typename S>
  void UnsortedSegmentSum(…, T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) {
    UnsortedSegmentSum<<<…>>>(
        … output_dim0, output_dim1, input_addr, ids_addr, output_addr);
  }

  template void UnsortedSegmentSum(…, double* input_addr, int* ids_addr, double* output_addr, cudaStream_t stream);
  template void UnsortedSegmentSum(…, double* input_addr, int64_t* ids_addr, double* output_addr, cudaStream_t stream);
  template void UnsortedSegmentSum(…, float* input_addr, int* ids_addr, float* output_addr, cudaStream_t stream);
  template void UnsortedSegmentSum(…, float* input_addr, int64_t* ids_addr, float* output_addr, cudaStream_t stream);
  template void UnsortedSegmentSum(…, half* input_addr, int* ids_addr, half* output_addr, cudaStream_t stream);
  template void UnsortedSegmentSum(…, half* input_addr, int64_t* ids_addr, half* output_addr, cudaStream_t stream);
  [all …]

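The kernel scatter-adds each input slice into the output slice selected by its segment id; MsAtomicAdd is required because several inputs may target the same segment concurrently. A CPU reference of the same semantics (an illustrative sketch assuming row-major layout, not the MindSpore API):

  #include <cstddef>
  #include <vector>

  // Reference unsorted segment sum: out[ids[row]] += in[row], rowwise.
  // Rows with an out-of-range id are skipped, mirroring the bounds check a
  // GPU kernel must perform before MsAtomicAdd.
  std::vector<float> UnsortedSegmentSumRef(const std::vector<float> &in,
                                           const std::vector<int> &ids,
                                           size_t num_segments, size_t inner) {
    std::vector<float> out(num_segments * inner, 0.0f);
    for (size_t row = 0; row < ids.size(); ++row) {
      int seg = ids[row];
      if (seg < 0 || static_cast<size_t>(seg) >= num_segments) continue;
      for (size_t j = 0; j < inner; ++j) {
        out[seg * inner + j] += in[row * inner + j];
      }
    }
    return out;
  }
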
D | convert_gradient_impl.cu

  __global__ void ConvertGradientKernel(… const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) {
    …
    output_addr[pointIdx] = input_addr[src_coordinate];
  }

  __global__ void ConvertGradientBackKernel(… const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) {
    …
    output_addr[src_coordinate] = input_addr[pointIdx];
  }

  __global__ void ConvertGradientBackKernel(… const size_t width, T *input_addr, T *output_addr) {
    …
    output_addr[src_coordinate] = input_addr[pointIdx];
  }

  void ConvertGradient(… const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
    ConvertGradientKernel<<<…>>>(… input_addr, output_addr);
  }

  void ConvertGradientBack(… const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
    ConvertGradientBackKernel<<<…>>>(… width, input_addr, output_addr);
  }
  [all …]

D | matrix_split_impl.cu

  __global__ void MatrixSplitKernel(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, …
    …
    output_addr[pointIdx] = input_addr[src_coordinate];
  }

  __global__ void MatrixSplitKernel(… T *input_addr, T *output_addr) {
    …
    output_addr[pointIdx] = input_addr[src_coordinate];
    …
    output_addr[pointIdx] = input_addr[src_coordinate];
  }

  void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, T *out…
    …
    MatrixSplitKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, split_dim, dim, input_addr, output_addr);
    …
    MatrixSplitKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, split_dim, dim, res_dim, input_addr, …
  }

  template … MatrixSplit<float>(const size_t size, const size_t split_dim, const size_t dim, float *input_addr, …

D | relu_impl.cu

  template <typename T>
  __global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) {
    …
    output_addr[pos] = input_addr[pos] > static_cast<T>(0) ? input_addr[pos] : static_cast<T>(0);
    …
  }

  template <typename T>
  void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
    CalReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr);
  }

  template void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream);
  template void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
  template void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream);
  template void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream);
  template void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream);
  template void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream);
  [all …]

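A host-side usage sketch for the launcher (illustrative: it assumes the buffers already live on the device and that GET_BLOCKS/GET_THREADS inside CalReLU choose a valid grid):

  #include <cuda_runtime.h>

  // Declaration from relu_impl.cu above (instantiated for float there).
  template <typename T>
  void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream);

  // Hypothetical caller: apply ReLU to n floats resident on the GPU.
  void ApplyRelu(float *d_in, float *d_out, int n, cudaStream_t stream) {
    CalReLU(n, d_in, d_out, stream);  // asynchronous kernel launch
    cudaStreamSynchronize(stream);    // only needed if the host reads d_out next
  }
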
D | softplus_impl.cu

  template <typename T>
  __global__ void SoftplusKernel(const size_t size, const T *input_addr, T *output_addr) {
    …
    float x = input_addr[pos];
    …
  }

  __global__ void SoftplusKernel(const size_t size, const half *input_addr, half *output_addr) {
    …
    float x = __half2float(input_addr[pos]);
    …
  }

  template <typename T>
  void Softplus(const size_t size, const T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
    SoftplusKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr);
  }

  void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_stream) {
    SoftplusKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr);
  }

  template void Softplus(const size_t size, const float *input_addr, float *output_addr, cudaStream_t cuda_stream);
  template void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_stream);

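Note that the half kernel widens to float before the transcendental math. Softplus itself is log(1 + e^x), which overflows if computed naively for large x; a common numerically stable formulation (a sketch of the math, not necessarily the exact expression in this file):

  #include <algorithm>
  #include <cmath>

  // Stable softplus: log1p(exp(x)) overflows for large x, but
  // softplus(x) = max(x, 0) + log1p(exp(-|x|)) is safe over the whole range.
  float StableSoftplus(float x) {
    return std::max(x, 0.0f) + std::log1p(std::exp(-std::fabs(x)));
  }
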
D | gelu_impl.cu

  template <typename T>
  __global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) {
    …
    float x = input_addr[pos];
    …
  }

  __global__ void GeluKernel(size_t size, half *input_addr, half *output_addr) {
    …
    half x = input_addr[pos];
    …
  }

  __global__ void GeluKernel(size_t size, half2 *input_addr, half2 *output_addr) {
    …
    half2 x = input_addr[pos];
    …
  }

  template <typename T>
  void Gelu(size_t size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
    GeluKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr);
  }

  void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream) {
    …
    GeluKernel<<<…>>>(
        size / 2, reinterpret_cast<half2 *>(input_addr), reinterpret_cast<half2 *>(output_addr));
    …
  }
  [all …]

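The half2 overload lets each thread process two fp16 values per load, which is why the half launcher halves size and reinterprets the pointers; dividing by 2 assumes an even element count, and the elided code presumably falls back to the scalar half kernel otherwise. GELU kernels like these usually compute the tanh approximation; a scalar sketch of that formula (the exact constants in this file are elided from the match):

  #include <cmath>

  // tanh approximation of GELU:
  //   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
  float GeluApprox(float x) {
    const float kAlpha = 0.7978845608028654f;  // sqrt(2/pi)
    return 0.5f * x * (1.0f + std::tanh(kAlpha * (x + 0.044715f * x * x * x)));
  }
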
D | matrix_combine_impl.cu

  __global__ void MatrixCombineKernel(… const size_t dst_width, T *input_addr, T *output_addr) {
    …
    output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx];
  }

  __global__ void MatrixCombineKernel(… const size_t dst_width, const size_t res_width, const size_t batch, T *input_addr, …
    …
    output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx];
    …
    output_addr[dst_h * dst_width + dst_w] = input_addr[src_coordinate];
  }

  void MatrixCombine(… const size_t residual, const size_t res_width, const size_t batch, T *input_addr, T *output_addr, …
    MatrixCombineKernel<<<…>>>(… input_addr, output_addr);
    …
    MatrixCombineKernel<<<…>>>(… res_width, batch, input_addr, output_addr);
  }

  template void MatrixCombine(… const size_t batch, float *input_addr, float *output_addr, cudaStream_t cuda_stream);

/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/math/

D | unary_op_gpu_kernel.h

  T *input_addr = GetDeviceAddress<T>(inputs, 0);  // in Launch()
  …
  Exponential(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Expm1(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Logarithm(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Log1p(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Erf(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Erfc(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Negative(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Reciprocal(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  Square(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
  [all …]

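These are alternative branches of one dispatch, not sequential calls: every branch shares the call shape Op(input_addr, output_addr, element_count, stream) with element_count = inputs[0]->size / sizeof(T), and only the elementwise functor differs, presumably selected by the kernel's operation-type field. As a worked example, a float tensor of 1024 bytes makes each of these calls process 256 elements.
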
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32/

D | activation_fp32.cc

  auto input_addr = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData());  // in DoActivation()
  …
  MS_ASSERT(input_addr != nullptr);
  …
  ret = Fp32Relu(input_addr + stride * task_id, count, output_addr + stride * task_id);
  ret = Fp32Relu6(input_addr + stride * task_id, count, output_addr + stride * task_id);
  ret = LRelu(input_addr + stride * task_id, count, output_addr + stride * task_id, alpha_);
  ret = Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id);
  ret = Tanh(input_addr + stride * task_id, count, output_addr + stride * task_id);
  ret = Swish(input_addr + stride * task_id, count, output_addr + stride * task_id);
  ret = HSwish(input_addr + stride * task_id, count, output_addr + stride * task_id);
  ret = HSigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id);
  [all …]

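Each parallel task offsets into both tensors by stride * task_id. The stride/count computation is elided from the match, but MindSpore Lite's usual chunking is stride = UP_DIV(length, thread_count) and count = MSMIN(stride, length - stride * task_id) (an assumption based on the common Lite pattern, not confirmed by the lines above); e.g. a length of 10 split across 4 threads yields chunks of 3, 3, 3 and 1.
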
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/

D | rolling_cpu_kernel.cc

  reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) {  // in MethodSwitch()
    …
    if (max_value < input_addr[ids[x]]) {
      max_value = input_addr[ids[x]];
    }
    …
  };

  reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) {
    …
    if (min_value > input_addr[ids[x]]) {
      min_value = input_addr[ids[x]];
    }
    …
  };

  reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) {
    …
    sum += input_addr[ids[x]];
    …
  };

  reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) {
    …
    sum += input_addr[ids[x]];
    …
  };
  [all …]

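MethodSwitch installs one reduction lambda per rolling method into reduceMethod_ (max, min, and two sum-based variants are visible above); each lambda scans the window's precomputed flat offsets in ids. A self-contained sketch of the same dispatch pattern (float-only and simplified; the real kernel is templated):

  #include <cstddef>
  #include <functional>
  #include <limits>

  // reduceMethod_ is chosen once per rolling method, then applied to every
  // window [start, end); `ids` holds the window's flat element offsets.
  using ReduceFn = std::function<float(const float *, const size_t *, size_t, size_t)>;

  ReduceFn MakeMaxReducer() {
    return [](const float *input_addr, const size_t *ids, size_t start, size_t end) {
      float max_value = std::numeric_limits<float>::lowest();
      for (size_t x = start; x < end; ++x) {
        if (max_value < input_addr[ids[x]]) max_value = input_addr[ids[x]];
      }
      return max_value;
    };
  }
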
D | transpose_cpu_kernel.cc

  const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);  // in LaunchKernel()
  …
  ParallelRun(input_addr, output_addr, output_shape, data_count);
  …
  res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_);
  res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_);
  res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_);
  res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_);
  res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_);
  res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_);
  res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_);
  res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_);
  [all …]

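The dtype-specific DoTranspose* branches all implement the same index mapping: output axis i takes its coordinate from input axis perm[i]. A CPU reference of that mapping (illustrative, not the NNACL API; float-only for brevity):

  #include <cstddef>
  #include <vector>

  // Reference N-D transpose: out axis i has size in_shape[perm[i]]; each output
  // element gathers from the input offset built with the permuted strides.
  void TransposeRef(const float *in, float *out,
                    const std::vector<size_t> &in_shape,
                    const std::vector<int> &perm) {
    const size_t ndim = in_shape.size();
    std::vector<size_t> out_shape(ndim), in_stride(ndim, 1), out_stride(ndim, 1);
    for (size_t i = 0; i < ndim; ++i) out_shape[i] = in_shape[perm[i]];
    for (size_t i = ndim - 1; i > 0; --i) in_stride[i - 1] = in_stride[i] * in_shape[i];
    for (size_t i = ndim - 1; i > 0; --i) out_stride[i - 1] = out_stride[i] * out_shape[i];
    size_t total = 1;
    for (size_t d : in_shape) total *= d;
    for (size_t pos = 0; pos < total; ++pos) {
      size_t rem = pos, src = 0;
      for (size_t i = 0; i < ndim; ++i) {   // decompose the output index axis by axis
        size_t idx = rem / out_stride[i];
        rem %= out_stride[i];
        src += idx * in_stride[perm[i]];    // gather from the permuted input axis
      }
      out[pos] = in[src];
    }
  }
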
D | reduce_cpu_kernel.cc

  auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);  // in Launch()
  …
  *output_addr = input_addr[0];
  …
  reduce_func_(input_addr, i, output_addr);
  …
  AccelerateLongVector(input_addr, output_addr, input_size);
  …
  (void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
  …
  auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
    …
    output_addr[i] = input_addr[iter.GetPos()];
    …
    reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
    …
  };

  void ReduceCPUKernel<T>::AccelerateLongVector(T *input_addr, T *output_addr, size_t input_size) {
    …
    *output_addr = input_addr[0];
    …
  }
  [all …]

D | l2_normalize_cpu_kernel.cc

  void L2NormalizeCPUKernel<T>::CalcDenominator(const T *input_addr, const size_t reduce_size, const …
    …
    auto task = [this, &tran_base_iter, &input_addr, &denominator_addr, stride](size_t start, size_t e…
      …
      denominator = input_addr[iter.GetPos()];
      …
      temp = input_addr[iter.GetPos()];
      …
    };
    …
  }

  void L2NormalizeCPUKernel<T>::CalcOutput(const T *input_addr, const std::vector<size_t> reduce_shap…
    …
    T dividend = input_addr[iter.GetInputPosA()];
    …
  }

  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);  // in Launch()
  …
  L2NormalizeCPUKernel<T>::CalcDenominator(input_addr, reduce_size, dims, &denominator_addr);
  …
  L2NormalizeCPUKernel<T>::CalcOutput(input_addr, reduce_shape, output_size, output_addr, denominator_addr);

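Together the two passes compute output = x / max(sqrt(Σ xᵢ²), ε) along the reduction axis: CalcDenominator produces the per-slice root sum of squares (with a small epsilon presumably guarding all-zero slices) and CalcOutput performs the division. For example, normalizing [3, 4] along its only axis gives a denominator of 5 and output [0.6, 0.8].
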
D | unsorted_segment_sum_cpu_kernel.cc

  void *input_addr = inputs[0]->addr;  // in Launch()
  …
  ret = UnsortedSegmentSum(int, int, static_cast<const int *>(input_addr), SizeToInt(unit_num_), …
  ret = UnsortedSegmentSum(float, int, static_cast<const float *>(input_addr), SizeToInt(unit_num_), …
  ret = UnsortedSegmentSum(int, int64_t, static_cast<const int *>(input_addr), SizeToInt(unit_num_), …
  ret = UnsortedSegmentSum(float, int64_t, static_cast<const float *>(input_addr), SizeToInt(unit_nu…

D | stridedslice_cpu_kernel.cc

  int StridedSliceCPUKernel::RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int star…
    const uint8_t *cur_in_ptr = input_addr + (start_pos * input_shape_[split_axis_] + begin_index) * i…
    …
  }

  int StridedSliceCPUKernel::RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int …
    const uint8_t *cur_in_ptr = input_addr + (start_pos * slice_param_.strides_[split_axis_] + begin_i…
    …
  }

  void StridedSliceCPUKernel::ParallelRun(const uint8_t *input_addr, uint8_t *output_addr, int thread…
    …
    std::bind(execute_func, this, input_addr, output_addr, thread_index * cal_num_per_thread_));
    …
  }

  auto input_addr = reinterpret_cast<uint8_t *>(inputs[0]->addr);  // in Launch()
  …
  ParallelRun(input_addr, output_addr, thread_num);
  …
  (void)DoStridedSlice(input_addr, output_addr, &slice_param_);

D | bias_add_grad_cpu_kernel.cc

  const auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr);  // in Launch()
  …
  output_addr[c] += input_addr[offset + hw];
  …
  auto task = [this, input_addr, output_addr](size_t start, size_t end) {
    …
    ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr …
  };

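For 4-D NCHW input the bias gradient reduces every axis except channel, db[c] = Σₙₕᵥ dy[n,c,h,w], which is what the offset + hw loop accumulates (the 2-D case is delegated to ReduceSumDim2Axis0). A scalar reference of the 4-D case (illustrative only):

  #include <cstddef>
  #include <vector>

  // Reference bias-add gradient for NCHW: reduce all axes except C.
  std::vector<float> BiasAddGradRef(const std::vector<float> &dy,
                                    size_t n, size_t c, size_t hw) {
    std::vector<float> db(c, 0.0f);
    for (size_t i = 0; i < n; ++i)
      for (size_t j = 0; j < c; ++j)
        for (size_t k = 0; k < hw; ++k)
          db[j] += dy[(i * c + j) * hw + k];
    return db;
  }
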
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp16_grad/

D | activation_fp16_grad.cc

  auto input_addr = reinterpret_cast<float16_t *>(in_tensors_.at(1)->MutableData());  // in DoActivation()
  CHECK_NULL_RETURN(input_addr);
  …
  error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, …
  error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start);
  error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_…
  [all …]

/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32_grad/

D | activation_grad.cc

  const auto input_addr = reinterpret_cast<float *>(in_tensors_.at(1)->MutableData());  // in DoActivation()
  CHECK_NULL_RETURN(input_addr);
  …
  error_code = ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = Relu6Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = LReluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_…
  error_code = SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start);
  error_code = TanhGrad(input_addr + start, yt_addr + start, count, output_addr + start);
  error_code = HSwishGrad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = HSigmoidGrad(yt_addr + start, input_addr + start, count, output_addr + start);
  error_code = EluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_gr…
  [all …]

/third_party/mindspore/mindspore/ccsrc/runtime/device/gpu/distribution/

D | collective_wrapper.cc

  ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data…
    return NCCLWrapper::instance().AllReduce(input_addr, output_addr, count, data_type, reduce_type, s…
  }

  ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data…
    return NCCLWrapper::instance().AllGather(input_addr, output_addr, count, data_type, stream, group);
  }

  ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t …
    return NCCLWrapper::instance().ReduceScatter(input_addr, output_addr, count, data_type, reduce_typ…
  }

  ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data…
    return NCCLWrapper::instance().Broadcast(input_addr, output_addr, count, data_type, root, stream, …
  }

D | nccl_wrapper.cc

  ncclResult_t NCCLWrapper::AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDa…
    …
    return ncclAllReduce(input_addr, output_addr, count, data_type, reduce_type, group_comm, stream);
  }

  ncclResult_t NCCLWrapper::AllGather(const void *input_addr, void *output_addr, size_t count, ncclDa…
    …
    return ncclAllGather(input_addr, output_addr, count, data_type, group_comm, stream);
  }

  ncclResult_t NCCLWrapper::ReduceScatter(const void *input_addr, void *output_addr, size_t count, …
    …
    return ncclReduceScatter(input_addr, output_addr, count, data_type, reduce_type, group_comm, stream);
  }

  ncclResult_t NCCLWrapper::Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDa…
    …
    return ncclBroadcast(input_addr, output_addr, count, data_type, root, group_comm, stream);
  }

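NCCLWrapper resolves the communicator for the named group (the elided lines) and then forwards straight to NCCL. For reference, the underlying call shape from the standard NCCL API, with communicator and stream setup elided (in-place operation with sendbuff == recvbuff is allowed by NCCL):

  #include <nccl.h>

  // Assuming `comm` and `stream` were created elsewhere (e.g. via
  // ncclCommInitRank / cudaStreamCreate), this sums `count` floats across
  // all ranks, in place on the device buffer.
  ncclResult_t SumAcrossRanks(float *d_buf, size_t count, ncclComm_t comm, cudaStream_t stream) {
    return ncclAllReduce(d_buf, d_buf, count, ncclFloat, ncclSum, comm, stream);
  }
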
D | nccl_wrapper.h

  ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t dat…
  ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t dat…
  ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t…
  ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t dat…

D | collective_wrapper.h

  extern "C" EXPORT_WRAPPER ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t …
  extern "C" EXPORT_WRAPPER ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t …
  extern "C" EXPORT_WRAPPER ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, siz…
  extern "C" EXPORT_WRAPPER ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t …

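The extern "C" plus EXPORT_WRAPPER combination exists so these symbols keep unmangled names and can be resolved at run time; the function pointers used in nccl_collective_gpu_kernel.h below suggest the GPU kernels obtain them via dlsym. A sketch of such a lookup (illustrative: the library name and the truncated parameter tail, assumed here to be reduce op, stream, and group name, are not confirmed by the matches above):

  #include <dlfcn.h>
  #include <string>
  #include <nccl.h>

  // Assumed full signature of the exported AllReduce wrapper.
  using AllReduceFn = ncclResult_t (*)(const void *, void *, size_t, ncclDataType_t,
                                       ncclRedOp_t, cudaStream_t, const std::string &);

  AllReduceFn LoadAllReduce() {
    void *handle = dlopen("libgpu_collective.so", RTLD_LAZY);  // assumed library name
    if (handle == nullptr) return nullptr;
    return reinterpret_cast<AllReduceFn>(dlsym(handle, "AllReduce"));
  }
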
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/

D | nccl_collective_gpu_kernel.h

  T *input_addr = GetDeviceAddress<T>(inputs, 0);  // in LaunchAllReduce()
  …
  (*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_, …

  T *input_addr = GetDeviceAddress<T>(inputs, 0);  // in LaunchAllGather()
  …
  (*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, g…

  T *input_addr = GetDeviceAddress<T>(inputs, 0);  // in LaunchReduceScatter()
  …
  (*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), …

  T *input_addr = GetDeviceAddress<T>(inputs, 0);  // in LaunchBroadcast()
  …
  input_addr = GetDeviceAddress<T>(inputs, i);
  …
  (*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T), …

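Note the element counts: LaunchAllReduce and LaunchReduceScatter pass output_size_ / sizeof(T), while LaunchAllGather passes input_size_ / sizeof(T). This matches NCCL's conventions, where ReduceScatter takes the per-rank receive count and AllGather the per-rank send count. Worked example with 4 ranks and a 4096-float tensor: ReduceScatter passes 1024 (each rank keeps one quarter of the reduced result), and AllGather passes 1024 to assemble the full 4096 on every rank.
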
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/

D | cast_gpu_kernel.h

  S *input_addr = GetPossiblyNullDeviceAddress<S>(inputs, 0);  // in Launch()
  …
  if (input_addr == nullptr && output_addr == nullptr) {
    …
  } else if (input_addr != nullptr && output_addr != nullptr) {
    Cast(input_size_, input_addr, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
  }