/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ |
D | cast_impl.cu | 25 __device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { in CastBase() argument 26 *output_addr = static_cast<T>((*input_addr)); in CastBase() 30 __device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) { in CastBase() argument 31 *output_addr = __half2ull_rz((*input_addr)); in CastBase() 34 __device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) { in CastBase() argument 35 *output_addr = __half2ll_rz((*input_addr)); in CastBase() 38 __device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) { in CastBase() argument 39 *output_addr = __half2uint_rz((*input_addr)); in CastBase() 42 __device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) { in CastBase() argument 43 *output_addr = __half2int_rz((*input_addr)); in CastBase() [all …]
|
D | unsorted_segment_sum.cu | 22 T* input_addr, S* ids_addr, T* output_addr) { in UnsortedSegmentSum() argument 33 MsAtomicAdd(output_addr + output_index, input_addr[input_index]); in UnsortedSegmentSum() 39 T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) { in UnsortedSegmentSum() argument 42 output_dim0, output_dim1, input_addr, ids_addr, output_addr); in UnsortedSegmentSum() 47 … double* input_addr, int* ids_addr, double* output_addr, cudaStream_t stream); 49 … double* input_addr, int64_t* ids_addr, double* output_addr, cudaStream_t stream); 52 … float* input_addr, int* ids_addr, float* output_addr, cudaStream_t stream); 54 … float* input_addr, int64_t* ids_addr, float* output_addr, cudaStream_t stream); 57 … half* input_addr, int* ids_addr, half* output_addr, cudaStream_t stream); 59 … half* input_addr, int64_t* ids_addr, half* output_addr, cudaStream_t stream); [all …]
|
D | matrix_split_impl.cu | 21 T *output_addr) { in MatrixSplitKernel() argument 27 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel() 33 T *input_addr, T *output_addr) { in MatrixSplitKernel() argument 41 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel() 44 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel() 46 output_addr[pointIdx] = 1; in MatrixSplitKernel() 48 output_addr[pointIdx] = 0; in MatrixSplitKernel() 55 …ixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, T *output_addr, in MatrixSplit() argument 60 …<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, split_dim, dim, input_addr, output_addr); in MatrixSplit() 63 output_addr); in MatrixSplit() [all …]
|
D | convert_gradient_impl.cu | 21 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientKernel() argument 29 output_addr[pointIdx] = input_addr[src_coordinate]; in ConvertGradientKernel() 35 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() argument 43 output_addr[src_coordinate] = input_addr[pointIdx]; in ConvertGradientBackKernel() 50 const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() argument 61 output_addr[src_coordinate] = input_addr[pointIdx]; in ConvertGradientBackKernel() 68 const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradient() argument 70 input_addr, output_addr); in ConvertGradient() 75 … const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradientBack() argument 77 … width, input_addr, output_addr); in ConvertGradientBack() [all …]
|
D | softplus_impl.cu | 21 __global__ void SoftplusKernel(const size_t size, const T *input_addr, T *output_addr) { in SoftplusKernel() argument 24 output_addr[pos] = logf(1. + exp(x)); in SoftplusKernel() 29 __global__ void SoftplusKernel(const size_t size, const half *input_addr, half *output_addr) { in SoftplusKernel() argument 32 output_addr[pos] = __float2half(logf(1. + exp(x))); in SoftplusKernel() 37 void Softplus(const size_t size, const T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Softplus() argument 38 SoftplusKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Softplus() 42 void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_strea… in Softplus() argument 43 …plusKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Softplus() 74 template void Softplus(const size_t size, const float *input_addr, float *output_addr, cudaStream_t… 75 template void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t c…
|
D | relu_impl.cu | 22 __global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) { in CalReLUKernel() argument 24 output_addr[pos] = input_addr[pos] > static_cast<T>(0) ? input_addr[pos] : static_cast<T>(0); in CalReLUKernel() 29 void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in CalReLU() argument 30 CalReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in CalReLU() 33 template void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream); 34 template void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream); 35 template void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream); 36 template void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream); 37 template void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream… 38 template void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream… [all …]
|
D | eye_impl.cu | 20 __global__ void EyeKernel(const size_t size, const size_t dim, T *output_addr) { in EyeKernel() argument 26 output_addr[pointIdx] = 1; in EyeKernel() 28 output_addr[pointIdx] = 0; in EyeKernel() 34 void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_stream) { in Eye() argument 35 EyeKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dim, output_addr); in Eye() 39 template void Eye<float>(const size_t size, const size_t dim, float *output_addr, cudaStream_t cuda…
|
D | gelu_impl.cu | 21 __global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) { in GeluKernel() argument 29 output_addr[pos] = 0.5 * x * (1.0 + tanh_res); in GeluKernel() 34 __global__ void GeluKernel(size_t size, half *input_addr, half *output_addr) { in GeluKernel() argument 38 output_addr[pos] = half(0.5) * x * (half(1.0) + __float2half(tanh_res)); in GeluKernel() 43 __global__ void GeluKernel(size_t size, half2 *input_addr, half2 *output_addr) { in GeluKernel() argument 50 output_addr[pos] = half2(0.5, 0.5) * x * (half2(1.0, 1.0) + __float22half2_rn(tanh_res)); in GeluKernel() 55 void Gelu(size_t size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Gelu() argument 56 GeluKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Gelu() 60 void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream) { in Gelu() argument 63 size / 2, reinterpret_cast<half2 *>(input_addr), reinterpret_cast<half2 *>(output_addr)); in Gelu() [all …]
|
D | matrix_combine_impl.cu | 21 const size_t dst_width, T *input_addr, T *output_addr) { in MatrixCombineKernel() argument 28 output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx]; in MatrixCombineKernel() 35 T *output_addr) { in MatrixCombineKernel() argument 43 output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx]; in MatrixCombineKernel() 50 output_addr[dst_h * dst_width + dst_w] = input_addr[src_coordinate]; in MatrixCombineKernel() 57 … const size_t residual, const size_t res_width, const size_t batch, T *input_addr, T *output_addr, in MatrixCombine() argument 61 input_addr, output_addr); in MatrixCombine() 64 … res_width, batch, input_addr, output_addr); in MatrixCombine() 71 … const size_t batch, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
|
D | random_categorical.cu | 21 const size_t batch_size, const size_t num_classes, S *output_addr) { in RandomCategorical() argument 32 output_addr[pos] = static_cast<S>(idx); in RandomCategorical() 61 const size_t batch_size, const size_t num_classes, S *output_addr, cudaStream_t cuda_stream) { in RandomCategoricalKernel() argument 65 … num_classes, output_addr); in RandomCategoricalKernel() 84 int16_t *output_addr, cudaStream_t cuda_stream); 87 int *output_addr, cudaStream_t cuda_stream); 90 int64_t *output_addr, cudaStream_t cuda_stream);
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/math/ |
D | unary_op_gpu_kernel.h | 91 T *output_addr = GetDeviceAddress<T>(outputs, 0); in Launch() local 95 …Exponential(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(s… in Launch() 99 …Expm1(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_… in Launch() 103 …Logarithm(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(str… in Launch() 107 …Log1p(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_… in Launch() 111 …Erf(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_pt… in Launch() 115 …Erfc(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_p… in Launch() 119 …Negative(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stre… in Launch() 123 …Reciprocal(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(st… in Launch() 127 …Square(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream… in Launch() [all …]
|
D | addn_gpu_kernel.h | 44 T *output_addr = GetDeviceAddress<T>(outputs, 0); in Launch() local 45 auto work_addr = output_addr; in Launch() 47 if (output_addr == GetDeviceAddress<T>(inputs, i)) { in Launch() 52 …FillDeviceArray(outputs[0]->size / sizeof(T), output_addr, 0.0f, reinterpret_cast<cudaStream_t>(st… in Launch() 59 if (work_addr != output_addr) { in Launch() 61 … cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice, in Launch()
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/ |
D | rank_cpu_kernel.cc | 85 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument 87 output_addr[axisIterator.GetPos(sort_idx[j])] = i + 1; in SetFunc() 93 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument 95 output_addr[axisIterator.GetPos(sort_idx[j])] = i - duplicate_count + 2; in SetFunc() 106 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument 109 output_addr[axisIterator.GetPos(sort_idx[j])] = avg; in SetFunc() 115 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument 117 output_addr[axisIterator.GetPos(sort_idx[j])] = j + 1; in SetFunc() 123 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument 125 output_addr[axisIterator.GetPos(sort_idx[j])] = culmutive_rank; in SetFunc() [all …]
|
D | reduce_cpu_kernel.cc | 103 auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); in Launch() local 107 *output_addr = input_addr[0]; in Launch() 109 reduce_func_(input_addr, i, output_addr); in Launch() 112 *output_addr /= input_size; in Launch() 115 AccelerateLongVector(input_addr, output_addr, input_size); in Launch() 143 (void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i); in Launch() 145 output_addr[i] /= stride; in Launch() 159 auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) { in Launch() 163 output_addr[i] = input_addr[iter.GetPos()]; in Launch() 166 reduce_func_(input_addr, iter.GetPos(), &output_addr[i]); in Launch() [all …]
|
D | transpose_cpu_kernel.cc | 89 auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); in LaunchKernel() local 97 ParallelRun(input_addr, output_addr, output_shape, data_count); in LaunchKernel() 102 res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() 104 res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() 106 res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() 108 res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() 110 res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() 112 res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() 114 res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() 116 res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel() [all …]
|
D | unsorted_segment_sum_cpu_kernel.cc | 58 void *output_addr = outputs[0]->addr; in Launch() local 59 auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); in Launch() 68 … static_cast<int *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch() 72 … static_cast<float *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch() 76 … static_cast<int *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch() 80 … static_cast<float *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch()
|
D | bias_add_grad_cpu_kernel.cc | 43 auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); in Launch() local 53 output_addr[c] = 0; in Launch() 57 output_addr[c] += input_addr[offset + hw]; in Launch() 62 auto task = [this, input_addr, output_addr](size_t start, size_t end) { in Launch() 64 …mDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start); in Launch()
|
D | tensoradd_cpu_kernel.cc | 46 T *output_addr = reinterpret_cast<T *>(outputs[0]->addr); in Launch() local 49 auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) { in Launch() 51 output_addr[i] = input_addr_a[i] + input_addr_b[i]; in Launch() 57 auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) { in Launch() 61 output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()]; in Launch()
|
D | embedding_look_up_cpu_kernel.cc | 33 void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t ind… in LookUpTableTask() argument 41 auto ret = memcpy_s(output_addr, (indices_lens - i) * lens, input_addr + pos, lens); in LookUpTableTask() 46 auto ret = memset_s(output_addr, (indices_lens - i) * lens, 0, lens); in LookUpTableTask() 51 output_addr += outer_dim_size; in LookUpTableTask() 107 auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); in LaunchKernel() local 121 auto task = [input_addr, indices_addr, output_addr, task_offset, task_proc_lens, this]() { in LaunchKernel() 122 …LookUpTableTask<T>(input_addr, indices_addr + task_offset, output_addr + task_offset * outer_dim_s… in LaunchKernel()
|
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32/ |
D | activation_fp32.cc | 55 auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()); in DoActivation() local 57 MS_ASSERT(output_addr != nullptr); in DoActivation() 72 ret = Fp32Relu(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation() 74 ret = Fp32Relu6(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation() 76 ret = LRelu(input_addr + stride * task_id, count, output_addr + stride * task_id, alpha_); in DoActivation() 78 ret = Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation() 80 ret = Tanh(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation() 82 ret = Swish(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation() 84 ret = HSwish(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation() 86 ret = HSigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation() [all …]
|
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp16_grad/ |
D | activation_fp16_grad.cc | 53 auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData()); in DoActivation() local 54 CHECK_NULL_RETURN(output_addr); in DoActivation() 64 error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 66 error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 68 error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, in DoActivation() 72 error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation() 74 error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 76 error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 78 error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 81 …EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_… in DoActivation() [all …]
|
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32_grad/ |
D | activation_grad.cc | 56 auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()); in DoActivation() local 57 CHECK_NULL_RETURN(output_addr); in DoActivation() 67 error_code = ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 69 error_code = Relu6Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 71 …error_code = LReluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_… in DoActivation() 74 error_code = SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation() 76 error_code = TanhGrad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation() 78 error_code = HSwishGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 80 error_code = HSigmoidGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation() 82 …error_code = EluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_gr… in DoActivation() [all …]
|
/third_party/mindspore/mindspore/ccsrc/runtime/device/gpu/distribution/ |
D | collective_wrapper.cc | 37 ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllReduce() argument 39 …return NCCLWrapper::instance().AllReduce(input_addr, output_addr, count, data_type, reduce_type, s… in AllReduce() 42 ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllGather() argument 44 …return NCCLWrapper::instance().AllGather(input_addr, output_addr, count, data_type, stream, group); in AllGather() 47 ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t … in ReduceScatter() argument 49 …return NCCLWrapper::instance().ReduceScatter(input_addr, output_addr, count, data_type, reduce_typ… in ReduceScatter() 52 ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in Broadcast() argument 54 …return NCCLWrapper::instance().Broadcast(input_addr, output_addr, count, data_type, root, stream, … in Broadcast()
|
D | nccl_wrapper.cc | 49 ncclResult_t NCCLWrapper::AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllReduce() argument 54 return ncclAllReduce(input_addr, output_addr, count, data_type, reduce_type, group_comm, stream); in AllReduce() 57 ncclResult_t NCCLWrapper::AllGather(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllGather() argument 62 return ncclAllGather(input_addr, output_addr, count, data_type, group_comm, stream); in AllGather() 65 ncclResult_t NCCLWrapper::ReduceScatter(const void *input_addr, void *output_addr, size_t count, in ReduceScatter() argument 71 …return ncclReduceScatter(input_addr, output_addr, count, data_type, reduce_type, group_comm, strea… in ReduceScatter() 74 ncclResult_t NCCLWrapper::Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDa… in Broadcast() argument 79 return ncclBroadcast(input_addr, output_addr, count, data_type, root, group_comm, stream); in Broadcast()
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/ |
D | nccl_collective_gpu_kernel.h | 159 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchAllReduce() local 164 … (*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_, in LaunchAllReduce() 172 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchAllGather() local 178 …(*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, g… in LaunchAllGather() 185 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchReduceScatter() local 191 … (*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), in LaunchReduceScatter() 199 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchBroadcast() local 205 output_addr = GetDeviceAddress<T>(outputs, i); in LaunchBroadcast() 207 … (*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T), in LaunchBroadcast()
|