Home
last modified time | relevance | path

Searched refs:output_addr (Results 1 – 25 of 145) sorted by relevance

123456

/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/
Dcast_impl.cu25 __device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { in CastBase() argument
26 *output_addr = static_cast<T>((*input_addr)); in CastBase()
30 __device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) { in CastBase() argument
31 *output_addr = __half2ull_rz((*input_addr)); in CastBase()
34 __device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) { in CastBase() argument
35 *output_addr = __half2ll_rz((*input_addr)); in CastBase()
38 __device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) { in CastBase() argument
39 *output_addr = __half2uint_rz((*input_addr)); in CastBase()
42 __device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) { in CastBase() argument
43 *output_addr = __half2int_rz((*input_addr)); in CastBase()
[all …]
Dunsorted_segment_sum.cu22 T* input_addr, S* ids_addr, T* output_addr) { in UnsortedSegmentSum() argument
33 MsAtomicAdd(output_addr + output_index, input_addr[input_index]); in UnsortedSegmentSum()
39 T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) { in UnsortedSegmentSum() argument
42 output_dim0, output_dim1, input_addr, ids_addr, output_addr); in UnsortedSegmentSum()
47 … double* input_addr, int* ids_addr, double* output_addr, cudaStream_t stream);
49 … double* input_addr, int64_t* ids_addr, double* output_addr, cudaStream_t stream);
52 … float* input_addr, int* ids_addr, float* output_addr, cudaStream_t stream);
54 … float* input_addr, int64_t* ids_addr, float* output_addr, cudaStream_t stream);
57 … half* input_addr, int* ids_addr, half* output_addr, cudaStream_t stream);
59 … half* input_addr, int64_t* ids_addr, half* output_addr, cudaStream_t stream);
[all …]
Dmatrix_split_impl.cu21 T *output_addr) { in MatrixSplitKernel() argument
27 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel()
33 T *input_addr, T *output_addr) { in MatrixSplitKernel() argument
41 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel()
44 output_addr[pointIdx] = input_addr[src_coordinate]; in MatrixSplitKernel()
46 output_addr[pointIdx] = 1; in MatrixSplitKernel()
48 output_addr[pointIdx] = 0; in MatrixSplitKernel()
55 …ixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, T *output_addr, in MatrixSplit() argument
60 …<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, split_dim, dim, input_addr, output_addr); in MatrixSplit()
63 output_addr); in MatrixSplit()
[all …]
Dconvert_gradient_impl.cu21 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientKernel() argument
29 output_addr[pointIdx] = input_addr[src_coordinate]; in ConvertGradientKernel()
35 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() argument
43 output_addr[src_coordinate] = input_addr[pointIdx]; in ConvertGradientBackKernel()
50 const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() argument
61 output_addr[src_coordinate] = input_addr[pointIdx]; in ConvertGradientBackKernel()
68 const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradient() argument
70 input_addr, output_addr); in ConvertGradient()
75 … const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradientBack() argument
77 … width, input_addr, output_addr); in ConvertGradientBack()
[all …]
Dsoftplus_impl.cu21 __global__ void SoftplusKernel(const size_t size, const T *input_addr, T *output_addr) { in SoftplusKernel() argument
24 output_addr[pos] = logf(1. + exp(x)); in SoftplusKernel()
29 __global__ void SoftplusKernel(const size_t size, const half *input_addr, half *output_addr) { in SoftplusKernel() argument
32 output_addr[pos] = __float2half(logf(1. + exp(x))); in SoftplusKernel()
37 void Softplus(const size_t size, const T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Softplus() argument
38 SoftplusKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Softplus()
42 void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_strea… in Softplus() argument
43 …plusKernel<half><<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Softplus()
74 template void Softplus(const size_t size, const float *input_addr, float *output_addr, cudaStream_t…
75 template void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t c…
Drelu_impl.cu22 __global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) { in CalReLUKernel() argument
24 output_addr[pos] = input_addr[pos] > static_cast<T>(0) ? input_addr[pos] : static_cast<T>(0); in CalReLUKernel()
29 void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in CalReLU() argument
30 CalReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in CalReLU()
33 template void CalReLU(int size, double *input_addr, double *output_addr, cudaStream_t cuda_stream);
34 template void CalReLU(int size, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
35 template void CalReLU(int size, half *input_addr, half *output_addr, cudaStream_t cuda_stream);
36 template void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStream_t cuda_stream);
37 template void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream…
38 template void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream…
[all …]
Deye_impl.cu20 __global__ void EyeKernel(const size_t size, const size_t dim, T *output_addr) { in EyeKernel() argument
26 output_addr[pointIdx] = 1; in EyeKernel()
28 output_addr[pointIdx] = 0; in EyeKernel()
34 void Eye(const size_t size, const size_t dim, T *output_addr, cudaStream_t cuda_stream) { in Eye() argument
35 EyeKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dim, output_addr); in Eye()
39 template void Eye<float>(const size_t size, const size_t dim, float *output_addr, cudaStream_t cuda…
Dgelu_impl.cu21 __global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) { in GeluKernel() argument
29 output_addr[pos] = 0.5 * x * (1.0 + tanh_res); in GeluKernel()
34 __global__ void GeluKernel(size_t size, half *input_addr, half *output_addr) { in GeluKernel() argument
38 output_addr[pos] = half(0.5) * x * (half(1.0) + __float2half(tanh_res)); in GeluKernel()
43 __global__ void GeluKernel(size_t size, half2 *input_addr, half2 *output_addr) { in GeluKernel() argument
50 output_addr[pos] = half2(0.5, 0.5) * x * (half2(1.0, 1.0) + __float22half2_rn(tanh_res)); in GeluKernel()
55 void Gelu(size_t size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Gelu() argument
56 GeluKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, output_addr); in Gelu()
60 void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream) { in Gelu() argument
63 size / 2, reinterpret_cast<half2 *>(input_addr), reinterpret_cast<half2 *>(output_addr)); in Gelu()
[all …]
Dmatrix_combine_impl.cu21 const size_t dst_width, T *input_addr, T *output_addr) { in MatrixCombineKernel() argument
28 output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx]; in MatrixCombineKernel()
35 T *output_addr) { in MatrixCombineKernel() argument
43 output_addr[dst_h * dst_width + dst_w] = input_addr[pointIdx]; in MatrixCombineKernel()
50 output_addr[dst_h * dst_width + dst_w] = input_addr[src_coordinate]; in MatrixCombineKernel()
57 … const size_t residual, const size_t res_width, const size_t batch, T *input_addr, T *output_addr, in MatrixCombine() argument
61 input_addr, output_addr); in MatrixCombine()
64 … res_width, batch, input_addr, output_addr); in MatrixCombine()
71 … const size_t batch, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
Drandom_categorical.cu21 const size_t batch_size, const size_t num_classes, S *output_addr) { in RandomCategorical() argument
32 output_addr[pos] = static_cast<S>(idx); in RandomCategorical()
61 const size_t batch_size, const size_t num_classes, S *output_addr, cudaStream_t cuda_stream) { in RandomCategoricalKernel() argument
65 … num_classes, output_addr); in RandomCategoricalKernel()
84 int16_t *output_addr, cudaStream_t cuda_stream);
87 int *output_addr, cudaStream_t cuda_stream);
90 int64_t *output_addr, cudaStream_t cuda_stream);
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/math/
Dunary_op_gpu_kernel.h91 T *output_addr = GetDeviceAddress<T>(outputs, 0); in Launch() local
95 …Exponential(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(s… in Launch()
99 …Expm1(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_… in Launch()
103 …Logarithm(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(str… in Launch()
107 …Log1p(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_… in Launch()
111 …Erf(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_pt… in Launch()
115 …Erfc(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_p… in Launch()
119 …Negative(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stre… in Launch()
123 …Reciprocal(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(st… in Launch()
127 …Square(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream… in Launch()
[all …]
Daddn_gpu_kernel.h44 T *output_addr = GetDeviceAddress<T>(outputs, 0); in Launch() local
45 auto work_addr = output_addr; in Launch()
47 if (output_addr == GetDeviceAddress<T>(inputs, i)) { in Launch()
52 …FillDeviceArray(outputs[0]->size / sizeof(T), output_addr, 0.0f, reinterpret_cast<cudaStream_t>(st… in Launch()
59 if (work_addr != output_addr) { in Launch()
61 … cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice, in Launch()
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/
Drank_cpu_kernel.cc85 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument
87 output_addr[axisIterator.GetPos(sort_idx[j])] = i + 1; in SetFunc()
93 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument
95 output_addr[axisIterator.GetPos(sort_idx[j])] = i - duplicate_count + 2; in SetFunc()
106 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument
109 output_addr[axisIterator.GetPos(sort_idx[j])] = avg; in SetFunc()
115 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument
117 output_addr[axisIterator.GetPos(sort_idx[j])] = j + 1; in SetFunc()
123 const size_t *const sort_idx, float *const output_addr) { in SetFunc() argument
125 output_addr[axisIterator.GetPos(sort_idx[j])] = culmutive_rank; in SetFunc()
[all …]
Dreduce_cpu_kernel.cc103 auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); in Launch() local
107 *output_addr = input_addr[0]; in Launch()
109 reduce_func_(input_addr, i, output_addr); in Launch()
112 *output_addr /= input_size; in Launch()
115 AccelerateLongVector(input_addr, output_addr, input_size); in Launch()
143 (void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i); in Launch()
145 output_addr[i] /= stride; in Launch()
159 auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) { in Launch()
163 output_addr[i] = input_addr[iter.GetPos()]; in Launch()
166 reduce_func_(input_addr, iter.GetPos(), &output_addr[i]); in Launch()
[all …]
Dtranspose_cpu_kernel.cc89 auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); in LaunchKernel() local
97 ParallelRun(input_addr, output_addr, output_shape, data_count); in LaunchKernel()
102 res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
104 res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
106 res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
108 res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
110 res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
112 res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
114 res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
116 res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); in LaunchKernel()
[all …]
Dunsorted_segment_sum_cpu_kernel.cc58 void *output_addr = outputs[0]->addr; in Launch() local
59 auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); in Launch()
68 … static_cast<int *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch()
72 … static_cast<float *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch()
76 … static_cast<int *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch()
80 … static_cast<float *>(output_addr), SizeToInt(output_dim0_), SizeToInt(output_dim1_)); in Launch()
Dbias_add_grad_cpu_kernel.cc43 auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); in Launch() local
53 output_addr[c] = 0; in Launch()
57 output_addr[c] += input_addr[offset + hw]; in Launch()
62 auto task = [this, input_addr, output_addr](size_t start, size_t end) { in Launch()
64 …mDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start); in Launch()
Dtensoradd_cpu_kernel.cc46 T *output_addr = reinterpret_cast<T *>(outputs[0]->addr); in Launch() local
49 auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) { in Launch()
51 output_addr[i] = input_addr_a[i] + input_addr_b[i]; in Launch()
57 auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) { in Launch()
61 output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()]; in Launch()
Dembedding_look_up_cpu_kernel.cc33 void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t ind… in LookUpTableTask() argument
41 auto ret = memcpy_s(output_addr, (indices_lens - i) * lens, input_addr + pos, lens); in LookUpTableTask()
46 auto ret = memset_s(output_addr, (indices_lens - i) * lens, 0, lens); in LookUpTableTask()
51 output_addr += outer_dim_size; in LookUpTableTask()
107 auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); in LaunchKernel() local
121 auto task = [input_addr, indices_addr, output_addr, task_offset, task_proc_lens, this]() { in LaunchKernel()
122 …LookUpTableTask<T>(input_addr, indices_addr + task_offset, output_addr + task_offset * outer_dim_s… in LaunchKernel()
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32/
Dactivation_fp32.cc55 auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()); in DoActivation() local
57 MS_ASSERT(output_addr != nullptr); in DoActivation()
72 ret = Fp32Relu(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
74 ret = Fp32Relu6(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
76 ret = LRelu(input_addr + stride * task_id, count, output_addr + stride * task_id, alpha_); in DoActivation()
78 ret = Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
80 ret = Tanh(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
82 ret = Swish(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
84 ret = HSwish(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
86 ret = HSigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id); in DoActivation()
[all …]
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp16_grad/
Dactivation_fp16_grad.cc53 auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData()); in DoActivation() local
54 CHECK_NULL_RETURN(output_addr); in DoActivation()
64 error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
66 error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
68 error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, in DoActivation()
72 error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation()
74 error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
76 error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
78 error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
81 …EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_… in DoActivation()
[all …]
/third_party/mindspore/mindspore/lite/src/runtime/kernel/arm/fp32_grad/
Dactivation_grad.cc56 auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()); in DoActivation() local
57 CHECK_NULL_RETURN(output_addr); in DoActivation()
67 error_code = ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
69 error_code = Relu6Grad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
71 …error_code = LReluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_… in DoActivation()
74 error_code = SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation()
76 error_code = TanhGrad(input_addr + start, yt_addr + start, count, output_addr + start); in DoActivation()
78 error_code = HSwishGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
80 error_code = HSigmoidGrad(yt_addr + start, input_addr + start, count, output_addr + start); in DoActivation()
82 …error_code = EluGrad(yt_addr + start, input_addr + start, count, output_addr + start, param_act_gr… in DoActivation()
[all …]
/third_party/mindspore/mindspore/ccsrc/runtime/device/gpu/distribution/
Dcollective_wrapper.cc37 ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllReduce() argument
39 …return NCCLWrapper::instance().AllReduce(input_addr, output_addr, count, data_type, reduce_type, s… in AllReduce()
42 ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllGather() argument
44 …return NCCLWrapper::instance().AllGather(input_addr, output_addr, count, data_type, stream, group); in AllGather()
47 ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t … in ReduceScatter() argument
49 …return NCCLWrapper::instance().ReduceScatter(input_addr, output_addr, count, data_type, reduce_typ… in ReduceScatter()
52 ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in Broadcast() argument
54 …return NCCLWrapper::instance().Broadcast(input_addr, output_addr, count, data_type, root, stream, … in Broadcast()
Dnccl_wrapper.cc49 ncclResult_t NCCLWrapper::AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllReduce() argument
54 return ncclAllReduce(input_addr, output_addr, count, data_type, reduce_type, group_comm, stream); in AllReduce()
57 ncclResult_t NCCLWrapper::AllGather(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllGather() argument
62 return ncclAllGather(input_addr, output_addr, count, data_type, group_comm, stream); in AllGather()
65 ncclResult_t NCCLWrapper::ReduceScatter(const void *input_addr, void *output_addr, size_t count, in ReduceScatter() argument
71 …return ncclReduceScatter(input_addr, output_addr, count, data_type, reduce_type, group_comm, strea… in ReduceScatter()
74 ncclResult_t NCCLWrapper::Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDa… in Broadcast() argument
79 return ncclBroadcast(input_addr, output_addr, count, data_type, root, group_comm, stream); in Broadcast()
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/
Dnccl_collective_gpu_kernel.h159 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchAllReduce() local
164 … (*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_, in LaunchAllReduce()
172 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchAllGather() local
178 …(*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, g… in LaunchAllGather()
185 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchReduceScatter() local
191 … (*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), in LaunchReduceScatter()
199 T *output_addr = GetDeviceAddress<T>(outputs, 0); in LaunchBroadcast() local
205 output_addr = GetDeviceAddress<T>(outputs, i); in LaunchBroadcast()
207 … (*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T), in LaunchBroadcast()

123456