| /third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ |
| D | cast_impl.cu | 25 __device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { in CastBase() 30 __device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) { in CastBase() 34 __device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) { in CastBase() 38 __device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) { in CastBase() 42 __device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) { in CastBase() 46 __device__ __forceinline__ void CastBase(const half *input_addr, uint16_t *output_addr) { in CastBase() 50 __device__ __forceinline__ void CastBase(const half *input_addr, int16_t *output_addr) { in CastBase() 54 __device__ __forceinline__ void CastBase(const half *input_addr, uint8_t *output_addr) { in CastBase() 58 __device__ __forceinline__ void CastBase(const half *input_addr, int8_t *output_addr) { in CastBase() 63 __device__ __forceinline__ void CastBase(const uint64_t *input_addr, half *output_addr) { in CastBase() [all …]
|
| D | convert_gradient_impl.cu | 21 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientKernel() 35 … const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() 50 const size_t width, T *input_addr, T *output_addr) { in ConvertGradientBackKernel() 68 const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradient() 75 … const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in ConvertGradientBack() 82 … const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr, T *output_addr, in ConvertGradientBack()
|
| D | softplus_impl.cu | 21 __global__ void SoftplusKernel(const size_t size, const T *input_addr, T *output_addr) { in SoftplusKernel() 29 __global__ void SoftplusKernel(const size_t size, const half *input_addr, half *output_addr) { in SoftplusKernel() 37 void Softplus(const size_t size, const T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Softplus() 42 void Softplus(const size_t size, const half *input_addr, half *output_addr, cudaStream_t cuda_strea… in Softplus()
|
| D | matrix_split_impl.cu | 20 … void MatrixSplitKernel(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, in MatrixSplitKernel() 33 T *input_addr, T *output_addr) { in MatrixSplitKernel() 55 void MatrixSplit(const size_t size, const size_t split_dim, const size_t dim, T *input_addr, T *out… in MatrixSplit()
|
| D | gelu_impl.cu | 21 __global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) { in GeluKernel() 34 __global__ void GeluKernel(size_t size, half *input_addr, half *output_addr) { in GeluKernel() 43 __global__ void GeluKernel(size_t size, half2 *input_addr, half2 *output_addr) { in GeluKernel() 55 void Gelu(size_t size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in Gelu() 60 void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream) { in Gelu()
|
| D | matrix_combine_impl.cu | 21 const size_t dst_width, T *input_addr, T *output_addr) { in MatrixCombineKernel() 34 … const size_t dst_width, const size_t res_width, const size_t batch, T *input_addr, in MatrixCombineKernel() 57 … const size_t residual, const size_t res_width, const size_t batch, T *input_addr, T *output_addr, in MatrixCombine()
|
| D | unsorted_segment_sum.cu | 22 T* input_addr, S* ids_addr, T* output_addr) { in UnsortedSegmentSum() 39 T* input_addr, S* ids_addr, T* output_addr, cudaStream_t stream) { in UnsortedSegmentSum()
|
| D | relu_impl.cu | 22 __global__ void CalReLUKernel(int size, T *input_addr, T *output_addr) { in CalReLUKernel() 29 void CalReLU(int size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { in CalReLU()
|
| /third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/ |
| D | rolling_cpu_kernel.cc | 103 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() 114 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() 125 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() 134 reduceMethod_ = [](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() 143 reduceMethod_ = [this](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() 148 reduceMethod_ = [this](const T *input_addr, const size_t *ids, size_t start, size_t end) { in MethodSwitch() 160 auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); in Launch() local
|
| D | stridedslice_cpu_kernel.cc | 165 int StridedSliceCPUKernel::RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int star… in RunTaskOnOuter() 180 int StridedSliceCPUKernel::RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int … in RunTaskOnSplitAxis() 194 void StridedSliceCPUKernel::ParallelRun(const uint8_t *input_addr, uint8_t *output_addr, int thread… in ParallelRun() 223 auto input_addr = reinterpret_cast<uint8_t *>(inputs[0]->addr); in Launch() local
|
| D | l2_normalize_cpu_kernel.cc | 51 void L2NormalizeCPUKernel<T>::CalcDenominator(const T *input_addr, const size_t reduce_size, const … in CalcDenominator() 97 void L2NormalizeCPUKernel<T>::CalcOutput(const T *input_addr, const std::vector<size_t> reduce_shap… in CalcOutput() 132 auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); in Launch() local
|
| D | transpose_cpu_kernel.cc | 88 const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); in LaunchKernel() local 130 void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shap… in ParallelRun()
|
| D | embedding_look_up_cpu_kernel.cc | 33 void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t ind… in LookUpTableTask() 105 const auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); in LaunchKernel() local
|
| D | l2loss_cpu_kernel.cc | 42 auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); in Launch() local
|
| D | allgather_cpu_kernel.cc | 42 auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); in Launch() local
|
| D | rank_cpu_kernel.cc | 136 void RankCpuKernel<T>::Launch1DInt(const T *input_addr, size_t *sort_idx, T *values, const AxisIter… in Launch1DInt() 174 void RankCpuKernel<T>::Launch1DFloat(const T *input_addr, size_t *sort_idx, T *values, bool *is_nan, in Launch1DFloat() 252 auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); in Launch() local
|
| D | reduce_scatter_cpu_kernel.cc | 55 auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); in Launch() local
|
| D | unsorted_segment_sum_cpu_kernel.cc | 56 void *input_addr = inputs[0]->addr; in Launch() local
|
| D | broadcast_to_cpu_kernel.cc | 66 const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); in Launch() local
|
| D | rolling_cpu_kernel.h | 52 S Var(const T *input_addr, const size_t *ids, size_t start, size_t end) const { in Var()
|
| /third_party/mindspore/mindspore/ccsrc/runtime/device/gpu/distribution/ |
| D | collective_wrapper.cc | 37 ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllReduce() 42 ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in AllGather() 47 ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t … in ReduceScatter() 52 ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data… in Broadcast()
|
| D | nccl_wrapper.cc | 49 ncclResult_t NCCLWrapper::AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllReduce() 57 ncclResult_t NCCLWrapper::AllGather(const void *input_addr, void *output_addr, size_t count, ncclDa… in AllGather() 65 ncclResult_t NCCLWrapper::ReduceScatter(const void *input_addr, void *output_addr, size_t count, in ReduceScatter() 74 ncclResult_t NCCLWrapper::Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDa… in Broadcast()
|
| /third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/ |
| D | nccl_collective_gpu_kernel.h | 158 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchAllReduce() local 171 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchAllGather() local 184 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchReduceScatter() local 198 T *input_addr = GetDeviceAddress<T>(inputs, 0); in LaunchBroadcast() local
|
| /third_party/mindspore/mindspore/ccsrc/runtime/device/ascend/ |
| D | ascend_launch_atomic_clean.h | 35 void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; } in SetInputAddr()
|
| D | ascend_launch_transdata.h | 41 void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; } in SetInputAddr()
|