/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTMAX_IMPL_CUH_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTMAX_IMPL_CUH_
#include <cuda_fp16.h>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"

// Maps an input scalar type to the type used for intermediate accumulation;
// every supported type (half, float, double) accumulates in float.
template <typename T, bool is_cuda>
struct AccumulateType {};
template <>
struct AccumulateType<half, true> {
  using type = float;
};
template <>
struct AccumulateType<float, true> {
  using type = float;
};
template <>
struct AccumulateType<double, true> {
  using type = float;
};

template <typename T, bool is_cuda>
using acc_type = typename AccumulateType<T, is_cuda>::type;

// Per-element finishing step of the max-shifted softmax reduction. Given the
// row maximum and the row sum of exp(x - max), it writes exp(x - max) / sum
// for softmax, or x - max - log(sum) for log-softmax (the constructor
// pre-computes log(sum) once per row).
template <typename T, typename AccumT, typename OutT, bool is_log_softmax>
struct SoftMaxForwardEpilogue {
  __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
      : max_input(max_input), sum(is_log_softmax ? std::log(sum) : sum) {}
  __device__ __forceinline__ OutT operator()(T input) const {
    return is_log_softmax ? static_cast<OutT>((AccumT)input - max_input - sum)
                          : static_cast<OutT>(std::exp((AccumT)input - max_input) / sum);
  }
  const AccumT max_input;
  const AccumT sum;
};

// aligned vector generates vectorized load/store on CUDA
// (the alignas expression reduces to sizeof(float4), i.e. 16 bytes,
// one full vector width regardless of scalar_t)
template <typename scalar_t>
struct alignas(sizeof(scalar_t) * sizeof(float4) / sizeof(scalar_t)) aligned_vector {
  scalar_t val[sizeof(float4) / sizeof(scalar_t)];
};

// Applies (log-)softmax along the axis of length dim_size_ over a tensor
// viewed as [outer_size_, dim_size_, inner_size_], on the given stream.
template <typename T, bool is_log_softmax>
cudaError_t Softmax(T *input_, T *output_, size_t dim_size_, size_t outer_size_, size_t inner_size_, size_t device_id,
                    cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_SOFTMAX_IMPL_CUH_
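
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only; the buffer and variable names below are
// hypothetical, not part of this header). A caller flattens the tensor into
// [outer_size, dim_size, inner_size] around the softmax axis and launches the
// templated entry point on a stream, e.g.:
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   // d_in / d_out: device buffers holding outer * dim * inner floats (assumed
//   // to be allocated and filled by the caller).
//   cudaError_t err = Softmax<float, /*is_log_softmax=*/false>(
//       d_in, d_out, dim_size, outer_size, inner_size, device_id, stream);
//
// For a [batch, classes] logits matrix softmaxed over the last axis,
// outer_size = batch, dim_size = classes, and inner_size = 1.
// ---------------------------------------------------------------------------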