/**
 * Copyright 2019-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_GPU_KERNEL_H_

#include <cuda.h>
#include <cudnn.h>
#include <string>
#include <vector>
#include <initializer_list>
#include <utility>
#include <map>
#include <memory>
#include <numeric>
#include <functional>
#include <algorithm>
#include <tuple>
#include <set>
#include <optional>
#include "kernel/kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_mod.h"
#include "plugin/factory/ms_factory.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
#include "plugin/device/gpu/hal/device/gpu_common.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "kernel/kernel_build_info.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"

using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

// The max limit of tensor shape size: 2 giga-elements (2^31), which exceeds the maximum value of a signed 32-bit int.
#define SHAPE_SIZE_LIMIT 2147483648

namespace mindspore {
namespace kernel {
constexpr size_t kShapeIndex1st = 1;
constexpr size_t kShapeIndex2nd = 2;
constexpr size_t kShapeIndex3rd = 3;
constexpr size_t kShapeIndex4th = 4;
constexpr size_t kShapeIndex5nd = 5;
constexpr size_t kShapeIndex6rd = 6;
constexpr size_t kShapeIndex7th = 7;

constexpr size_t kDim2DShapeSize = 4;
constexpr size_t kDim3DShapeSize = 5;
constexpr size_t kPoolingNbDims = kDim3DShapeSize;

constexpr size_t kHelperDimsNum = 5;

static std::map<int, int> kNCHWToNHWCAxisMap = {
  {0, 0},
  {1, 3},
  {2, 1},
  {3, 2},
};
static std::map<int, int> kNHWCToNCHWAxisMap = {
  {0, 0},
  {1, 2},
  {2, 3},
  {3, 1},
};
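// Illustrative note: each map translates an axis index between layouts. For example, axis 1 (the C axis in NCHW)
// corresponds to axis 3 in NHWC, so kNCHWToNHWCAxisMap[1] == 3 and kNHWCToNCHWAxisMap[3] == 1.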

static auto Anyone = [](auto &&k, auto &&... args) { return ((args == k) || ...); };
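// Illustrative usage (hypothetical arguments): Anyone(fmt, kOpFormat_NCHW, kOpFormat_NHWC) returns true
// when fmt equals any of the trailing arguments, via a C++17 fold expression over operator==.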

inline int CeilDivide(int m, int n) { return (m + n - 1) / n; }

inline int GetPad(int input, int kernel, int stride) {
  return std::max<int>(0, (CeilDivide(input, stride) - 1) * stride + kernel - input);
}
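// Worked example: GetPad(5, 3, 2) computes the total SAME-style padding
// max(0, (CeilDivide(5, 2) - 1) * 2 + 3 - 5) = max(0, 2) = 2.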

// Choose the suitable datatype for cudnn
inline cudnnDataType_t GetCudnnDataType(const std::string &Type) {
  auto type = kCudnnDtypeMap.find(Type);
  if (type == kCudnnDtypeMap.end()) {
    MS_EXCEPTION(TypeError) << Type << " is not supported.";
  }
  return type->second;
}
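// Illustrative usage (a sketch; the exact key strings depend on kCudnnDtypeMap in kernel_constants.h):
//   cudnnDataType_t dt = GetCudnnDataType("kNumberTypeFloat32");  // expected to yield CUDNN_DATA_FLOAT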

// Choose the suitable datatype for cublas
inline cudaDataType_t GetCudaDataType(const std::string &Type) {
  auto type = kCudaDtypeMap.find(Type);
  if (type == kCudaDtypeMap.end()) {
    MS_EXCEPTION(TypeError) << Type << " is not supported.";
  }
  return type->second;
}

class NativeGpuKernelMod : public GpuKernelMod {
 public:
  using ReduceDetail = std::tuple<size_t, TypeId, TypeId>;
  using ReducePrecisonRes = std::tuple<bool, std::vector<ReduceDetail>, std::vector<ReduceDetail>>;

  virtual void DestroyResource() noexcept {}
  bool CheckSupport(const std::string &kernel_name, const KernelAttr &kernel_attr);
  std::vector<KernelAttr> GetAllSupportedList(const std::string &kernel_name);
  ReducePrecisonRes ReducePrecisionCheck(const std::string &kernel_name, const KernelAttr &kernel_attr);
  static std::vector<KernelAttr> GetGpuSupportedList(const std::string &kernel_name) {
    if (!Factory<NativeGpuKernelMod>::Instance().IsRegistered(kernel_name)) {
      return {};
    }
    return Factory<NativeGpuKernelMod>::Instance().Create(kernel_name)->GetAllSupportedList(kernel_name);
  }
  std::vector<KernelAttr> GetOpSupport() { return {}; }
  static bool GpuCheckSupport(const std::string &kernel_name, const KernelAttr &kernel_attr);

  static ReducePrecisonRes GpuReducePrecisionCheck(const std::string &kernel_name, const KernelAttr &kernel_attr) {
    return Factory<NativeGpuKernelMod>::Instance().Create(kernel_name)->ReducePrecisionCheck(kernel_name, kernel_attr);
  }
  enum KernelModType GetKernelModType() const override { return KernelModType::NativeGpuKernelMod; }

 protected:
  virtual void InitResource() {}
  static mindspore::HashMap<std::string, std::vector<KernelAttr>> support_map_;
};
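// Illustrative usage (a sketch, assuming a kernel named "ReLU" is registered with the factory):
//   auto attrs = NativeGpuKernelMod::GetGpuSupportedList("ReLU");  // enumerate supported KernelAttr combinations
//   bool ok = NativeGpuKernelMod::GpuCheckSupport("ReLU", attrs.front());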

std::vector<void *> ConvertPtrs(const std::vector<KernelTensor *> &input_ptrs);

// expand Nd Shape to 4d (N in [0,4])
bool ShapeNdTo4d(const ShapeVector &src, ShapeVector *dst);
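// Illustrative example (assumed padding behavior): a 2-d src {3, 4} would be expanded to a 4-d dst {1, 1, 3, 4};
// the function is expected to return false when src has more than 4 dimensions.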

template <typename T>
inline T *GetPossiblyNullDeviceAddress(const std::vector<KernelTensor *> &addr_list, size_t index) {
  if (index >= addr_list.size()) {
    MS_LOG(ERROR) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    return nullptr;
  }
  // Kernels may run normally without a workspace, so addr_list[index] may be nullptr.
  if ((addr_list[index] == nullptr) || (addr_list[index]->size() == 0)) {
    return nullptr;
  }
  if (addr_list[index]->device_ptr() == nullptr) {
    MS_LOG(ERROR) << "The device address is empty, address index:" << index;
    return nullptr;
  }
  return reinterpret_cast<T *>(addr_list[index]->device_ptr());
}
template <typename T>
inline T *GetPossiblyNullDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
  if (index >= addr_list.size()) {
    MS_LOG(ERROR) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    return nullptr;
  }
  // Kernels may run normally without a workspace, so addr_list[index] may be nullptr.
  if ((addr_list[index] == nullptr) || (addr_list[index]->size == 0)) {
    return nullptr;
  }
  if (addr_list[index]->addr == nullptr) {
    MS_LOG(ERROR) << "The device address is empty, address index:" << index;
    return nullptr;
  }
  return reinterpret_cast<T *>(addr_list[index]->addr);
}
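// Illustrative usage inside a kernel's Launch (hypothetical variable names):
//   T *workspace_addr = GetPossiblyNullDeviceAddress<T>(workspace, 0);
//   // a nullptr result is not necessarily an error; the kernel may not need a workspace at all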

int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis);

// transpose shape: NCHW to NHWC
void ShapeNCHW2NHWC(ShapeVector *shape);

// transpose shape: NCDHW to NDHWC
void ShapeNCDHW2NDHWC(ShapeVector *shape);
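// Illustrative example (assumed in-place permutation): an NCHW shape {8, 3, 224, 224} becomes the NHWC
// shape {8, 224, 224, 3} after ShapeNCHW2NHWC; ShapeNCDHW2NDHWC applies the analogous 5-d permutation.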

//////////////// old: format string /////////////
void SetDimA(const ShapeVector &shape, int *dimA, size_t len, const std::string &format);

void SetStrideA(const ShapeVector &shape, int *strideA, size_t len, const std::string &format);

void SetNCHW(const ShapeVector &shape, int *n, int *c, int *h, int *w, const std::string &format);

void SetNCDHW(const ShapeVector &shape, int *n, int *c, int *d, int *h, int *w, const std::string &format);
////////////////////////////////////////////////
//////////////// new: format enum///////////////
void SetDimA(const ShapeVector &shape, int *dimA, size_t len, const mindspore::Format &format);

void SetStrideA(const ShapeVector &shape, int *strideA, size_t len, const mindspore::Format &format);

void SetNCHW(const ShapeVector &shape, int *n, int *c, int *h, int *w, const mindspore::Format &format);

void SetNCDHW(const ShapeVector &shape, int *n, int *c, int *d, int *h, int *w, const mindspore::Format &format);
////////////////////////////////////////////////

bool CheckBroadcast4TensorOp(const std::vector<int> &A, const std::vector<int> &B, const std::vector<int> &Out);

// The tensor size is limited to 2G by cudnn.
bool CheckTensorSize(const std::initializer_list<ShapeVector> &shapes);

// set the tensor descriptor for cudnn/cublas
bool CudnnSetTensorNdDescriptor(const ShapeVector &shape, cudnnTensorDescriptor_t descriptor, cudnnDataType_t data_type,
                                const std::string &node_name);

// choose the suitable datatype for cudnn/cublas
bool GetCudnnDataType(const std::string &Type, cudnnDataType_t *out_type);

bool GetCudaDataType(const std::string &Type, cudaDataType_t *out_type);

bool ShapeEqual(const ShapeVector &s1, const ShapeVector &s2);

template <typename T>
T GetDimValue(const std::vector<KernelTensor *> &inputs, const int index, const string kernel_name,
              const TypeId &dim_type) {
  size_t size = abstract::TypeIdSize(dim_type);
  auto dim_gpu_addr =
    std::make_shared<device::gpu::GPUDeviceAddress>(inputs[index]->device_ptr(), size, kOpFormat_DEFAULT, dim_type);
  T res = 0;
  if (dim_type == kNumberTypeInt32) {
    int32_t host_dim = 0;
    dim_gpu_addr->SyncDeviceToHost(size, &host_dim);
    res = static_cast<T>(host_dim);
  } else if (dim_type == kNumberTypeInt64) {
    int64_t host_dim = 0;
    dim_gpu_addr->SyncDeviceToHost(size, &host_dim);
    res = static_cast<T>(host_dim);
  } else {
    MS_LOG(EXCEPTION) << "For '" << kernel_name << "', got unsupported data type of dim: " << dim_type;
  }
  return res;
}
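// Illustrative usage (hypothetical call site, assuming the axis is passed as input 1 with int64 type):
//   int64_t dim = GetDimValue<int64_t>(inputs, 1, kernel_name, kNumberTypeInt64);
// The value is copied from device to host, so this implies a synchronization with the device.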
// This is necessary for gpu kernels to support the uint8 data type. In CUDA, an unsigned 8-bit
// integral type is represented by an unsigned char, but the MS_REG_GPU_KERNEL macros defined below
// will create compilation errors when the data type T contains a space, because the variable created
// by the macro will also contain a space. So, we solve this problem by writing uchar when calling
// these macros, and expanding uchar after the variable has been created.
using uchar = unsigned char;
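// Illustrative sketch of the problem (hypothetical macro arguments): a registration written as
//   MS_REG_GPU_KERNEL(SomeOp, SomeOpGpuKernelMod, uchar)
// token-pastes the type into a variable name; writing "unsigned char" in its place would paste a space
// into that identifier and fail to compile.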

inline size_t GetTensorSize(std::vector<size_t> shape) {
  return std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies<size_t>());
}
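// Worked example: GetTensorSize({2, 3, 4}) == 24; an empty shape yields 1, the element count of a scalar.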
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_GPU_KERNEL_H_