/**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNEL_H_

#include <cuda.h>
#include <cudnn.h>
#include <string>
#include <vector>
#include <initializer_list>
#include <utility>
#include <map>
#include <memory>
#include <numeric>
#include <functional>
#include <algorithm>
#include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/gpu/kernel_constants.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/executor/dynamic_kernel.h"
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

// The upper limit of the tensor shape size: 2 Giga-elements (2^31), one past the largest signed 32-bit integer.
#define SHAPE_SIZE_LIMIT 2147483648

namespace mindspore {
namespace kernel {
constexpr size_t kDim2DShapeSize = 4;
constexpr size_t kDim3DShapeSize = 5;
constexpr size_t kPoolingNbDims = kDim3DShapeSize;

static std::map<int, int> kNCHWToNHWCAxisMap = {
  {0, 0},
  {1, 3},
  {2, 1},
  {3, 2},
};
static std::map<int, int> kNHWCToNCHWAxisMap = {
  {0, 0},
  {1, 2},
  {2, 3},
  {3, 1},
};
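// Example: for an axis given in NCHW order, the channel axis 1 maps to axis 3 under NHWC
// (kNCHWToNHWCAxisMap[1] == 3), and the height axis 2 maps to axis 1.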

static auto Anyone = [](auto &&k, auto &&... args) { return ((args == k) || ...); };

inline int CeilDivide(int m, int n) { return (m + n - 1) / n; }

inline int GetPad(int input, int kernel, int stride) {
  return std::max<int>(0, (CeilDivide(input, stride) - 1) * stride + kernel - input);
}
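
// Worked example (illustrative): with input = 28, kernel = 3, stride = 2, CeilDivide(28, 2) == 14
// output positions, so GetPad returns max(0, (14 - 1) * 2 + 3 - 28) == 1 element of total "SAME" padding.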

class GpuDynamicKernel : public device::DynamicKernel {
 public:
  explicit GpuDynamicKernel(const CNodePtr &cnode_ptr) : DynamicKernel(nullptr, cnode_ptr) {}
  ~GpuDynamicKernel() = default;

  void UpdateArgs() override;
  void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not be invoked on the GPU backend"; };
  void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not be invoked on the GPU backend"; }
};

class GpuKernel : public KernelMod {
 public:
  virtual ~GpuKernel() = default;
  virtual bool Init(const CNodePtr &kernel_node) = 0;
  virtual void ResetResource() noexcept {
    MS_LOG(ERROR) << "The kernel must override the `ResetResource()` method when it supports dynamic shape";
  }
  virtual void DestroyResource() noexcept {}
  virtual void PostExecute() {}

  void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<GpuDynamicKernel>(cnode_ptr); }
  device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }

 protected:
  virtual void InitResource() {}
  virtual void InitSizeLists() = 0;
  std::weak_ptr<CNode> kernel_node_;

  template <typename T>
  inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
    if (index >= addr_list.size()) {
      MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    }

    if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
      auto kernel_node = kernel_node_.lock();
      const std::string &prim_name = AnfAlgo::GetCNodeName(kernel_node);
      MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index << ", op name is: " << prim_name;
    }

    return reinterpret_cast<T *>(addr_list[index]->addr);
  }
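
  // Typical use inside a concrete kernel's Launch() (illustrative sketch; `inputs` and `outputs`
  // are the AddressPtr vectors passed to Launch):
  //   T *input_addr = GetDeviceAddress<T>(inputs, 0);
  //   T *output_addr = GetDeviceAddress<T>(outputs, 0);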

  template <typename T>
  inline T *GetPossiblyNullDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
    if (index >= addr_list.size()) {
      MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    }
    // Kernels may run normally without a workspace, so addr_list[index] may be nullptr.
    if ((addr_list[index] == nullptr) || (addr_list[index]->size == 0)) {
      return nullptr;
    }
    if (addr_list[index]->addr == nullptr) {
      MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index;
    }
    return reinterpret_cast<T *>(addr_list[index]->addr);
  }

  template <typename T>
  inline T GetAttr(const CNodePtr &kernel_node, const std::string &key) const {
    const PrimitivePtr &prim = AnfAlgo::GetCNodePrimitive(kernel_node);
    const ValuePtr &attr = prim->GetAttr(key);
    if (attr == nullptr) {
      const std::string &prim_name = AnfAlgo::GetCNodeName(kernel_node);
      MS_LOG(EXCEPTION) << "The attr(" << key << ") of kernel(" << prim_name << ") does not exist";
    }
    return GetValue<T>(attr);
  }
  template <typename T>
  inline T GetAttrWithDefault(const CNodePtr &kernel_node, const std::string &key, const T &value) const {
    const PrimitivePtr &prim = AnfAlgo::GetCNodePrimitive(kernel_node);
    const ValuePtr &attr = prim->GetAttr(key);
    if (attr == nullptr) {
      return value;
    }
    return GetValue<T>(attr);
  }
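
  // Illustrative use in a kernel's Init() (the attribute names here are examples, not guaranteed
  // to exist on every op):
  //   auto data_format = GetAttrWithDefault<std::string>(kernel_node, "format", std::string("NCHW"));
  //   auto keep_dims = GetAttr<bool>(kernel_node, "keep_dims");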
  // Expand an N-d shape to 4-d by left-padding with 1s (N in [0, 4]).
  void ShapeNdTo4d(const std::vector<size_t> &src, std::vector<size_t> *dst) {
    if (src.size() > 4) {
      MS_EXCEPTION(ValueError) << src.size() << "-D data is not supported!";
    }
    dst->push_back(src.size() < 4 ? 1 : src[src.size() - 4]);
    dst->push_back(src.size() < 3 ? 1 : src[src.size() - 3]);
    dst->push_back(src.size() < 2 ? 1 : src[src.size() - 2]);
    dst->push_back(src.size() == 0 ? 1 : src[src.size() - 1]);
  }
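
  // Example: a 2-D shape {5, 7} becomes {1, 1, 5, 7}; a scalar (empty shape) becomes {1, 1, 1, 1}.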

  int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis) {
    if (((origin_data_format == kOpFormat_DEFAULT) || (origin_data_format == kOpFormat_NCHW)) &&
        (cal_format == kOpFormat_NHWC)) {
      return kNCHWToNHWCAxisMap[axis];
    } else if (((cal_format == kOpFormat_DEFAULT) || (cal_format == kOpFormat_NCHW)) &&
               (origin_data_format == kOpFormat_NHWC)) {
      return kNHWCToNCHWAxisMap[axis];
    } else {
      return axis;
    }
  }

  // transpose shape: NCHW to NHWC
  void ShapeNCHW2NHWC(std::vector<size_t> *shape) {
    std::swap((*shape)[1], (*shape)[3]);
    std::swap((*shape)[2], (*shape)[1]);
  }

  // transpose shape: NCDHW to NDHWC
  void ShapeNCDHW2NDHWC(std::vector<size_t> *shape) {
    std::swap((*shape)[1], (*shape)[2]);
    std::swap((*shape)[2], (*shape)[3]);
    std::swap((*shape)[3], (*shape)[4]);
  }
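
  // Example: the three swaps rotate C to the end in place, e.g. {N, C, D, H, W} = {2, 3, 4, 5, 6}
  // becomes {2, 4, 5, 6, 3} = {N, D, H, W, C}.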

  void SetDimA(const std::vector<size_t> &shape, int *dimA, size_t len, const std::string &format) {
    if (shape.size() != len) {
      MS_EXCEPTION(ValueError) << "Invalid size of input shape " << shape.size() << "-D with dimA " << len << "-D.";
    }
    if (Anyone(format, "NCHW", "DefaultFormat", "NCDHW")) {
      for (size_t i = 0; i < len; ++i) {
        dimA[i] = SizeToInt(shape[i]);
      }
    } else if (format == "NHWC") {
      dimA[0] = SizeToInt(shape[0]);
      dimA[1] = SizeToInt(shape[3]);
      dimA[2] = SizeToInt(shape[1]);
      dimA[3] = SizeToInt(shape[2]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }
  void SetStrideA(const std::vector<size_t> &shape, int *strideA, size_t len, const std::string &format) {
    if (shape.size() != len) {
      MS_EXCEPTION(ValueError) << "Invalid size of input shape " << shape.size() << "-D with strideA " << len << "-D.";
    }
    if (Anyone(format, "NCHW", "DefaultFormat", "NCDHW")) {
      for (size_t i = 0; i < len; ++i) {
        strideA[i] = SizeToInt(accumulate(shape.begin() + i + 1, shape.end(), 1, std::multiplies<size_t>()));
      }
    } else if (format == "NHWC") {
      strideA[0] = SizeToInt(shape[1] * shape[2] * shape[3]);
      strideA[1] = 1;
      strideA[2] = SizeToInt(shape[2] * shape[3]);
      strideA[3] = SizeToInt(shape[3]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }
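
  // Example: an NHWC shape {2, 4, 5, 3} yields dimA {2, 3, 4, 5} (NCHW order for cudnn) and
  // strideA {60, 1, 15, 3}, i.e. the strides of the physically NHWC-packed buffer in that same order.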

  void SetNCHW(const std::vector<size_t> &shape, int *n, int *c, int *h, int *w, const std::string &format) {
    if (Anyone(format, "NCHW", "DefaultFormat")) {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[1]);
      *h = SizeToInt(shape[2]);
      *w = SizeToInt(shape[3]);
    } else if (format == "NHWC") {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[3]);
      *h = SizeToInt(shape[1]);
      *w = SizeToInt(shape[2]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }

  void SetNCDHW(const std::vector<size_t> &shape, int *n, int *c, int *d, int *h, int *w, const std::string &format) {
    if (Anyone(format, "NCDHW", "DefaultFormat")) {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[1]);
      *d = SizeToInt(shape[2]);
      *h = SizeToInt(shape[3]);
      *w = SizeToInt(shape[4]);
    } else if (format == "NDHWC") {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[4]);
      *d = SizeToInt(shape[1]);
      *h = SizeToInt(shape[2]);
      *w = SizeToInt(shape[3]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }

  inline void CheckBroadcast4TensorOp(const std::vector<int> &A, const std::vector<int> &B,
                                      const std::vector<int> &Out) {
    if (A != Out && B != Out) {
      MS_EXCEPTION(ValueError)
        << "cudnnOpTensor does not support double-sided broadcasting: each dimension of inputA must match the "
           "corresponding dimension of the output tensor outC, and each dimension of inputB must match the "
           "corresponding dimension of outC or be equal to 1.";
    }
  }
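
  // Example: A = {2, 3, 4, 4}, B = {2, 3, 1, 1}, Out = {2, 3, 4, 4} passes (A == Out, B broadcasts),
  // while A = {2, 3, 1, 1}, B = {2, 1, 4, 4}, Out = {2, 3, 4, 4} throws, since neither input matches Out.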

  // The tensor size is limited to 2G by cudnn.
  inline void CheckTensorSize(const std::initializer_list<std::vector<size_t>> &shapes) {
    for (auto shape : shapes) {
      size_t total_size = 1;
      for (auto i : shape) {
        total_size *= i;
      }
      if (total_size >= SHAPE_SIZE_LIMIT) {
        MS_EXCEPTION(ValueError) << "The total size of the tensor exceeds the max_limit of 2 Giga-elements, which is "
                                 << total_size << " elements (" << shape << ").";
      }
    }
  }
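
  // Example: a shape of {1024, 1024, 2048} holds exactly 2^31 elements and therefore triggers the exception.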

  // set the tensor descriptor for cudnn/cublas
  void CudnnSetTensorNdDescriptor(const std::vector<size_t> &shape, cudnnTensorDescriptor_t descriptor,
                                  cudnnDataType_t data_type, const std::weak_ptr<CNode> &node) {
    if (shape.size() < 3) {
      MS_EXCEPTION(ValueError) << "cudnnSetTensorNdDescriptor does not support " << shape.size() << "-D shapes.";
    }
    const int nbDims = shape.size();
    std::unique_ptr<int[]> dim = std::make_unique<int[]>(nbDims);
    std::unique_ptr<int[]> stride = std::make_unique<int[]>(nbDims);

    for (int i = 0; i < nbDims; i++) {
      dim[i] = SizeToInt(shape[i]);
      stride[i] = 1;
    }

    for (int i = nbDims - 2; i >= 0; i--) {
      stride[i] = stride[i + 1] * SizeToInt(shape[i + 1]);
    }

    CHECK_CUDNN_RET_WITH_EXCEPT(node,
                                cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim.get(), stride.get()),
                                "cudnnSetTensorNdDescriptor failed");
  }
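
  // Example: a 5-D NCDHW shape {2, 3, 4, 5, 6} produces dim {2, 3, 4, 5, 6} and fully packed
  // strides {360, 120, 30, 6, 1} before the descriptor is set.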

  // choose the suitable datatype for cudnn/cublas
  inline cudnnDataType_t GetCudnnDataType(const std::string &Type) {
    auto type = kCudnnDtypeMap.find(Type);
    if (type == kCudnnDtypeMap.end()) {
      MS_EXCEPTION(TypeError) << Type << " is not supported.";
    }
    return type->second;
  }
  inline cudaDataType_t GetCudaDataType(const std::string &Type) {
    auto type = kCudaDtypeMap.find(Type);
    if (type == kCudaDtypeMap.end()) {
      MS_EXCEPTION(TypeError) << Type << " is not supported.";
    }
    return type->second;
  }
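
  // Typical call site in a kernel's Init() (illustrative sketch; assumes the usual TypeIdLabel lookup
  // of the input device dtype):
  //   cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));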

  device::DynamicKernelPtr dynamic_kernel_;
};
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNEL_H_