/**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNEL_H_

#include <cuda.h>
#include <cudnn.h>
#include <string>
#include <vector>
#include <initializer_list>
#include <utility>
#include <map>
#include <memory>
#include <numeric>
#include <functional>
#include <algorithm>
#include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/gpu/kernel_constants.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/executor/dynamic_kernel.h"
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

// The maximum tensor shape size: 2 Giga-elements (2^31 elements).
#define SHAPE_SIZE_LIMIT 2147483648

namespace mindspore {
namespace kernel {
constexpr size_t kDim2DShapeSize = 4;
constexpr size_t kDim3DShapeSize = 5;
constexpr size_t kPoolingNbDims = kDim3DShapeSize;

static std::map<int, int> kNCHWToNHWCAxisMap = {
  {0, 0},
  {1, 3},
  {2, 1},
  {3, 2},
};
static std::map<int, int> kNHWCToNCHWAxisMap = {
  {0, 0},
  {1, 2},
  {2, 3},
  {3, 1},
};

static auto Anyone = [](auto &&k, auto &&... args) { return ((args == k) || ...); };

inline int CeilDivide(int m, int n) { return (m + n - 1) / n; }

inline int GetPad(int input, int kernel, int stride) {
  return std::max<int>(0, (CeilDivide(input, stride) - 1) * stride + kernel - input);
}
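// For example, with input = 5, kernel = 3 and stride = 2, CeilDivide(5, 2) = 3 and GetPad returns
// max(0, (3 - 1) * 2 + 3 - 5) = 2, i.e. the total padding needed for "SAME"-style coverage of the input.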

class GpuDynamicKernel : public device::DynamicKernel {
 public:
  explicit GpuDynamicKernel(const CNodePtr &cnode_ptr) : DynamicKernel(nullptr, cnode_ptr) {}
  ~GpuDynamicKernel() = default;

  void UpdateArgs() override;
  void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not be invoked with the GPU backend"; }
  void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not be invoked with the GPU backend"; }
};

class GpuKernel : public KernelMod {
 public:
  virtual ~GpuKernel() = default;
  virtual bool Init(const CNodePtr &kernel_node) = 0;
  virtual void ResetResource() noexcept {
    MS_LOG(ERROR) << "A kernel must override the `ResetResource()` method when it supports dynamic shape";
  }
  virtual void DestroyResource() noexcept {}
  virtual void PostExecute() {}

  void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<GpuDynamicKernel>(cnode_ptr); }
  device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }

 protected:
  virtual void InitResource() {}
  virtual void InitSizeLists() = 0;
  std::weak_ptr<CNode> kernel_node_;

  template <typename T>
  inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
    if (index >= addr_list.size()) {
      MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    }

    if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
      auto kernel_node = kernel_node_.lock();
      const std::string &prim_name = AnfAlgo::GetCNodeName(kernel_node);
      MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index << ", op name is: " << prim_name;
    }

    return reinterpret_cast<T *>(addr_list[index]->addr);
  }
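  // Typical usage in a derived kernel's Launch(), where `inputs` and `outputs` are the address
  // lists passed in, e.g.:
  //   T *input_addr = GetDeviceAddress<T>(inputs, 0);
  //   T *output_addr = GetDeviceAddress<T>(outputs, 0);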

  template <typename T>
  inline T *GetPossiblyNullDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
    if (index >= addr_list.size()) {
      MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    }
    // Kernels may run normally without a workspace, so addr_list[index] may be nullptr.
    if ((addr_list[index] == nullptr) || (addr_list[index]->size == 0)) {
      return nullptr;
    }
    if (addr_list[index]->addr == nullptr) {
      MS_LOG(EXCEPTION) << "The device address is empty, address index:" << index;
    }
    return reinterpret_cast<T *>(addr_list[index]->addr);
  }

  template <typename T>
  inline T GetAttr(const CNodePtr &kernel_node, const std::string &key) const {
    const PrimitivePtr &prim = AnfAlgo::GetCNodePrimitive(kernel_node);
    const ValuePtr &attr = prim->GetAttr(key);
    if (attr == nullptr) {
      const std::string &prim_name = AnfAlgo::GetCNodeName(kernel_node);
      MS_LOG(EXCEPTION) << "The attr(" << key << ") of kernel(" << prim_name << ") does not exist";
    }
    return GetValue<T>(attr);
  }
  template <typename T>
  inline T GetAttrWithDefault(const CNodePtr &kernel_node, const std::string &key, const T &value) const {
    const PrimitivePtr &prim = AnfAlgo::GetCNodePrimitive(kernel_node);
    const ValuePtr &attr = prim->GetAttr(key);
    if (attr == nullptr) {
      return value;
    }
    return GetValue<T>(attr);
  }
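  // Illustrative usage in a kernel's Init(), with a hypothetical "axis" attribute, e.g.:
  //   auto axis = GetAttr<int64_t>(kernel_node, "axis");                            // throws if missing
  //   auto format = GetAttrWithDefault<std::string>(kernel_node, "format", "NCHW");  // falls back to "NCHW"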

  // expand Nd Shape to 4d (N in [0,4])
  void ShapeNdTo4d(const std::vector<size_t> &src, std::vector<size_t> *dst) {
    if (src.size() > 4) {
      MS_EXCEPTION(ValueError) << src.size() << "-D data is not supported!";
    }
    dst->push_back(src.size() < 4 ? 1 : src[src.size() - 4]);
    dst->push_back(src.size() < 3 ? 1 : src[src.size() - 3]);
    dst->push_back(src.size() < 2 ? 1 : src[src.size() - 2]);
    dst->push_back(src.size() == 0 ? 1 : src[src.size() - 1]);
  }
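  // For example, src = {3, 5} yields *dst = {1, 1, 3, 5}; missing leading dimensions are filled with 1.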

  int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis) {
    if (((origin_data_format == kOpFormat_DEFAULT) || (origin_data_format == kOpFormat_NCHW)) &&
        (cal_format == kOpFormat_NHWC)) {
      return kNCHWToNHWCAxisMap[axis];
    } else if (((cal_format == kOpFormat_DEFAULT) || (cal_format == kOpFormat_NCHW)) &&
               (origin_data_format == kOpFormat_NHWC)) {
      return kNHWCToNCHWAxisMap[axis];
    } else {
      return axis;
    }
  }
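  // For example, the channel axis 1 of an NCHW tensor corresponds to axis 3 when computing in NHWC:
  // AxisTransform(kOpFormat_NCHW, kOpFormat_NHWC, 1) returns 3.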

  // transpose shape: NCHW To NHWC
  void ShapeNCHW2NHWC(std::vector<size_t> *shape) {
    std::swap((*shape)[1], (*shape)[3]);
    std::swap((*shape)[2], (*shape)[1]);
  }
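  // For example, {N, C, H, W} -> {N, W, H, C} after the first swap -> {N, H, W, C} after the second.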

  // transpose shape: NCDHW To NDHWC
  void ShapeNCDHW2NDHWC(std::vector<size_t> *shape) {
    std::swap((*shape)[1], (*shape)[2]);
    std::swap((*shape)[2], (*shape)[3]);
    std::swap((*shape)[3], (*shape)[4]);
  }
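  // For example, {N, C, D, H, W} -> {N, D, C, H, W} -> {N, D, H, C, W} -> {N, D, H, W, C}.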

  void SetDimA(const std::vector<size_t> &shape, int *dimA, size_t len, const std::string &format) {
    if (shape.size() != len) {
      MS_EXCEPTION(ValueError) << "Invalid size of input shape " << shape.size() << "-D with dimA " << len << "-D.";
    }
    if (Anyone(format, "NCHW", "DefaultFormat", "NCDHW")) {
      for (size_t i = 0; i < len; ++i) {
        dimA[i] = SizeToInt(shape[i]);
      }
    } else if (format == "NHWC") {
      dimA[0] = SizeToInt(shape[0]);
      dimA[1] = SizeToInt(shape[3]);
      dimA[2] = SizeToInt(shape[1]);
      dimA[3] = SizeToInt(shape[2]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }
  void SetStrideA(const std::vector<size_t> &shape, int *strideA, size_t len, const std::string &format) {
    if (shape.size() != len) {
      MS_EXCEPTION(ValueError) << "Invalid size of input shape " << shape.size() << "-D with strideA " << len << "-D.";
    }
    if (Anyone(format, "NCHW", "DefaultFormat", "NCDHW")) {
      for (size_t i = 0; i < len; ++i) {
        strideA[i] = SizeToInt(std::accumulate(shape.begin() + i + 1, shape.end(), 1, std::multiplies<size_t>()));
      }
    } else if (format == "NHWC") {
      strideA[0] = SizeToInt(shape[1] * shape[2] * shape[3]);
      strideA[1] = 1;
      strideA[2] = SizeToInt(shape[2] * shape[3]);
      strideA[3] = SizeToInt(shape[3]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }
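  // For example, a contiguous NCHW tensor of shape {2, 3, 4, 5} yields dimA = {2, 3, 4, 5} and
  // strideA = {60, 20, 5, 1}. An NHWC tensor of shape {2, 4, 5, 3} is still described in NCHW
  // dimension order (dimA = {2, 3, 4, 5}) but with strideA = {60, 1, 15, 3}, which matches an
  // NHWC memory layout as cuDNN expects.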

  void SetNCHW(const std::vector<size_t> &shape, int *n, int *c, int *h, int *w, const std::string &format) {
    if (Anyone(format, "NCHW", "DefaultFormat")) {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[1]);
      *h = SizeToInt(shape[2]);
      *w = SizeToInt(shape[3]);
    } else if (format == "NHWC") {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[3]);
      *h = SizeToInt(shape[1]);
      *w = SizeToInt(shape[2]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }

  void SetNCDHW(const std::vector<size_t> &shape, int *n, int *c, int *d, int *h, int *w, const std::string &format) {
    if (Anyone(format, "NCDHW", "DefaultFormat")) {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[1]);
      *d = SizeToInt(shape[2]);
      *h = SizeToInt(shape[3]);
      *w = SizeToInt(shape[4]);
    } else if (format == "NDHWC") {
      *n = SizeToInt(shape[0]);
      *c = SizeToInt(shape[4]);
      *d = SizeToInt(shape[1]);
      *h = SizeToInt(shape[2]);
      *w = SizeToInt(shape[3]);
    } else {
      MS_LOG(ERROR) << "Unsupported data format " << format;
    }
  }

  inline void CheckBroadcast4TensorOp(const std::vector<int> &A, const std::vector<int> &B,
                                      const std::vector<int> &Out) {
    if (A != Out && B != Out) {
      MS_EXCEPTION(ValueError)
        << "Double-sided broadcasting is not supported by cudnnOpTensor: inputA must match the corresponding "
           "dimensions of the output tensor outC, and each dimension of inputB must either match the "
           "corresponding dimension of outC or be equal to 1.";
    }
  }

  // The tensor size is limited to 2G by cudnn.
  inline void CheckTensorSize(const std::initializer_list<std::vector<size_t>> &shapes) {
    for (auto shape : shapes) {
      size_t total_size = 1;
      for (auto i : shape) {
        total_size *= i;
      }
      if (total_size >= SHAPE_SIZE_LIMIT) {
        MS_EXCEPTION(ValueError) << "The total size of the tensor exceeds the limit of 2 Giga-elements: it has "
                                 << total_size << " elements (shape " << shape << ").";
      }
    }
  }
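  // For example, a shape of {65536, 32768} holds exactly 2^31 elements and triggers the exception above.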

  // set the tensor descriptor for cudnn/cublas
  void CudnnSetTensorNdDescriptor(const std::vector<size_t> &shape, cudnnTensorDescriptor_t descriptor,
                                  cudnnDataType_t data_type, const std::weak_ptr<CNode> &node) {
    if (shape.size() < 3) {
      MS_EXCEPTION(ValueError) << "cudnnSetTensorNdDescriptor does not support " << shape.size() << "-D.";
    }
    const int nbDims = shape.size();
    std::unique_ptr<int[]> dim = std::make_unique<int[]>(nbDims);
    std::unique_ptr<int[]> stride = std::make_unique<int[]>(nbDims);

    for (int i = 0; i < nbDims; i++) {
      dim[i] = SizeToInt(shape[i]);
      stride[i] = 1;
    }

    for (int i = nbDims - 2; i >= 0; i--) {
      stride[i] = stride[i + 1] * SizeToInt(shape[i + 1]);
    }

    CHECK_CUDNN_RET_WITH_EXCEPT(node,
                                cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim.get(), stride.get()),
                                "cudnnSetTensorNdDescriptor failed");
  }
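  // For example, shape = {2, 3, 4, 5} produces dim = {2, 3, 4, 5} and the packed strides
  // stride = {60, 20, 5, 1}, i.e. the fully contiguous layout this helper always describes.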

  // choose the suitable datatype for cudnn/cublas
  inline cudnnDataType_t GetCudnnDataType(const std::string &Type) {
    auto type = kCudnnDtypeMap.find(Type);
    if (type == kCudnnDtypeMap.end()) {
      MS_EXCEPTION(TypeError) << Type << " is not supported.";
    }
    return type->second;
  }
  inline cudaDataType_t GetCudaDataType(const std::string &Type) {
    auto type = kCudaDtypeMap.find(Type);
    if (type == kCudaDtypeMap.end()) {
      MS_EXCEPTION(TypeError) << Type << " is not supported.";
    }
    return type->second;
  }

  device::DynamicKernelPtr dynamic_kernel_;
};
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNEL_H_