/**
 * Copyright 2019-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_GPU_KERNEL_H_

#include <cuda.h>
#include <cudnn.h>
#include <string>
#include <vector>
#include <initializer_list>
#include <utility>
#include <map>
#include <memory>
#include <numeric>
#include <functional>
#include <algorithm>
#include <tuple>
#include <set>
#include <optional>
#include "kernel/kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_mod.h"
#include "plugin/factory/ms_factory.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
#include "plugin/device/gpu/hal/device/gpu_device_address.h"
#include "plugin/device/gpu/hal/device/gpu_common.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "kernel/kernel_build_info.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"

using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

// The upper limit of the tensor element count: 2 Giga-elements (2^31, which already exceeds the signed 32-bit range).
#define SHAPE_SIZE_LIMIT 2147483648

namespace mindspore {
namespace kernel {
constexpr size_t kShapeIndex1st = 1;
constexpr size_t kShapeIndex2nd = 2;
constexpr size_t kShapeIndex3rd = 3;
constexpr size_t kShapeIndex4th = 4;
constexpr size_t kShapeIndex5nd = 5;
constexpr size_t kShapeIndex6rd = 6;
constexpr size_t kShapeIndex7th = 7;

constexpr size_t kDim2DShapeSize = 4;
constexpr size_t kDim3DShapeSize = 5;
constexpr size_t kPoolingNbDims = kDim3DShapeSize;

constexpr size_t kHelperDimsNum = 5;

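// Axis index remapping between the NCHW and NHWC layouts; for example, the channel axis
// (index 1 in NCHW) maps to index 3 in NHWC.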
static std::map<int, int> kNCHWToNHWCAxisMap = {
  {0, 0},
  {1, 3},
  {2, 1},
  {3, 2},
};
static std::map<int, int> kNHWCToNCHWAxisMap = {
  {0, 0},
  {1, 2},
  {2, 3},
  {3, 1},
};

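// Returns true if k equals any of the remaining arguments. Illustrative use (the constants are
// examples only): Anyone(format, kOpFormat_NHWC, kOpFormat_NCHW).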
static auto Anyone = [](auto &&k, auto &&... args) { return ((args == k) || ...); };

inline int CeilDivide(int m, int n) { return (m + n - 1) / n; }

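// Total padding needed so that a window of size `kernel` with the given `stride` covers the
// whole input ("SAME"-style padding). Worked example: GetPad(5, 3, 2) =
// max(0, (CeilDivide(5, 2) - 1) * 2 + 3 - 5) = 2.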
inline int GetPad(int input, int kernel, int stride) {
  return std::max<int>(0, (CeilDivide(input, stride) - 1) * stride + kernel - input);
}

// Choose the suitable datatype for cudnn
inline cudnnDataType_t GetCudnnDataType(const std::string &Type) {
  auto type = kCudnnDtypeMap.find(Type);
  if (type == kCudnnDtypeMap.end()) {
    MS_EXCEPTION(TypeError) << Type << " is not supported.";
  }
  return type->second;
}

// Choose the suitable datatype for cublas
inline cudaDataType_t GetCudaDataType(const std::string &Type) {
  auto type = kCudaDtypeMap.find(Type);
  if (type == kCudaDtypeMap.end()) {
    MS_EXCEPTION(TypeError) << Type << " is not supported.";
  }
  return type->second;
}
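// Illustrative usage of the two lookup helpers above, assuming the maps are keyed by TypeId
// label strings (e.g. "kNumberTypeFloat32") and `input_type_id` is a TypeId obtained elsewhere:
//   cudnnDataType_t cudnn_dtype = GetCudnnDataType(TypeIdLabel(input_type_id));
//   cudaDataType_t cuda_dtype = GetCudaDataType(TypeIdLabel(input_type_id));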

class NativeGpuKernelMod : public GpuKernelMod {
 public:
  using ReduceDetail = std::tuple<size_t, TypeId, TypeId>;
  using ReducePrecisonRes = std::tuple<bool, std::vector<ReduceDetail>, std::vector<ReduceDetail>>;

  virtual void DestroyResource() noexcept {}
  bool CheckSupport(const std::string &kernel_name, const KernelAttr &kernel_attr);
  std::vector<KernelAttr> GetAllSupportedList(const std::string &kernel_name);
  ReducePrecisonRes ReducePrecisionCheck(const std::string &kernel_name, const KernelAttr &kernel_attr);
  static std::vector<KernelAttr> GetGpuSupportedList(const std::string &kernel_name) {
    if (!Factory<NativeGpuKernelMod>::Instance().IsRegistered(kernel_name)) {
      return {};
    }
    return Factory<NativeGpuKernelMod>::Instance().Create(kernel_name)->GetAllSupportedList(kernel_name);
  }
  std::vector<KernelAttr> GetOpSupport() { return {}; }
  static bool GpuCheckSupport(const std::string &kernel_name, const KernelAttr &kernel_attr);

  static ReducePrecisonRes GpuReducePrecisionCheck(const std::string &kernel_name, const KernelAttr &kernel_attr) {
    return Factory<NativeGpuKernelMod>::Instance().Create(kernel_name)->ReducePrecisionCheck(kernel_name, kernel_attr);
  }
  enum KernelModType GetKernelModType() const override { return KernelModType::NativeGpuKernelMod; }

 protected:
  virtual void InitResource() {}
  static mindspore::HashMap<std::string, std::vector<KernelAttr>> support_map_;
};

std::vector<void *> ConvertPtrs(const std::vector<KernelTensor *> &input_ptrs);

// expand Nd Shape to 4d (N in [0,4])
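// e.g. a 2-d shape {5, 6} is padded with leading 1s to {1, 1, 5, 6}.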
bool ShapeNdTo4d(const ShapeVector &src, ShapeVector *dst);

template <typename T>
inline T *GetPossiblyNullDeviceAddress(const std::vector<KernelTensor *> &addr_list, size_t index) {
  if (index >= addr_list.size()) {
    MS_LOG(ERROR) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    return nullptr;
  }
  // Kernels may run normally without a workspace, so addr_list[index] may be nullptr.
  if ((addr_list[index] == nullptr) || (addr_list[index]->size() == 0)) {
    return nullptr;
  }
  if (addr_list[index]->device_ptr() == nullptr) {
    MS_LOG(ERROR) << "The device address is empty, address index:" << index;
    return nullptr;
  }
  return reinterpret_cast<T *>(addr_list[index]->device_ptr());
}
template <typename T>
inline T *GetPossiblyNullDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
  if (index >= addr_list.size()) {
    MS_LOG(ERROR) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
    return nullptr;
  }
  // Kernels may run normally without a workspace, so addr_list[index] may be nullptr.
  if ((addr_list[index] == nullptr) || (addr_list[index]->size == 0)) {
    return nullptr;
  }
  if (addr_list[index]->addr == nullptr) {
    MS_LOG(ERROR) << "The device address is empty, address index:" << index;
    return nullptr;
  }
  return reinterpret_cast<T *>(addr_list[index]->addr);
}
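// Illustrative usage of the overloads above inside a kernel's Launch(); index 0 is an example
// only, and the returned pointer must still be checked because a workspace slot can be absent:
//   T *workspace_addr = GetPossiblyNullDeviceAddress<T>(workspace, 0);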

int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis);

// transpose shape: NCHW To NHWC
void ShapeNCHW2NHWC(ShapeVector *shape);

// transpose shape: NCDHW To NDHWC
void ShapeNCDHW2NDHWC(ShapeVector *shape);

//////////////// old: format string /////////////
void SetDimA(const ShapeVector &shape, int *dimA, size_t len, const std::string &format);

void SetStrideA(const ShapeVector &shape, int *strideA, size_t len, const std::string &format);

void SetNCHW(const ShapeVector &shape, int *n, int *c, int *h, int *w, const std::string &format);

void SetNCDHW(const ShapeVector &shape, int *n, int *c, int *d, int *h, int *w, const std::string &format);
////////////////////////////////////////////////
//////////////// new: format enum ///////////////
void SetDimA(const ShapeVector &shape, int *dimA, size_t len, const mindspore::Format &format);

void SetStrideA(const ShapeVector &shape, int *strideA, size_t len, const mindspore::Format &format);

void SetNCHW(const ShapeVector &shape, int *n, int *c, int *h, int *w, const mindspore::Format &format);

void SetNCDHW(const ShapeVector &shape, int *n, int *c, int *d, int *h, int *w, const mindspore::Format &format);
////////////////////////////////////////////////

bool CheckBroadcast4TensorOp(const std::vector<int> &A, const std::vector<int> &B, const std::vector<int> &Out);

// The tensor size is limited to 2G by cudnn.
bool CheckTensorSize(const std::initializer_list<ShapeVector> &shapes);

// set the tensor descriptor for cudnn/cublas
bool CudnnSetTensorNdDescriptor(const ShapeVector &shape, cudnnTensorDescriptor_t descriptor, cudnnDataType_t data_type,
                                const std::string &node_name);

// choose the suitable datatype for cudnn/cublas
bool GetCudnnDataType(const std::string &Type, cudnnDataType_t *out_type);

bool GetCudaDataType(const std::string &Type, cudaDataType_t *out_type);

bool ShapeEqual(const ShapeVector &s1, const ShapeVector &s2);

template <typename T>
T GetDimValue(const std::vector<KernelTensor *> &inputs, const int index, const string kernel_name,
              const TypeId &dim_type) {
  size_t size = abstract::TypeIdSize(dim_type);
  auto dim_gpu_addr =
    std::make_shared<device::gpu::GPUDeviceAddress>(inputs[index]->device_ptr(), size, kOpFormat_DEFAULT, dim_type);
  T res = 0;
  if (dim_type == kNumberTypeInt32) {
    int32_t host_dim = 0;
    dim_gpu_addr->SyncDeviceToHost(size, &host_dim);
    res = static_cast<T>(host_dim);
  } else if (dim_type == kNumberTypeInt64) {
    int64_t host_dim = 0;
    dim_gpu_addr->SyncDeviceToHost(size, &host_dim);
    res = static_cast<T>(host_dim);
  } else {
    MS_LOG(EXCEPTION) << "For '" << kernel_name << "', got unsupported data type of dim: " << dim_type;
  }
  return res;
}
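// Illustrative usage of GetDimValue above; the input index and the dtype accessor are
// assumptions, not part of this header:
//   auto dim = GetDimValue<int64_t>(inputs, 1, kernel_name_, inputs[1]->dtype_id());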
// This is necessary for gpu kernels to support the uint8 data type. In CUDA, an unsigned
// 8-bit integral type is represented by an unsigned char, but the MS_REG_GPU_KERNEL
// registration macros create compilation errors when the datatype T contains a space,
// because the variable name generated by the macro would then also contain a space. We solve
// this by writing uchar when invoking these macros and letting the alias expand to
// unsigned char after the variable has been created.
using uchar = unsigned char;
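// Illustrative registration (the kernel name, attr and class are placeholders):
//   MS_REG_GPU_KERNEL_ONE(SomeOp, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
//                         SomeOpGpuKernelMod, uchar)
// Passing `unsigned char` instead of `uchar` would paste a space into the generated identifier.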

inline size_t GetTensorSize(std::vector<size_t> shape) {
  return std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies<size_t>());
}
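// For example, GetTensorSize({2, 3, 4}) above returns 24 (the element count, not the byte size).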
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_GPU_KERNEL_H_