/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/gpu/kernel/dynamic_akg/dynamic_akg_gpu_kernel_mod.h"
#include <fstream>
#include <numeric>
#include <functional>
#include <chrono>
#include <cmath>
#include <algorithm>
#include <map>
#include "nlohmann/json.hpp"
#include "utils/ms_utils.h"
#include "kernel/framework_utils.h"
#include "mindspore/ccsrc/include/common/debug/common.h"
#include "plugin/device/gpu/hal/device/gpu_common.h"

namespace mindspore {
namespace kernel {
using std::fstream;
using std::string;
using std::unordered_map;
using std::vector;

constexpr auto kStaticTileImpl = "StaticTileImpl";
constexpr auto kSupportInfo = "SupportInfo";

constexpr int64_t kRemove = -100000;
constexpr int64_t kKeep = -99999;

DynamicAkgGpuKernelManagerPtr DynamicAkgGpuKernelMod::kernel_manager_ = std::make_shared<DynamicAkgGpuKernelManager>();
DynamicAkgGpuKernelManager::DynamicAkgGpuKernelManager() {}

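// Loads the compiled kernel image into a CUDA module, resolves the function handle by kernel name,
// and caches the resulting GpuKernelMeta in infotable_.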
CUresult DynamicAkgGpuKernelManager::GetCUResult(const char *kernel_content, bool force_reload,
                                                 vector<uint32_t> *thread_info, CUfunction *func,
                                                 const string kernel_name) {
  string fn = kernel_name;
  CUmodule module;
  CUjit_option options[] = {};
  void *optionValues[] = {};
  CUresult result = cuModuleLoadDataEx(&module, kernel_content, 0, options, optionValues);
  if (result != CUDA_SUCCESS) {
    const char *msg = nullptr;
    cuGetErrorName(result, &msg);
    MS_LOG(ERROR) << "cuModuleLoadDataEx failed. Kernel name: " << fn << ". Error message: " << msg;
    return result;
  }
  result = cuModuleGetFunction(func, module, fn.c_str());
  if (result != CUDA_SUCCESS) {
    const char *msg = nullptr;
    cuGetErrorName(result, &msg);
    MS_LOG(ERROR) << "cuModuleGetFunction failed. Kernel name: " << fn << ". Error message: " << msg;
    return result;
  }
  infotable_[fn] = std::make_shared<GpuKernelMeta>(*func, module, *thread_info);
  return result;
}

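// Validates that the kernel pack carries both json and kernel binary contents before delegating to GetCUResult.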
CUresult DynamicAkgGpuKernelManager::GetFunction(const KernelPackPtr &kernel_pack, bool force_reload,
                                                 vector<uint32_t> *thread_info, CUfunction *func,
                                                 const string kernel_name) {
  if (kernel_pack->GetJson() == nullptr || kernel_pack->GetJson()->contents == nullptr ||
      kernel_pack->GetKernel() == nullptr || kernel_pack->GetKernel()->contents == nullptr) {
    MS_LOG(ERROR) << "Invalid kernel pack, json or kernel contents are nullptr for kernel: " << kernel_name << ".";
    return CUDA_ERROR_INVALID_IMAGE;
  }
  return GetCUResult(&kernel_pack->GetKernel()->contents[0], force_reload, thread_info, func, kernel_name);
}

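// Rebuilds shape_list_ from the current input and output shapes and propagates it to every kernel implementation.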
void DynamicAkgGpuKernelMod::UpdateShapeList(const std::vector<KernelTensor *> &inputs,
                                             const std::vector<KernelTensor *> &outputs) {
  shape_list_.clear();
  for (size_t i = 0; i < inputs.size(); i++) {
    auto in_shape = inputs[i]->GetShapeVector();
    (void)shape_list_.emplace_back(in_shape);
  }
  for (size_t i = 0; i < outputs.size(); i++) {
    auto out_shape = outputs[i]->GetShapeVector();
    (void)shape_list_.emplace_back(out_shape);
  }
  for (const auto &it : kernel_map_) {
    it.second->shape_list_ = shape_list_;
  }
  MS_LOG(INFO) << "Done UpdateShapeList for " << kernel_name_ << " shape_list_ = " << shape_list_;
}

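// Reads the static block/thread dimensions from the parsed kernel json into thread_info_.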
void DynamicAkgGpuKernelMod::UpdateStaticShapeMappingInfo() {
  thread_info_.clear();
  thread_info_.emplace_back(parsed_js_[kBlockIdxX]);
  thread_info_.emplace_back(parsed_js_[kBlockIdxY]);
  thread_info_.emplace_back(parsed_js_[kBlockIdxZ]);
  thread_info_.emplace_back(parsed_js_[kThreadIdxX]);
  thread_info_.emplace_back(parsed_js_[kThreadIdxY]);
  thread_info_.emplace_back(parsed_js_[kThreadIdxZ]);
  MS_LOG(INFO) << "Done UpdateStaticShapeMappingInfo for " << kernel_name_;
}

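// Parses the kernel json from the kernel pack (when available) and records the kernel name.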
DynamicAkgGpuKernelMod::DynamicAkgGpuKernelMod(const KernelPackPtr &kernel_pack) : kernel_pack_(kernel_pack) {
  if (kernel_pack != nullptr) {
    auto js = kernel_pack->GetJson();
    if (js != nullptr) {
      parsed_js_ = nlohmann::json::parse(js->contents, js->contents + js->len);
      kernel_name_ = parsed_js_["kernelName"];
    }
  }
}

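// Lazily parses the kernel json; raises a RuntimeError if the kernel pack or its json is missing.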
void DynamicAkgGpuKernelMod::CheckJsonParsed() {
  if (parsed_js_ != nullptr) {
    return;
  }
  if (kernel_pack_ == nullptr) {
    MS_EXCEPTION(RuntimeError) << "Invalid kernel pack for kernel: " << kernel_name_ << ".";
  }
  auto js = kernel_pack_->GetJson();
  if (js == nullptr) {
    MS_EXCEPTION(RuntimeError) << "Invalid kernel pack, json is nullptr for kernel: " << kernel_name_ << ".";
  }
  parsed_js_ = nlohmann::json::parse(js->contents, js->contents + js->len);
}

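// For dynamic-shape kernels, builds the tile implementations; for static-shape kernels,
// loads the gpu mapping config and checks that all six launch dimensions are positive.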
void DynamicAkgGpuKernelMod::Initialize() {
  CheckJsonParsed();
  if (is_dynamic_) {
    InitAkgKernelImpls();
  } else {
    UpdateStaticShapeMappingInfo();
    const size_t thread_info_num = 6;
    if (thread_info_.size() != thread_info_num ||
        (std::any_of(thread_info_.begin(), thread_info_.end(), [](uint32_t t) { return t == 0; }))) {
      MS_EXCEPTION(ValueError) << "For " << kernel_name_
                               << ", gpu mapping config must be updated to 6 positive numbers before "
                               << "launch, but got thread_info = " << thread_info_;
    }
  }
}

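// Registers the dynamic-tile implementation under the kernel name, plus an optional static-tile
// implementation described in the json, then initializes shape and mapping information for each.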
void DynamicAkgGpuKernelMod::InitAkgKernelImpls() {
  kernel_map_[kernel_name_] = std::make_shared<DynamicTileImpl>(kernel_name_, parsed_js_);
  if (parsed_js_.find(kStaticTileImpl) != parsed_js_.end()) {
    auto static_kernel_json = parsed_js_.at(kStaticTileImpl);
    auto static_kernel_name = static_kernel_json["kernelName"];
    kernel_map_[static_kernel_name] = std::make_shared<StaticTileImpl>(static_kernel_name, static_kernel_json);
  }
  for (const auto &it : kernel_map_) {
    it.second->InitJsonShapeInformation();
    it.second->InitJsonMappingInformation();
    if (it.second->parsed_js_[kSupportInfo]["OperatorType"] == "Reduce") {
      it.second->preprocessDynamicReduceTiling();
    }
  }
  MS_LOG(INFO) << "Done InitAkgKernelImpls for " << kernel_name_ << ", kernel impl count: " << kernel_map_.size();
}

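// Chooses between the default dynamic-tile kernel and an optional static-tile kernel:
// the static-tile kernel is preferred when any runtime thread-mapping variable has a small upper bound.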
AkgKernelImplInfoPtr DynamicAkgGpuKernelMod::SelectKernelImpl() {
  if (kernel_map_.find(kernel_name_) == kernel_map_.end()) {
    MS_EXCEPTION(RuntimeError) << "No default kernel for " << kernel_name_;
  }
  auto default_kernel = kernel_map_[kernel_name_];
  AkgKernelImplInfoPtr static_kernel = nullptr;
  for (const auto &it : kernel_map_) {
    if (it.second == default_kernel) {
      continue;
    }
    static_kernel = it.second;
  }
  if (static_kernel == nullptr) {
    MS_LOG(DEBUG) << "For " << kernel_name_ << ", only the default kernel is available, return";
    return default_kernel;
  }
  if (default_kernel->runtime_vars_.empty()) {
    MS_LOG(DEBUG) << "For " << kernel_name_ << ", default kernel is static tile, return";
    return default_kernel;
  }
  static_kernel->Init();
  default_kernel->Init();
  for (const auto &it : default_kernel->runtime_vars_) {
    MS_LOG(INFO) << "Runtime var: " << it.second->ToString();
    bool is_thread = it.second->curr_map_id >= AKG_KERNEL_MOD_TX_IDX && it.second->curr_map_id <= AKG_KERNEL_MOD_TZ_IDX;
    const int64_t upper_bound_limit = 32;
    if (is_thread && it.second->upper_bound <= upper_bound_limit) {
      return static_kernel;
    }
  }
  MS_LOG(INFO) << kernel_name_ << " uses dynamic tile kernel, shape " << shape_list_ << "; static thread info "
               << static_kernel->thread_info_;
  return default_kernel;
}

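// Refreshes the shape list, re-selects the kernel implementation for the new shapes,
// and resizes the selected implementation.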
int DynamicAkgGpuKernelMod::Resize(const std::vector<KernelTensor *> &inputs,
                                   const std::vector<KernelTensor *> &outputs) {
  int ret = KernelMod::Resize(inputs, outputs);
  UpdateShapeList(inputs, outputs);
  kernel_impl_ = SelectKernelImpl();
  kernel_impl_->Resize();
  MS_LOG(INFO) << "Done resize for DynamicAkgGpuKernelMod for " << kernel_name_;
  return ret;
}

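// Assembles the launch arguments (handling removed/kept/shape arguments in the dynamic-shape case),
// loads the CUDA function on first use, and launches the kernel on the given stream.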
bool DynamicAkgGpuKernelMod::Launch(const std::vector<KernelTensor *> &inputs,
                                    const std::vector<KernelTensor *> &workspace,
                                    const std::vector<KernelTensor *> &outputs, void *stream_ptr) {
  if (kernel_impl_ != nullptr) {
    kernel_name_ = kernel_impl_->kernel_name_;
    thread_info_ = kernel_impl_->thread_info_;
    arg_size_vec_ = kernel_impl_->arg_size_vec_;
  }

  if (stream_ptr == nullptr) {
    MS_LOG(ERROR) << "stream_ptr should not be nullptr. Kernel name: " << kernel_name_;
    return false;
  }
  if (kernel_pack_ == nullptr) {
    MS_LOG(ERROR) << "kernel pack should not be nullptr. Kernel name: " << kernel_name_;
    return false;
  }
  MS_LOG(INFO) << "Start Launch for " << kernel_name_;
  CUresult result;
  if (kernel_addr_ == nullptr) {
    result = kernel_manager_->GetFunction(kernel_pack_, false, &thread_info_, &kernel_addr_, kernel_name_);
    if (result != CUDA_SUCCESS) {
      const char *msg = nullptr;
      cuGetErrorName(result, &msg);
      MS_LOG(ERROR) << "Get function " << kernel_name_ << " failed. Error message: " << msg;
      return false;
    }
  }

  vector<void *> runtimeargs;
  vector<void *> dev_addrs;
  vector<size_t> shp;
  CUdeviceptr dev_ptr_fake = 0;
  if (is_dynamic_) {
    runtimeargs.reserve(arg_size_vec_.size());
    dev_addrs.reserve(arg_size_vec_.size());
    shp.reserve(arg_size_vec_.size());
    size_t inum = 0;
    size_t onum = 0;
    for (size_t idx = 0; idx < arg_size_vec_.size(); idx++) {
      if (arg_size_vec_[idx] == kRemove) {
        // Removed arguments are padded with a placeholder device pointer.
        runtimeargs.push_back(reinterpret_cast<void *>(&dev_ptr_fake));
      } else if (arg_size_vec_[idx] == kKeep) {
        // Kept arguments are bound to the real input/output device addresses, inputs first.
        if (inum < inputs.size()) {
          runtimeargs.push_back(reinterpret_cast<void *>(&dev_addrs.emplace_back(inputs[inum]->device_ptr())));
          inum++;
        } else if (onum < outputs.size()) {
          runtimeargs.push_back(reinterpret_cast<void *>(&dev_addrs.emplace_back(outputs[onum]->device_ptr())));
          onum++;
        }
      } else {
        // Remaining entries are shape values passed by address.
        size_t arg_size = static_cast<size_t>(arg_size_vec_[idx]);
        runtimeargs.push_back(reinterpret_cast<void *>(&shp.emplace_back(arg_size)));
      }
    }
  } else {
    runtimeargs.reserve(inputs.size() + outputs.size() + workspace.size());
    dev_addrs.reserve(inputs.size() + outputs.size() + workspace.size());
    (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs),
                         [&dev_addrs](const KernelTensor *input) {
                           return reinterpret_cast<void *>(&dev_addrs.emplace_back(input->device_ptr()));
                         });
    (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
                         [&dev_addrs](const KernelTensor *output) {
                           return reinterpret_cast<void *>(&dev_addrs.emplace_back(output->device_ptr()));
                         });
    if (!workspace.empty()) {
      (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs),
                           [&dev_addrs](const KernelTensor *ws) {
                             return reinterpret_cast<void *>(&dev_addrs.emplace_back(ws->device_ptr()));
                           });
    }
  }

  result = cuLaunchKernel(kernel_addr_, thread_info_[AKG_KERNEL_MOD_BX_IDX], thread_info_[AKG_KERNEL_MOD_BY_IDX],
                          thread_info_[AKG_KERNEL_MOD_BZ_IDX], thread_info_[AKG_KERNEL_MOD_TX_IDX],
                          thread_info_[AKG_KERNEL_MOD_TY_IDX], thread_info_[AKG_KERNEL_MOD_TZ_IDX], 0,
                          reinterpret_cast<CUstream>(stream_ptr), runtimeargs.data(), 0);
  if (result != CUDA_SUCCESS) {
    const char *msg = nullptr;
    cuGetErrorName(result, &msg);
    MS_LOG(ERROR) << "Launch kernel failed. Kernel name: " << kernel_name_ << ". cuLaunchKernel error message: " << msg;
    return false;
  }
  MS_LOG(INFO) << "End Launch for " << kernel_name_;
  return true;
}
}  // namespace kernel
}  // namespace mindspore