/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/litert/kernel/opencl/kernel/concat.h"
#include <cstring>
#include <string>
#include <vector>
#include <algorithm>
#include <set>
#include "src/litert/kernel_registry.h"
#include "src/litert/kernel/opencl/utils.h"

const std::vector<std::string> cl_index_str = {".x", ".y", ".z", ".w"};

using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::ImageSize;
using mindspore::schema::PrimitiveType_Concat;

namespace mindspore {
namespace kernel {
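// Concat along axis 0 is done on the host: each input image (or pre-packed constant
// weight) is copied into the output image with enqueueCopyImage, and the destination
// origin is advanced by the height of each copied region.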
int ConcatOpenCLKernel::RunAxis0() {
  auto allocator_ = ocl_runtime_->GetAllocator();
  ImageSize img_size;
  auto dst_data = out_tensors_[0]->data();
  MS_ASSERT(dst_data);
  auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
  auto *out_image = allocator_->GetImage(dst_data);
  for (size_t i = 0; i < in_tensors_.size(); i++) {
    auto src_data = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data() : weight_ptrs_.at(i);
    if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
      MS_LOG(WARNING) << "GetImageSize failed.";
      return RET_ERROR;
    }
    auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
    auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
    auto *input_image = allocator_->GetImage(src_data);
    if (ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin,
                                                                 region) != CL_SUCCESS) {
      MS_LOG(WARNING) << "enqueueCopyImage failed.";
    }
    dst_origin[1] += region[1];
  }
  return RET_OK;
}

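// Chooses a local work-group size for the concat kernel: x and y are capped divisors of
// the corresponding global dimensions, and z uses whatever budget remains within
// max_size, capped at UP_DIV(global[2], 2).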
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  const int max_divider = 8;
  const int max_x = 2;
  const int max_y = 8;
  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
  if (x == 0) {
    return;
  }
  int yz = max_size / x;
  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
  if (y == 0) {
    return;
  }
  int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));

  local->clear();
  local->push_back(x);
  local->push_back(y);
  local->push_back(z);
}

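// Validates that this concat can run on the GPU: tensor counts, the Mali input-count
// limit, rank <= 4 for every tensor, an output that fits the device's 2D image limits,
// and a concat axis normalized into [0, 3] (remapped for 2-D and 3-D outputs).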
int ConcatOpenCLKernel::CheckSpecs() {
  if (in_tensors_.size() < INPUT_TENSOR_SIZE_2 || out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
    MS_LOG(WARNING) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
    return RET_ERROR;
  }
  std::set<lite::opencl::GpuType> mali_devices = {lite::opencl::MALI, lite::opencl::MALI_T, lite::opencl::MALI_G,
                                                  lite::opencl::MALI_G78};
  auto cur_gpu_type = ocl_runtime_->GetGpuInfo().type;
  if ((mali_devices.find(cur_gpu_type) != mali_devices.end()) && (in_tensors_.size() > INPUT_TENSOR_SIZE_16)) {
    MS_LOG(WARNING) << "For the Mali series, the number of inputs should be no more than 16, but got "
                    << in_tensors_.size() << " in Concat kernel.";
    return RET_ERROR;
  }
  auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
  auto out_tensors_shape_size = out_tensors_[0]->shape().size();
  MS_LOG(DEBUG) << "concat at axis = " << param->axis_;
  if (out_tensors_shape_size > DIMENSION_4D) {
    MS_LOG(WARNING) << "GPU unsupported: output shape.size() > 4";
    return RET_ERROR;
  }

  auto out_tensor_info = GpuTensorInfo(out_tensors_[0]);
  auto height = out_tensor_info.N * out_tensor_info.D * out_tensor_info.H;
  auto width = out_tensor_info.W * out_tensor_info.Slice;
  if ((height > ocl_runtime_->GetMaxImage2DHeight()) || (width > ocl_runtime_->GetMaxImage2DWidth())) {
    MS_LOG(WARNING) << "Output tensor is too large to use OpenCL in Concat kernel.";
    return RET_ERROR;
  }

  for (auto &in_tensor : in_tensors_) {
    auto in_tensors_shape_size = in_tensor->shape().size();
    if (in_tensors_shape_size > DIMENSION_4D) {
      MS_LOG(WARNING) << "GPU unsupported: in_tensor shape.size() > 4";
      return RET_ERROR;
    }
  }
  axis_ = param->axis_;
  if (axis_ < 0) {
    axis_ += in_tensors_.front()->shape().size();
  }
  constexpr int max_axis = 3;
  constexpr int min_axis = 0;
  if (axis_ < min_axis || axis_ > max_axis) {
    MS_LOG(WARNING) << "Only support axis >= 0 and axis <= 3";
    return RET_ERROR;
  }
  if (out_tensors_shape_size < 4 && type() == PrimitiveType_Concat && axis_ != 0) {
    if (out_tensors_shape_size == DIMENSION_2D) {
      axis_ = axis_ + 2;
    } else if (out_tensors_shape_size == DIMENSION_3D) {
      axis_ = axis_ + 1;
    } else {
      MS_LOG(WARNING) << "Unsupported axis = " << axis_ << ", shape().size() = " << out_tensors_shape_size;
      return RET_ERROR;
    }
  }
  return RET_OK;
}

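// Emits the kernel body for channel (axis-3) concat when some input's channel count is
// not a multiple of 4: elements are gathered one by one from each input's DTYPE4 slices
// and repacked into the output slices, flushing a result vector every four elements.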
std::string ConcatOpenCLKernel::GenMainCodeAxis3UnAlign() {
  std::stringstream code;
  int result_index = 0;
  int temp_index = 0;
  int output_index = 0;
  code << "DTYPE4 result = (DTYPE4)(0);\n";
  for (size_t j = 0; j < in_tensors_.size(); j++) {
    std::vector<int> in_shape(DIMENSION_4D);
    Broadcast2GpuShape(in_tensors_[j]->shape().data(), in_tensors_[j]->shape().size(), in_shape.data(), DIMENSION_4D,
                       1);
    auto align_num = UP_DIV(in_shape[CLIDX_W], C4NUM);

    for (int k = 0; k < align_num; k++) {
      code << "DTYPE4 t" << temp_index << " = READ_IMAGE(input" << j << ", smp_zero, (int2)(((Y) * (" << align_num
           << ") + (" << k << ")), (X)));\n";
      for (int m = 0; (m < C4NUM) && (m < in_shape[CLIDX_W] - k * C4NUM); m++) {
        code << "result" << cl_index_str[result_index++ % C4NUM] << " = t" << temp_index << cl_index_str[m] << ";\n";
        if (result_index % C4NUM == 0) {
          code << "WRITE_IMAGE(output, (int2)(((Y) * (" << out_shape_.s[CLIDX_W] << ") + (" << output_index++
               << ")), (X)), result);\n";
        }
      }
      temp_index++;
    }
  }
  if (out_shape_.s[CLIDX_W] > output_index) {
    code << "WRITE_IMAGE(output, (int2)(((Y) * (" << out_shape_.s[CLIDX_W] << ") + (" << output_index++
         << ")), (X)), result);\n";
  }
  return code.str();
}

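// Emits the kernel body for every case except unaligned channel concat: each input owns
// a contiguous range of the concat axis, tracked with running "boundary" values, and
// each work item copies one DTYPE4 with a READ_IMAGE/WRITE_IMAGE pair.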
std::string ConcatOpenCLKernel::GenMainCodeOthers() {
  std::stringstream code;
  code << "DTYPE4 result;\n";
  if (axis_ == kNHWC_H) {
    code << "int IN = X / " << out_shape_.s[CLIDX_Y] << ";\n"
         << "int IH = X - IN * " << out_shape_.s[CLIDX_Y] << ";\n";
  }

  for (size_t j = 0; j < in_tensors_.size(); j++) {
    std::vector<int> in_shape(DIMENSION_4D);
    Broadcast2GpuShape(in_tensors_[j]->shape().data(), in_tensors_[j]->shape().size(), in_shape.data(), DIMENSION_4D,
                       1);
    in_shape[CLIDX_W] = UP_DIV(in_shape[CLIDX_W], C4NUM);
    std::string variable_name;
    std::string function_y;
    if (axis_ == kNHWC_H) {
      variable_name = "IH";
      function_y = "IN * " + std::to_string(in_shape[CLIDX_Y]) + " + IH";
    } else if (axis_ == kNHWC_C) {
      variable_name = "Z";
      function_y = "X";
    } else {
      variable_name = "Y";
      function_y = "X";
    }
    if (j == 0) {
      code << "int boundary0 = " << in_shape[axis_] << ";\n";
      code << "if (" << variable_name << " < boundary0) {\n";
      code << "int coordinate_x = Y * " << in_shape[CLIDX_W] << " + Z;\n";
      code << "int coordinate_y = " << function_y << ";\n";
      code << "result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y));\n";
      code << "}\n";
    } else {
      code << "int boundary" << j << " = boundary" << (j - 1) << " + " << in_shape[axis_] << ";\n";
      code << "if (" << variable_name << " >= boundary" << (j - 1) << " && " << variable_name << " < boundary" << j
           << ") {\n";
      if (axis_ == kNHWC_H) {
        code << "int coordinate_x = Y * " << in_shape[CLIDX_W] << " + Z;\n";
        code << "int coordinate_y = " << function_y << " - boundary" << (j - 1) << ";\n";
      } else if (axis_ == kNHWC_W) {
        code << "int coordinate_x = (Y - boundary" << (j - 1) << ") * " << in_shape[CLIDX_W] << " + Z;\n";
        code << "int coordinate_y = X;\n";
      } else if (axis_ == kNHWC_C) {
        code << "int coordinate_x = Y * " << in_shape[CLIDX_W] << " + Z - boundary" << (j - 1) << ";\n";
        code << "int coordinate_y = X;\n";
      }

      code << "result = READ_IMAGE(input" << j << ", smp_none, (int2)(coordinate_x, coordinate_y));\n";
      code << "}\n";
    }
  }
  code << "WRITE_IMAGE(output, (int2)((Y) * " << out_shape_.s[CLIDX_W] << " + Z, (X)), result);\n";
  return code.str();
}

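// Assembles the full OpenCL source for this Concat instance: broadcasts the output shape
// to 4-D, converts the channel dimension to C4 slices, emits the kernel signature with
// one read-only image2d_t per input plus one write-only output, and appends either the
// unaligned axis-3 body or the generic body.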
std::string ConcatOpenCLKernel::GenCode() {
  std::vector<int> out_shape(DIMENSION_4D);
  Broadcast2GpuShape(out_tensors_[0]->shape().data(), out_tensors_[0]->shape().size(), out_shape.data(), DIMENSION_4D,
                     1);
  for (size_t i = 0; i < out_shape.size(); i++) {
    out_shape_.s[i] = out_shape[i];
  }
  out_shape_.s[CLIDX_W] = UP_DIV(out_shape_.s[CLIDX_W], C4NUM);

  std::stringstream code;
  auto header = OpenCLKernelHeader();
  if (header.empty()) {
    MS_LOG(ERROR) << "Generate OpenCL kernel header failed.";
    return "";
  }
  code << header;
  code << "__kernel void Concat(\n";
  for (size_t i = 0; i < in_tensors_.size(); i++) {
    code << "__read_only image2d_t input" << i << ",\n";
  }
  code << "__write_only image2d_t output\n) {\n";

  if (axis_ == kNHWC_C && !Align_) {
    code << "int X = get_global_id(0);\n"
         << "int Y = get_global_id(1);\n"
         << "if (X >= " << out_shape[CLIDX_X] * out_shape[CLIDX_Y] << " || Y >= " << out_shape[CLIDX_Z]
         << ") return;\n";

    code << GenMainCodeAxis3UnAlign();
  } else {
    code << "int X = get_global_id(0);\n"
         << "int Y = get_global_id(1);\n"
         << "int Z = get_global_id(2);\n"
         << "if (X >= " << out_shape_.s[CLIDX_X] * out_shape_.s[CLIDX_Y] << " || Y >= " << out_shape_.s[CLIDX_Z]
         << " || Z >= " << out_shape_.s[CLIDX_W] << ") return;\n";

    code << GenMainCodeOthers();
  }
  code << "}\n";
  return code.str();
}

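// Derives the global NDRange from the NHWC4 output shape: {N*H, W, C4} in the general
// case, {N*H, W, 1} for unaligned channel concat; ConcatGetWorkGroup then picks a local
// size within the device work-item limit before the two are aligned.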
int ConcatOpenCLKernel::SetGlobalLocal() {
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  if (axis_ == 3 && !Align_) {
    OH = out_shape_.s[0] * out_shape_.s[1];
    OW = out_shape_.s[2];
    global_size_ = {OH, OW, 1};
    local_size_ = {1, 1, 1};
  } else {
    OH = out_shape_.s[0] * out_shape_.s[1];
    OW = out_shape_.s[2];
    OC = out_shape_.s[3];
    global_size_ = {OH, OW, OC};
    local_size_ = {1, 1, 1};
  }
  ConcatGetWorkGroup(global_size_, &local_size_, max_global[0]);
  OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);

  return RET_OK;
}

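// Packs constant input tensors from NHWC into NHWC4 image layout and uploads them to the
// device, recording their pointers in weight_ptrs_; non-constant inputs get a nullptr
// placeholder so Run() can tell the two apart.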
int ConcatOpenCLKernel::ConvertWeightToTensor() {
  auto allocator = ocl_runtime_->GetAllocator();
  bool fp16_enable = ocl_runtime_->GetFp16Enable();
  for (auto in_tensor : in_tensors_) {
    auto in_shape = GpuTensorInfo(in_tensor);
    if (in_tensor->IsConst()) {
      std::vector<char> weight(in_shape.Image2DSize, 0);
      bool src_is_fp16 = in_tensor->data_type() == kNumberTypeFloat16;
      PackNHWCToNHWC4(in_tensor->data(), weight.data(), src_is_fp16,
                      fp16_enable && in_tensor->data_type() != kNumberTypeInt32, in_shape);
      size_t dtype;
      switch (in_tensor->data_type()) {
        case kNumberTypeInt32: {
          dtype = CL_SIGNED_INT32;
          break;
        }
        case kNumberTypeFloat32: {
          dtype = CL_FLOAT;
          break;
        }
        case kNumberTypeFloat16: {
          dtype = CL_HALF_FLOAT;
          break;
        }
        default:
          MS_LOG(ERROR) << "Unsupported data type: " << in_tensor->data_type();
          return RET_ERROR;
      }
      ImageSize img_size{in_shape.width, in_shape.height, dtype};
      auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
      if (weight_ptr_ == nullptr) {
        MS_LOG(ERROR) << "Malloc failed.";
        return RET_ERROR;
      }
      weight_ptrs_.push_back(weight_ptr_);
    } else {
      weight_ptrs_.push_back(nullptr);
    }
  }
  return RET_OK;
}

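// Uploads constant inputs, remaps axis 0 to the channel axis when every input is 1-D,
// clears Align_ when any input's innermost dimension is not a multiple of 4, then
// generates the kernel source, builds the program, and sets the work sizes.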
int ConcatOpenCLKernel::Prepare() {
  int ret = ConvertWeightToTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvertWeightToTensor failed.";
    return ret;
  }
  if (axis_ == 0) {
    if (std::any_of(in_tensors_.begin(), in_tensors_.end(), [](lite::Tensor *t) { return t->shape().size() != 1; })) {
      return RET_OK;
    }
    axis_ = 3;
  }
  for (auto const &in_tensor : in_tensors_) {
    if (in_tensor->shape().back() % C4NUM != 0) {
      Align_ = false;
    }
  }

  std::string source = GenCode();
  // For debug.
  dump_code_ = "[" + this->name() + "]\n" + source;

  if (source.empty()) {
    MS_LOG(ERROR) << "Failed to generate source code for " << this->name();
    return RET_ERROR;
  }

  std::string program_name = "Concat\n" + source;
  std::string kernel_name = "Concat";
  if (!ocl_runtime_->LoadSource(program_name, source)) {
    MS_LOG(ERROR) << "Load source failed.";
    return RET_ERROR;
  }

  std::vector<std::string> build_options_ext{};
  ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Build kernel failed.";
    return ret;
  }
  (void)SetGlobalLocal();
  return RET_OK;
}

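// Axis-0 concat is served by host-side image copies (RunAxis0); otherwise each input
// image (tensor data or pre-packed weight) and the output image are bound as kernel
// arguments and the kernel is enqueued.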
int ConcatOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  if (axis_ == 0) {
    return RunAxis0();
  }
  int arg_cn = 0;
  for (size_t i = 0; i < in_tensors_.size(); ++i) {
    auto input_ptr = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data() : weight_ptrs_.at(i);
    if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr) != CL_SUCCESS) {
      MS_LOG(ERROR) << "SetKernelArg failed.";
      return RET_ERROR;
    }
  }

  if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }

  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

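// Register the OpenCL Concat kernel for each supported data type.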
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeInt32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
}  // namespace kernel
}  // namespace mindspore