/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/litert/kernel/opencl/kernel/concat.h"
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/litert/kernel_registry.h"
#include "src/litert/kernel/opencl/utils.h"

const std::vector<std::string> cl_index_str = {".x", ".y", ".z", ".w"};

using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::ImageSize;
using mindspore::schema::PrimitiveType_Concat;

namespace mindspore {
namespace kernel {
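// Concat along axis 0 needs no OpenCL kernel: each input image is copied into the
// output image with enqueueCopyImage, and the destination row offset advances by the
// height of the image just copied.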
int ConcatOpenCLKernel::RunAxis0() {
  auto allocator_ = ocl_runtime_->GetAllocator();
  ImageSize img_size;
  auto dst_data = out_tensors_[0]->data();
  MS_ASSERT(dst_data);
  auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
  auto *out_image = allocator_->GetImage(dst_data);
  for (size_t i = 0; i < in_tensors_.size(); i++) {
    auto src_data = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data() : weight_ptrs_.at(i);
    if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
      MS_LOG(WARNING) << "GetImageSize failed.";
      return RET_ERROR;
    }
    auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
    auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
    auto *input_image = allocator_->GetImage(src_data);
    if (ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin,
                                                                 region) != CL_SUCCESS) {
      MS_LOG(WARNING) << "enqueueCopyImage failed.";
      return RET_ERROR;
    }
    dst_origin[1] += region[1];
  }
  return RET_OK;
}
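
// Picks a local work-group size for the generated kernel: x and y are chosen from
// divisors of the corresponding global sizes (capped at 2 and 8), and z takes the
// remaining budget of max_size work-items.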
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  const int max_divider = 8;
  const int max_x = 2;
  const int max_y = 8;
  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
  if (x == 0) {
    return;
  }
  int yz = max_size / x;
  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
  if (y == 0) {
    return;
  }
  int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));

  local->clear();
  local->push_back(x);
  local->push_back(y);
  local->push_back(z);
}
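
// Checks that this concat can run on the GPU: at least 2 inputs and exactly 1 output,
// at most 16 inputs on Mali devices, tensors of at most 4 dimensions, an output that
// fits the device's 2D image limits, and an axis in [0, 3]. For 2D/3D tensors the
// axis is remapped to its NHWC position.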
int ConcatOpenCLKernel::CheckSpecs() {
  if (in_tensors_.size() < INPUT_TENSOR_SIZE_2 || out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
    MS_LOG(WARNING) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
    return RET_ERROR;
  }
  std::set<lite::opencl::GpuType> mali_devices = {lite::opencl::MALI, lite::opencl::MALI_T, lite::opencl::MALI_G,
                                                  lite::opencl::MALI_G78};
  auto cur_gpu_type = ocl_runtime_->GetGpuInfo().type;
  if ((mali_devices.find(cur_gpu_type) != mali_devices.end()) && (in_tensors_.size() > INPUT_TENSOR_SIZE_16)) {
    MS_LOG(WARNING) << "For the Mali series, the number of inputs should be no more than 16, but got "
                    << in_tensors_.size() << " in Concat kernel.";
    return RET_ERROR;
  }
  auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
  auto out_tensors_shape_size = out_tensors_[0]->shape().size();
  MS_LOG(DEBUG) << "concat at axis = " << param->axis_;
  if (out_tensors_shape_size > DIMENSION_4D) {
    MS_LOG(WARNING) << "GPU does not support output shape.size > 4";
    return RET_ERROR;
  }

  auto out_tensor_info = GpuTensorInfo(out_tensors_[0]);
  auto height = out_tensor_info.N * out_tensor_info.D * out_tensor_info.H;
  auto width = out_tensor_info.W * out_tensor_info.Slice;
  if ((height > ocl_runtime_->GetMaxImage2DHeight()) || (width > ocl_runtime_->GetMaxImage2DWidth())) {
    MS_LOG(WARNING) << "Output tensor is too large to use OpenCL in Concat kernel.";
    return RET_ERROR;
  }

  for (auto &in_tensor : in_tensors_) {
    auto in_tensors_shape_size = in_tensor->shape().size();
    if (in_tensors_shape_size > DIMENSION_4D) {
      MS_LOG(WARNING) << "GPU does not support in_tensor shape.size > 4";
      return RET_ERROR;
    }
  }
  axis_ = param->axis_;
  if (axis_ < 0) {
    axis_ += in_tensors_.front()->shape().size();
  }
  constexpr int max_axis = 3;
  constexpr int min_axis = 0;
  if (axis_ < min_axis || axis_ > max_axis) {
    MS_LOG(WARNING) << "Only support 0 <= axis <= 3, but got axis = " << axis_;
    return RET_ERROR;
  }
  if (out_tensors_shape_size < DIMENSION_4D && type() == PrimitiveType_Concat && axis_ != 0) {
    if (out_tensors_shape_size == DIMENSION_2D) {
      axis_ = axis_ + 2;
    } else if (out_tensors_shape_size == DIMENSION_3D) {
      axis_ = axis_ + 1;
    } else {
      MS_LOG(WARNING) << "Unsupported axis = " << axis_ << " for shape().size() = " << out_tensors_shape_size;
      return RET_ERROR;
    }
  }
  return RET_OK;
}
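
// Generates the kernel body for channel-axis concat when some inputs have channel
// counts that are not multiples of 4: input slices are read one DTYPE4 at a time and
// repacked into the output vector component by component.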
std::string ConcatOpenCLKernel::GenMainCodeAxis3UnAlign() {
  std::stringstream code;
  int result_index = 0;
  int temp_index = 0;
  int output_index = 0;
  code << "DTYPE4 result = (DTYPE4)(0);\n";
  for (size_t j = 0; j < in_tensors_.size(); j++) {
    std::vector<int> in_shape(DIMENSION_4D);
    Broadcast2GpuShape(in_tensors_[j]->shape().data(), in_tensors_[j]->shape().size(), in_shape.data(), DIMENSION_4D,
                       1);
    auto align_num = UP_DIV(in_shape[CLIDX_W], C4NUM);

    for (int k = 0; k < align_num; k++) {
      code << "DTYPE4 t" << temp_index << " = READ_IMAGE(input" << j << ", smp_zero, (int2)(((Y) * (" << align_num
           << ") + (" << k << ")), (X)));\n";
      for (int m = 0; (m < C4NUM) && (m < in_shape[CLIDX_W] - k * C4NUM); m++) {
        code << "result" << cl_index_str[result_index++ % C4NUM] << " = t" << temp_index << cl_index_str[m] << ";\n";
        if (result_index % C4NUM == 0) {
          code << "WRITE_IMAGE(output, (int2)(((Y) * (" << out_shape_.s[CLIDX_W] << ") + (" << output_index++
               << ")), (X)), result);\n";
        }
      }
      temp_index++;
    }
  }
  if (out_shape_.s[CLIDX_W] > output_index) {
    code << "WRITE_IMAGE(output, (int2)(((Y) * (" << out_shape_.s[CLIDX_W] << ") + (" << output_index++
         << ")), (X)), result);\n";
  }
  return code.str();
}
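
// Generates the kernel body for the remaining cases (concat along N, H, W, or along C
// with 4-aligned channels): each work-item finds the input whose boundary range
// contains its coordinate and reads a single DTYPE4 value from it.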
std::string ConcatOpenCLKernel::GenMainCodeOthers() {
  std::stringstream code;
  code << "DTYPE4 result;\n";
  if (axis_ == kNHWC_H) {
    code << "int IN = X / " << out_shape_.s[CLIDX_Y] << ";\n"
         << "int IH = X - IN * " << out_shape_.s[CLIDX_Y] << ";\n";
  }

  for (size_t j = 0; j < in_tensors_.size(); j++) {
    std::vector<int> in_shape(DIMENSION_4D);
    Broadcast2GpuShape(in_tensors_[j]->shape().data(), in_tensors_[j]->shape().size(), in_shape.data(), DIMENSION_4D,
                       1);
    in_shape[CLIDX_W] = UP_DIV(in_shape[CLIDX_W], C4NUM);
    std::string variable_name;
    std::string function_y;
    if (axis_ == kNHWC_H) {
      variable_name = "IH";
      function_y = "IN * " + std::to_string(in_shape[CLIDX_Y]) + " + IH";
    } else if (axis_ == kNHWC_C) {
      variable_name = "Z";
      function_y = "X";
    } else {
      variable_name = "Y";
      function_y = "X";
    }
    if (j == 0) {
      code << "int boundary0 = " << in_shape[axis_] << ";\n";
      code << "if (" << variable_name << " < boundary0) {\n";
      code << "int coordinate_x = Y * " << in_shape[CLIDX_W] << " + Z;\n";
      code << "int coordinate_y = " << function_y << ";\n";
      code << "result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y));\n";
      code << "}\n";
    } else {
      code << "int boundary" << j << " = boundary" << (j - 1) << " + " << in_shape[axis_] << ";\n";
      code << "if (" << variable_name << " >= boundary" << (j - 1) << " && " << variable_name << " < boundary" << j
           << ") {\n";
      if (axis_ == kNHWC_H) {
        code << "int coordinate_x = Y * " << in_shape[CLIDX_W] << " + Z;\n";
        code << "int coordinate_y = " << function_y << " - boundary" << (j - 1) << ";\n";
      } else if (axis_ == kNHWC_W) {
        code << "int coordinate_x = (Y - boundary" << (j - 1) << ") * " << in_shape[CLIDX_W] << " + Z;\n";
        code << "int coordinate_y = X;\n";
      } else if (axis_ == kNHWC_C) {
        code << "int coordinate_x = Y * " << in_shape[CLIDX_W] << " + Z - boundary" << (j - 1) << ";\n";
        code << "int coordinate_y = X;\n";
      }

      code << "result = READ_IMAGE(input" << j << ", smp_none, (int2)(coordinate_x, coordinate_y));\n";
      code << "}\n";
    }
  }
  code << "WRITE_IMAGE(output, (int2)((Y) * " << out_shape_.s[CLIDX_W] << " + Z, (X)), result);\n";
  return code.str();
}
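
// Assembles the full OpenCL source: the common kernel header, one __read_only
// image2d_t argument per input, and a body from either GenMainCodeAxis3UnAlign or
// GenMainCodeOthers, depending on the axis and channel alignment.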
std::string ConcatOpenCLKernel::GenCode() {
  std::vector<int> out_shape(DIMENSION_4D);
  Broadcast2GpuShape(out_tensors_[0]->shape().data(), out_tensors_[0]->shape().size(), out_shape.data(), DIMENSION_4D,
                     1);
  for (size_t i = 0; i < out_shape.size(); i++) {
    out_shape_.s[i] = out_shape[i];
  }
  out_shape_.s[CLIDX_W] = UP_DIV(out_shape_.s[CLIDX_W], C4NUM);

  std::stringstream code;
  auto header = OpenCLKernelHeader();
  if (header.empty()) {
    MS_LOG(ERROR) << "Generate OpenCL kernel header failed.";
    return "";
  }
  code << header;
  code << "__kernel void Concat(\n";
  for (size_t i = 0; i < in_tensors_.size(); i++) {
    code << "__read_only image2d_t input" << i << ",\n";
  }
  code << "__write_only image2d_t output\n) {\n";

  if (axis_ == kNHWC_C && !Align_) {
    code << "int X = get_global_id(0);\n"
         << "int Y = get_global_id(1);\n"
         << "if (X >= " << out_shape[CLIDX_X] * out_shape[CLIDX_Y] << " || Y >= " << out_shape[CLIDX_Z]
         << ") return;\n";

    code << GenMainCodeAxis3UnAlign();
  } else {
    code << "int X = get_global_id(0);\n"
         << "int Y = get_global_id(1);\n"
         << "int Z = get_global_id(2);\n"
         << "if (X >= " << out_shape_.s[CLIDX_X] * out_shape_.s[CLIDX_Y] << " || Y >= " << out_shape_.s[CLIDX_Z]
         << " || Z >= " << out_shape_.s[CLIDX_W] << ") return;\n";

    code << GenMainCodeOthers();
  }
  code << "}\n";
  return code.str();
}
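
// Global range is {N * H, W, Slice} ({N * H, W, 1} for the unaligned channel case);
// the local range is then derived from it by ConcatGetWorkGroup.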
int ConcatOpenCLKernel::SetGlobalLocal() {
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  if (axis_ == 3 && !Align_) {
    OH = out_shape_.s[0] * out_shape_.s[1];
    OW = out_shape_.s[2];
    global_size_ = {OH, OW, 1};
    local_size_ = {1, 1, 1};
  } else {
    OH = out_shape_.s[0] * out_shape_.s[1];
    OW = out_shape_.s[2];
    OC = out_shape_.s[3];
    global_size_ = {OH, OW, OC};
    local_size_ = {1, 1, 1};
  }
  ConcatGetWorkGroup(global_size_, &local_size_, max_global[0]);
  OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);

  return RET_OK;
}
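
// Uploads constant inputs ahead of time: each const tensor is packed from NHWC to
// NHWC4 and copied into an image with the matching channel data type; non-const
// inputs get a nullptr placeholder in weight_ptrs_.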
int ConcatOpenCLKernel::ConvertWeightToTensor() {
  auto allocator = ocl_runtime_->GetAllocator();
  bool fp16_enable = ocl_runtime_->GetFp16Enable();
  for (auto in_tensor : in_tensors_) {
    auto in_shape = GpuTensorInfo(in_tensor);
    if (in_tensor->IsConst()) {
      std::vector<char> weight(in_shape.Image2DSize, 0);
      bool src_is_fp16 = in_tensor->data_type() == kNumberTypeFloat16;
      PackNHWCToNHWC4(in_tensor->data(), weight.data(), src_is_fp16,
                      fp16_enable && in_tensor->data_type() != kNumberTypeInt32, in_shape);
      size_t dtype;
      switch (in_tensor->data_type()) {
        case kNumberTypeInt32: {
          dtype = CL_SIGNED_INT32;
          break;
        }
        case kNumberTypeFloat32: {
          dtype = CL_FLOAT;
          break;
        }
        case kNumberTypeFloat16: {
          dtype = CL_HALF_FLOAT;
          break;
        }
        default:
          MS_LOG(ERROR) << "Unsupported data type: " << in_tensor->data_type();
          return RET_ERROR;
      }
      ImageSize img_size{in_shape.width, in_shape.height, dtype};
      auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
      if (weight_ptr_ == nullptr) {
        MS_LOG(ERROR) << "Malloc failed.";
        return RET_ERROR;
      }
      weight_ptrs_.push_back(weight_ptr_);
    } else {
      weight_ptrs_.push_back(nullptr);
    }
  }
  return RET_OK;
}
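
// One-time setup: uploads constant inputs, resolves the axis-0 special cases (image
// copy for multi-dimensional inputs, channel concat for all-1D inputs), records
// whether every input's channel count is 4-aligned, then generates, loads and builds
// the OpenCL program and sets the work-group configuration.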
int ConcatOpenCLKernel::Prepare() {
  int ret = ConvertWeightToTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvertWeightToTensor failed.";
    return ret;
  }
  if (axis_ == 0) {
    if (std::any_of(in_tensors_.begin(), in_tensors_.end(), [](lite::Tensor *t) { return t->shape().size() != 1; })) {
      return RET_OK;
    }
    axis_ = 3;
  }
  for (auto const &in_tensor : in_tensors_) {
    if (in_tensor->shape().back() % C4NUM != 0) {
      Align_ = false;
    }
  }

  std::string source = GenCode();
  // For debug.
  dump_code_ = "[" + this->name() + "]\n" + source;

  if (source.empty()) {
    MS_LOG(ERROR) << "Failed to generate source code for " << this->name();
    return RET_ERROR;
  }

  std::string program_name = "Concat\n" + source;
  std::string kernel_name = "Concat";
  if (!ocl_runtime_->LoadSource(program_name, source)) {
    MS_LOG(ERROR) << "Load source failed.";
    return RET_ERROR;
  }

  std::vector<std::string> build_options_ext{};
  ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Build kernel failed.";
    return ret;
  }
  (void)SetGlobalLocal();
  return RET_OK;
}
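
// Dispatch: axis 0 takes the image-copy path; otherwise the input images (or the
// pre-uploaded constant images) and the output image are bound as kernel arguments
// and the kernel is enqueued.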
int ConcatOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  if (axis_ == 0) {
    return RunAxis0();
  }
  int arg_cn = 0;
  for (size_t i = 0; i < in_tensors_.size(); ++i) {
    auto input_ptr = weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data() : weight_ptrs_.at(i);
    if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr) != CL_SUCCESS) {
      MS_LOG(ERROR) << "SetKernelArg failed.";
      return RET_ERROR;
    }
  }

  if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }

  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeInt32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>)
}  // namespace kernel
}  // namespace mindspore