1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include <cmath>
17 #include <cstring>
18 #include <memory>
19 #include "schema/inner/model_generated.h"
20 #include "common/common_test.h"
21 #include "include/api/context.h"
22 #include "include/api/model.h"
23 #include "include/lite_session.h"
24 #include "include/context.h"
25 #include "include/errorcode.h"
26 #include "src/common/log_adapter.h"
27 #include "src/lite_session.h"
28 #include "include/registry/register_kernel_interface.h"
29 #include "include/registry/register_kernel.h"
30 #include "include/registry/opencl_runtime_wrapper.h"
31 #include "include/api/data_type.h"
32
33 using mindspore::kernel::Kernel;
34 using mindspore::kernel::KernelInterface;
35 using mindspore::lite::RET_ERROR;
36 using mindspore::lite::RET_OK;
37 using mindspore::lite::RET_PARAM_INVALID;
38 using mindspore::schema::PrimitiveType_AddFusion;
39 #define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
40 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
41 #define C4NUM 4
42
43 namespace mindspore {
44 namespace {
// Shorthand for the only data type this test registers a kernel creator for.
constexpr auto kFloat32 = DataType::kNumberTypeFloat32;
// OpenCL C source for a simple element-wise addition kernel.
// FLT4 / READ_IMAGE / WRITE_IMAGE are deliberately left undefined here: they
// are injected through -D build options at kernel-compile time so the same
// source serves both the fp16 and fp32 variants.
static const char *arithmetic_source =
  "\n"
  "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
  "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
  "\n"
  "__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t "
  "output,\n"
  "                         const int2 output_shape) {\n"
  "  int X = get_global_id(0);\n"
  "  int Y = get_global_id(1);\n"
  "  if (X >= output_shape.x || Y >= output_shape.y) {\n"
  "    return;\n"
  "  }\n"
  "\n"
  "  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));\n"
  "  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));\n"
  "  FLT4 result = a + b;\n"
  "\n"
  "  WRITE_IMAGE(output, (int2)(X, Y), result);\n"
  "}\n";
66
// Maps an up-to-4D source shape onto the fixed NHWC layout the GPU kernels
// use. Dimensions absent from the source are left untouched in |dst|.
// Layouts: 1D -> (- - - C), 2D -> (N - - C), 3D -> (N - W C), 4D -> (N H W C).
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
  if (src == nullptr || src_num <= 0) {
    return;
  }
  DstT *const N = dst;
  DstT *const H = dst + 1;
  DstT *const W = dst + 2;
  DstT *const C = dst + 3;
  switch (src_num) {
    case 1:  // 1 1 1 C
      *C = src[0];
      break;
    case 2:  // N 1 1 C
      *N = src[0];
      *C = src[1];
      break;
    case 3:  // N 1 W C
      *N = src[0];
      *W = src[1];
      *C = src[2];
      break;
    case 4:  // N H W C
      *N = src[0];
      *H = src[1];
      *W = src[2];
      *C = src[3];
      break;
    default:  // > 4 dims cannot be expressed as NHWC
      std::cerr << "GPU doesn't support ndim>=" << src_num;
      break;
  }
}
94
// Same as the three-argument overload, but first fills all four NHWC slots
// with |default_value| so unmapped dimensions get a well-defined value.
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
  for (DstT *p = dst; p != dst + 4; ++p) {
    *p = default_value;
  }
  if (src == nullptr || src_num <= 0) {
    return;
  }
  Broadcast2GpuShape(dst, src, src_num);
}
105 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
106 #define C4NUM 4
107 struct GpuTensorInfo {
108 GpuTensorInfo() = default;
GpuTensorInfomindspore::__anon1bc9c7b50111::GpuTensorInfo109 explicit GpuTensorInfo(const MSTensor *tensor, registry::opencl::OpenCLRuntimeWrapper *opencl_run) {
110 if (tensor == nullptr) {
111 return;
112 }
113 auto shape_ori = tensor->Shape();
114 int64_t shape[4];
115 Broadcast2GpuShape(shape, shape_ori.data(), shape_ori.size(), 1l);
116 N = shape[0];
117 H = shape[1];
118 W = shape[2];
119 C = shape[3];
120 Slice = UP_DIV(C, C4NUM);
121 if (tensor->DataType() == mindspore::DataType::kNumberTypeFloat16) {
122 FLT_size = sizeof(cl_half);
123 } else {
124 FLT_size = sizeof(cl_float);
125 }
126 FLT4_size = FLT_size * 4;
127 if (W * Slice <= opencl_run->GetMaxImage2DWidth()) {
128 height = N * H;
129 width = W * Slice;
130 } else {
131 height = N * H * W;
132 width = Slice;
133 if (height > opencl_run->GetMaxImage2DHeight()) {
134 height = -1;
135 width = -1;
136 }
137 }
138
139 ElementsNum = N * H * W * C;
140 Image2DSize = height * width * FLT4_size;
141 }
142 size_t N{1};
143 size_t H{1};
144 size_t W{1};
145 size_t C{1};
146 size_t Slice{};
147 size_t width{};
148 size_t height{};
149 size_t FLT_size{4};
150 size_t FLT4_size{16};
151 size_t ElementsNum{};
152 size_t Image2DSize{};
153 };
154 } // namespace
155
156 class CustomAddKernel : public kernel::Kernel {
157 public:
CustomAddKernel(const std::vector<MSTensor> & inputs,const std::vector<MSTensor> & outputs,const schema::Primitive * primitive,const mindspore::Context * ctx,const std::string & build_options,bool fp16_enable)158 CustomAddKernel(const std::vector<MSTensor> &inputs, const std::vector<MSTensor> &outputs,
159 const schema::Primitive *primitive, const mindspore::Context *ctx, const std::string &build_options,
160 bool fp16_enable)
161 : Kernel(inputs, outputs, primitive, ctx), build_options_(build_options), fp16_enable_(fp16_enable) {}
~CustomAddKernel()162 ~CustomAddKernel() override { FreeWeight(); }
163 // Prepare will be called during graph compilation
Prepare()164 int Prepare() override {
165 const std::string kernel_name_ = "ElementAdd";
166 const std::string program_name = "Arithmetic";
167 std::string source = arithmetic_source;
168 if (opencl_runtime_.LoadSource(program_name, source) != kSuccess) {
169 std::cerr << "Load source failed.";
170 return lite::RET_ERROR;
171 }
172 std::vector<std::string> build_options_ext = {"-cl-mad-enable -cl-fast-relaxed-math -Werror"};
173
174 build_options_ext.push_back(build_options_);
175 if (opencl_runtime_.BuildKernel(&kernel_, program_name, kernel_name_, build_options_ext) != kSuccess) {
176 std::cerr << "Build kernel failed.";
177 return lite::RET_ERROR;
178 }
179
180 auto out_shape = GpuTensorInfo(&outputs_[0], &opencl_runtime_);
181 local_range_ = cl::NullRange;
182 global_range_ = cl::NDRange(out_shape.width, out_shape.height);
183 for (int i = 0; i < inputs_.size(); ++i) {
184 auto &in_tensor = inputs_.at(i);
185 GpuTensorInfo in_shape = GpuTensorInfo(&in_tensor, &opencl_runtime_);
186 if (in_tensor.IsConst()) {
187 std::vector<char> weight(in_shape.Image2DSize, 0);
188 bool src_is_fp16 = in_tensor.DataType() == mindspore::DataType::kNumberTypeFloat16;
189 PackNHWCToNHWC4(in_tensor.MutableData(), weight.data(), src_is_fp16, fp16_enable_, in_shape,
190 in_tensor.DataType());
191 DataType dtype =
192 fp16_enable_ ? mindspore::DataType::kNumberTypeFloat16 : mindspore::DataType::kNumberTypeFloat32;
193 auto allocator = opencl_runtime_.GetAllocator();
194 if (allocator == nullptr) {
195 std::cerr << "GetAllocator fail.";
196 FreeWeight();
197 return lite::RET_ERROR;
198 }
199 auto weight_ptr = allocator->Malloc(in_shape.width, in_shape.height, dtype);
200 if (weight_ptr == nullptr) {
201 std::cerr << "Malloc fail.";
202 FreeWeight();
203 return lite::RET_ERROR;
204 }
205 weight_ptrs_.push_back(weight_ptr);
206 if (opencl_runtime_.WriteImage(weight_ptr, weight.data()) != kSuccess) {
207 std::cerr << "WriteImage fail.";
208 FreeWeight();
209 return lite::RET_ERROR;
210 }
211 } else {
212 weight_ptrs_.push_back(nullptr);
213 }
214 }
215
216 int arg_idx = 3;
217 cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
218 if (opencl_runtime_.SetKernelArg(kernel_, arg_idx, output_shape) != kSuccess) {
219 std::cerr << "Set kernel arg" << arg_idx << "failed.";
220 FreeWeight();
221 return lite::RET_ERROR;
222 }
223
224 std::cout << kernel_name_ << " Init Done!" << std::endl;
225 return lite::RET_OK;
226 }
227
228 // Execute is called to compute.
Execute()229 int Execute() override {
230 if (inputs_.size() != 2) {
231 return lite::RET_PARAM_INVALID;
232 }
233 PreProcess();
234 std::cout << this->name() << " Running!" << std::endl;
235 auto input_0_ptr = weight_ptrs_[0] == nullptr ? inputs_[0].MutableData() : weight_ptrs_[0];
236 auto input_1_ptr = weight_ptrs_[1] == nullptr ? inputs_[1].MutableData() : weight_ptrs_[1];
237 int arg_idx = 0;
238 if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, input_0_ptr) != kSuccess) {
239 std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
240 return lite::RET_ERROR;
241 }
242 if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, input_1_ptr) != kSuccess) {
243 std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
244 return lite::RET_ERROR;
245 }
246 if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, outputs_[0].MutableData()) != kSuccess) {
247 std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
248 return lite::RET_ERROR;
249 }
250 if (opencl_runtime_.RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != kSuccess) {
251 std::cerr << "Run kernel failed.";
252 return lite::RET_ERROR;
253 }
254
255 return lite::RET_OK;
256 }
257
CheckSpecs()258 int CheckSpecs() {
259 for (auto &tensor : inputs_) {
260 if (tensor.DataType() != DataType::kNumberTypeFloat32 && tensor.DataType() != DataType::kNumberTypeFloat16) {
261 std::cerr << "ArithmeticOpenCLKernel only support fp32/fp16 input";
262 return lite::RET_ERROR;
263 }
264 }
265 for (auto &tensor : outputs_) {
266 if (tensor.DataType() != DataType::kNumberTypeFloat32 && tensor.DataType() != DataType::kNumberTypeFloat16) {
267 std::cerr << "ArithmeticOpenCLKernel only support fp32/fp16 output";
268 return lite::RET_ERROR;
269 }
270 }
271
272 if (inputs_.size() != 2 || outputs_.size() != 1) {
273 std::cerr << "in size: " << inputs_.size() << ", out size: " << outputs_.size();
274 return lite::RET_ERROR;
275 }
276
277 return lite::RET_OK;
278 }
279
280 // Resize is used to update some parameters if current node can change along with inputs.
ReSize()281 int ReSize() override {
282 if (CheckOutputs(outputs_) == lite::RET_OK) {
283 return lite::RET_OK;
284 }
285 auto status =
286 registry::RegisterKernelInterface::GetKernelInterface({}, primitive_)->Infer(&inputs_, &outputs_, primitive_);
287 if (status != kSuccess) {
288 std::cerr << "infer failed." << std::endl;
289 return lite::RET_ERROR;
290 }
291 auto ret = CheckSpecs();
292 if (ret != lite::RET_OK) {
293 std::cerr << "ReSize failed for check kernel specs!";
294 return ret;
295 }
296 ret = Prepare();
297 if (ret != lite::RET_OK) {
298 std::cerr << "ReSize failed for kernel prepare!";
299 return ret;
300 }
301 return lite::RET_OK;
302 }
303
304 private:
305 std::string build_options_;
306 bool fp16_enable_;
307 cl::Kernel kernel_;
308 cl::Event event_;
309 cl::NDRange global_range_{cl::NullRange};
310 cl::NDRange local_range_{cl::NullRange};
311 std::vector<void *> weight_ptrs_;
312 registry::opencl::OpenCLRuntimeWrapper opencl_runtime_;
313
PreProcess()314 int PreProcess() {
315 int ret;
316 ret = ReSize();
317 if (ret != lite::RET_OK) {
318 return ret;
319 }
320 for (auto i = 0; i < outputs_.size(); ++i) {
321 auto *output = &outputs_.at(i);
322 auto img_info = GpuTensorInfo(output, &opencl_runtime_);
323 auto allocator = output->allocator();
324 if (allocator == nullptr) {
325 std::cerr << "The output tensor of OpenCL kernel must have an allocator.";
326 return lite::RET_ERROR;
327 }
328 auto data_ptr = allocator->Malloc(img_info.width, img_info.height, output->DataType());
329 if (data_ptr == nullptr) {
330 std::cerr << "Malloc data failed";
331 return lite::RET_ERROR;
332 }
333 output->SetData(data_ptr);
334 }
335 return lite::RET_OK;
336 }
337
CheckOutputs(const std::vector<mindspore::MSTensor> & outputs)338 int CheckOutputs(const std::vector<mindspore::MSTensor> &outputs) {
339 for (auto &output : outputs) {
340 auto output_shape = output.Shape();
341 if (std::find(output_shape.begin(), output_shape.end(), -1) != output_shape.end()) {
342 return lite::RET_INFER_INVALID;
343 }
344 }
345 return lite::RET_OK;
346 }
347
PackNHWCToNHWC4(void * src,void * dst,bool src_is_fp16,bool dst_is_fp16,const GpuTensorInfo & tensor,mindspore::DataType data_type)348 void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
349 mindspore::DataType data_type) {
350 auto src_fp16 = reinterpret_cast<float16_t *>(src);
351 auto src_fp32 = reinterpret_cast<float32_t *>(src);
352 auto src_int32 = reinterpret_cast<int32_t *>(src);
353 auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
354 auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
355 auto dst_int32 = reinterpret_cast<int32_t *>(dst);
356 for (int n = 0, src_idx = 0; n < tensor.N; n++) {
357 for (int h = 0; h < tensor.H; ++h) {
358 for (int w = 0; w < tensor.W; ++w) {
359 for (int c = 0; c < tensor.C; ++c, ++src_idx) {
360 int dst_idx = ((n * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
361 if (data_type == mindspore::DataType::kNumberTypeInt32) {
362 dst_int32[dst_idx] = src_int32[src_idx];
363 } else if (dst_is_fp16) {
364 dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
365 } else {
366 dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
367 }
368 }
369 }
370 }
371 }
372 // scalar
373 if (tensor.ElementsNum == 1) {
374 if (dst_is_fp16) {
375 dst_fp16[3] = dst_fp16[2] = dst_fp16[1] = dst_fp16[0];
376 } else {
377 dst_fp32[3] = dst_fp32[2] = dst_fp32[1] = dst_fp32[0];
378 }
379 }
380 }
381
FreeWeight()382 void FreeWeight() {
383 auto allocator = opencl_runtime_.GetAllocator();
384 if (allocator == nullptr) {
385 std::cerr << "GetAllocator fail.";
386 return;
387 }
388 for (auto &weight_ptr : weight_ptrs_) {
389 if (weight_ptr != nullptr) {
390 allocator->Free(weight_ptr);
391 weight_ptr = nullptr;
392 }
393 }
394 }
395 };
396
397 class CustomAddInfer : public kernel::KernelInterface {
398 public:
399 CustomAddInfer() = default;
400 ~CustomAddInfer() = default;
401
Infer(std::vector<mindspore::MSTensor> * inputs,std::vector<mindspore::MSTensor> * outputs,const schema::Primitive * primitive)402 Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
403 const schema::Primitive *primitive) override {
404 (*outputs)[0].SetFormat((*inputs)[0].format());
405 (*outputs)[0].SetDataType((*inputs)[0].DataType());
406 (*outputs)[0].SetShape((*inputs)[0].Shape());
407 return kSuccess;
408 }
409 };
410
411 namespace {
CustomAddCreator(const std::vector<MSTensor> & inputs,const std::vector<MSTensor> & outputs,const schema::Primitive * primitive,const mindspore::Context * ctx)412 std::shared_ptr<kernel::Kernel> CustomAddCreator(const std::vector<MSTensor> &inputs,
413 const std::vector<MSTensor> &outputs,
414 const schema::Primitive *primitive, const mindspore::Context *ctx) {
415 const std::string build_options = " -DFLT4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef ";
416 bool fp16_enable = false;
417
418 std::cout << "using fp32 add.\n" << std::endl;
419 return std::make_shared<CustomAddKernel>(inputs, outputs, primitive, ctx, build_options, fp16_enable);
420 }
421
CustomAddInferCreator()422 std::shared_ptr<kernel::KernelInterface> CustomAddInferCreator() { return std::make_shared<CustomAddInfer>(); }
423 } // namespace
424
425 REGISTER_CUSTOM_KERNEL_INTERFACE(BuiltInTest, Custom_Add, CustomAddInferCreator)
// Register the custom "Custom_Add" operator
427 REGISTER_CUSTOM_KERNEL(GPU, BuiltInTest, kFloat32, Custom_Add, CustomAddCreator)
428
// Test fixture for the GPU custom-op registration tests below; no per-test
// state, it only provides the common test-framework base.
class TestGPURegistryCustomOp : public mindspore::CommonTest {
 public:
  TestGPURegistryCustomOp() = default;
};
433
// End-to-end test: builds a one-node graph whose node is the registered
// "Custom_Add" op, runs it through the BuiltInTest GPU provider, and checks
// that the custom OpenCL kernel really adds the two inputs element-wise.
TEST_F(TestGPURegistryCustomOp, TestGPUCustomAdd) {
  // Construct a MetaGraph in memory: two graph inputs -> Custom_Add -> output.
  auto meta_graph = std::make_shared<schema::MetaGraphT>();
  meta_graph->name = "graph";

  auto node = std::make_unique<schema::CNodeT>();
  node->inputIndex = {0, 1};
  node->outputIndex = {2};
  node->primitive = std::make_unique<schema::PrimitiveT>();
  node->primitive->value.type = schema::PrimitiveType_Custom;
  // Raw new is intentional: ownership passes to the PrimitiveT value union,
  // which releases it on destruction.
  auto primitive = new schema::CustomT;
  primitive->type = "Custom_Add";
  node->primitive->value.value = primitive;
  node->name = "Add";
  meta_graph->nodes.emplace_back(std::move(node));
  meta_graph->inputIndex = {0, 1};
  meta_graph->outputIndex = {2};

  // First graph input: 1x28x28x3 fp32 NHWC tensor.
  auto input0 = std::make_unique<schema::TensorT>();
  input0->nodeType = lite::NodeType_ValueNode;
  input0->format = schema::Format_NHWC;
  input0->dataType = TypeId::kNumberTypeFloat32;
  input0->dims = {1, 28, 28, 3};
  input0->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(input0));

  // Second graph input, same shape/type as the first. Despite the name it
  // carries no data here, so it behaves as a second variable input.
  auto weight = std::make_unique<schema::TensorT>();
  weight->nodeType = lite::NodeType_ValueNode;
  weight->format = schema::Format_NHWC;
  weight->dataType = TypeId::kNumberTypeFloat32;
  weight->dims = {1, 28, 28, 3};

  weight->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(weight));

  // Output tensor: dims deliberately omitted so the registered CustomAddInfer
  // interface must infer them at build time.
  auto output = std::make_unique<schema::TensorT>();
  output->nodeType = lite::NodeType_Parameter;
  output->format = schema::Format_NHWC;
  output->dataType = TypeId::kNumberTypeFloat32;
  output->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(output));

  // Serialize the graph into a flatbuffer the Model API can load.
  flatbuffers::FlatBufferBuilder builder(1024);
  auto offset = schema::MetaGraph::Pack(builder, meta_graph.get());
  builder.Finish(offset);
  schema::FinishMetaGraphBuffer(builder, offset);
  size_t size = builder.GetSize();
  const char *content = reinterpret_cast<char *>(builder.GetBufferPointer());

  // create a context
  auto context = std::make_shared<mindspore::Context>();
  context->SetThreadNum(1);
  context->SetEnableParallel(false);
  context->SetThreadAffinity(lite::HIGHER_CPU);
  auto &device_list = context->MutableDeviceInfo();

  // CPU device as fallback for ops the custom provider does not handle.
  std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
  device_info->SetEnableFP16(false);
  device_list.push_back(device_info);

  // GPU device bound to the BuiltInTest provider so the kernels registered
  // above are selected.
  std::shared_ptr<GPUDeviceInfo> provider_gpu_device_info = std::make_shared<GPUDeviceInfo>();
  provider_gpu_device_info->SetEnableFP16(false);
  provider_gpu_device_info->SetProviderDevice("GPU");
  provider_gpu_device_info->SetProvider("BuiltInTest");
  device_list.push_back(provider_gpu_device_info);

  // build a model
  auto model = std::make_shared<mindspore::Model>();
  auto ret = model->Build(content, size, kFlatBuffer, context);
  ASSERT_EQ(kSuccess, ret.StatusCode());
  auto inputs = model->GetInputs();
  ASSERT_EQ(inputs.size(), 2);
  // Only element [0] of each input is written; only outData[0] is checked
  // below, so the remaining elements may be left uninitialized.
  auto inTensor = inputs.front();
  auto impl = inTensor.impl();
  ASSERT_NE(nullptr, impl);
  float *in0_data = static_cast<float *>(inTensor.MutableData());
  in0_data[0] = 10.0f;
  auto inTensor1 = inputs.back();
  impl = inTensor1.impl();
  ASSERT_NE(nullptr, impl);
  float *in1_data = static_cast<float *>(inTensor1.MutableData());
  in1_data[0] = 20.0f;
  std::vector<mindspore::MSTensor> outputs;
  ret = model->Predict(inputs, &outputs);
  ASSERT_EQ(kSuccess, ret.StatusCode());
  ASSERT_EQ(outputs.size(), 1);
  impl = outputs.front().impl();
  ASSERT_NE(nullptr, impl);
  // Inferred output must match the input shape and type.
  ASSERT_EQ(28 * 28 * 3, outputs.front().ElementNum());
  ASSERT_EQ(DataType::kNumberTypeFloat32, outputs.front().DataType());
  auto *outData = reinterpret_cast<const float *>(outputs.front().Data().get());
  ASSERT_NE(nullptr, outData);
  // 10 + 20 is exactly representable in fp32, so exact equality is safe here.
  ASSERT_EQ(30.0f, outData[0]);
  MS_LOG(INFO) << "Register add op test pass.";
}
528 } // namespace mindspore
529