• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "schema/inner/model_generated.h"
#include "common/common_test.h"
#include "include/api/context.h"
#include "include/api/model.h"
#include "include/lite_session.h"
#include "include/context.h"
#include "include/errorcode.h"
#include "src/common/log_adapter.h"
#include "src/lite_session.h"
#include "include/registry/register_kernel_interface.h"
#include "include/registry/register_kernel.h"
#include "include/registry/opencl_runtime_wrapper.h"
#include "include/api/data_type.h"

33 using mindspore::kernel::Kernel;
34 using mindspore::kernel::KernelInterface;
35 using mindspore::lite::RET_ERROR;
36 using mindspore::lite::RET_OK;
37 using mindspore::lite::RET_PARAM_INVALID;
38 using mindspore::schema::PrimitiveType_AddFusion;
39 #define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
40 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
41 #define C4NUM 4
42 
43 namespace mindspore {
44 namespace {
45 constexpr auto kFloat32 = DataType::kNumberTypeFloat32;
46 static const char *arithmetic_source =
47   "\n"
48   "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
49   "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
50   "\n"
51   "__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t "
52   "output,\n"
53   "                         const int2 output_shape) {\n"
54   "  int X = get_global_id(0);\n"
55   "  int Y = get_global_id(1);\n"
56   "  if (X >= output_shape.x || Y >= output_shape.y) {\n"
57   "    return;\n"
58   "  }\n"
59   "\n"
60   "  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));\n"
61   "  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));\n"
62   "  FLT4 result = a + b;\n"
63   "\n"
64   "  WRITE_IMAGE(output, (int2)(X, Y), result);\n"
65   "}\n";
66 
67 template <typename SrcT, typename DstT>
Broadcast2GpuShape(DstT * dst,const SrcT * src,int src_num)68 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
69   if (src == nullptr || src_num <= 0) {
70     return;
71   }
72   auto *N = dst;
73   auto *H = dst + 1;
74   auto *W = dst + 2;
75   auto *C = dst + 3;
76   if (src_num == 1) {  // 1 1 1 C
77     *C = src[0];
78   } else if (src_num == 2) {  // N 1 1 C
79     *N = src[0];
80     *C = src[1];
81   } else if (src_num == 3) {  // N 1 W C
82     *N = src[0];
83     *W = src[1];
84     *C = src[2];
85   } else if (src_num == 4) {  // N H W C
86     *N = src[0];
87     *H = src[1];
88     *W = src[2];
89     *C = src[3];
90   } else if (src_num > 4) {
91     std::cerr << "GPU doesn't support ndim>=" << src_num;
92   }
93 }
94 
95 template <typename SrcT, typename DstT>
Broadcast2GpuShape(DstT * dst,const SrcT * src,int src_num,DstT default_value)96 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
97   for (int i = 0; i < 4; ++i) {
98     dst[i] = default_value;
99   }
100   if (src == nullptr || src_num <= 0) {
101     return;
102   }
103   Broadcast2GpuShape(dst, src, src_num);
104 }
105 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
106 #define C4NUM 4
107 struct GpuTensorInfo {
108   GpuTensorInfo() = default;
GpuTensorInfomindspore::__anon1bc9c7b50111::GpuTensorInfo109   explicit GpuTensorInfo(const MSTensor *tensor, registry::opencl::OpenCLRuntimeWrapper *opencl_run) {
110     if (tensor == nullptr) {
111       return;
112     }
113     auto shape_ori = tensor->Shape();
114     int64_t shape[4];
115     Broadcast2GpuShape(shape, shape_ori.data(), shape_ori.size(), 1l);
116     N = shape[0];
117     H = shape[1];
118     W = shape[2];
119     C = shape[3];
120     Slice = UP_DIV(C, C4NUM);
121     if (tensor->DataType() == mindspore::DataType::kNumberTypeFloat16) {
122       FLT_size = sizeof(cl_half);
123     } else {
124       FLT_size = sizeof(cl_float);
125     }
126     FLT4_size = FLT_size * 4;
127     if (W * Slice <= opencl_run->GetMaxImage2DWidth()) {
128       height = N * H;
129       width = W * Slice;
130     } else {
131       height = N * H * W;
132       width = Slice;
133       if (height > opencl_run->GetMaxImage2DHeight()) {
134         height = -1;
135         width = -1;
136       }
137     }
138 
139     ElementsNum = N * H * W * C;
140     Image2DSize = height * width * FLT4_size;
141   }
142   size_t N{1};
143   size_t H{1};
144   size_t W{1};
145   size_t C{1};
146   size_t Slice{};
147   size_t width{};
148   size_t height{};
149   size_t FLT_size{4};
150   size_t FLT4_size{16};
151   size_t ElementsNum{};
152   size_t Image2DSize{};
153 };
154 }  // namespace
155 
156 class CustomAddKernel : public kernel::Kernel {
157  public:
CustomAddKernel(const std::vector<MSTensor> & inputs,const std::vector<MSTensor> & outputs,const schema::Primitive * primitive,const mindspore::Context * ctx,const std::string & build_options,bool fp16_enable)158   CustomAddKernel(const std::vector<MSTensor> &inputs, const std::vector<MSTensor> &outputs,
159                   const schema::Primitive *primitive, const mindspore::Context *ctx, const std::string &build_options,
160                   bool fp16_enable)
161       : Kernel(inputs, outputs, primitive, ctx), build_options_(build_options), fp16_enable_(fp16_enable) {}
~CustomAddKernel()162   ~CustomAddKernel() override { FreeWeight(); }
163   // Prepare will be called during graph compilation
Prepare()164   int Prepare() override {
165     const std::string kernel_name_ = "ElementAdd";
166     const std::string program_name = "Arithmetic";
167     std::string source = arithmetic_source;
168     if (opencl_runtime_.LoadSource(program_name, source) != kSuccess) {
169       std::cerr << "Load source failed.";
170       return lite::RET_ERROR;
171     }
172     std::vector<std::string> build_options_ext = {"-cl-mad-enable -cl-fast-relaxed-math -Werror"};
173 
174     build_options_ext.push_back(build_options_);
175     if (opencl_runtime_.BuildKernel(&kernel_, program_name, kernel_name_, build_options_ext) != kSuccess) {
176       std::cerr << "Build kernel failed.";
177       return lite::RET_ERROR;
178     }
179 
180     auto out_shape = GpuTensorInfo(&outputs_[0], &opencl_runtime_);
181     local_range_ = cl::NullRange;
182     global_range_ = cl::NDRange(out_shape.width, out_shape.height);
183     for (int i = 0; i < inputs_.size(); ++i) {
184       auto &in_tensor = inputs_.at(i);
185       GpuTensorInfo in_shape = GpuTensorInfo(&in_tensor, &opencl_runtime_);
186       if (in_tensor.IsConst()) {
187         std::vector<char> weight(in_shape.Image2DSize, 0);
188         bool src_is_fp16 = in_tensor.DataType() == mindspore::DataType::kNumberTypeFloat16;
189         PackNHWCToNHWC4(in_tensor.MutableData(), weight.data(), src_is_fp16, fp16_enable_, in_shape,
190                         in_tensor.DataType());
191         DataType dtype =
192           fp16_enable_ ? mindspore::DataType::kNumberTypeFloat16 : mindspore::DataType::kNumberTypeFloat32;
193         auto allocator = opencl_runtime_.GetAllocator();
194         if (allocator == nullptr) {
195           std::cerr << "GetAllocator fail.";
196           FreeWeight();
197           return lite::RET_ERROR;
198         }
199         auto weight_ptr = allocator->Malloc(in_shape.width, in_shape.height, dtype);
200         if (weight_ptr == nullptr) {
201           std::cerr << "Malloc fail.";
202           FreeWeight();
203           return lite::RET_ERROR;
204         }
205         weight_ptrs_.push_back(weight_ptr);
206         if (opencl_runtime_.WriteImage(weight_ptr, weight.data()) != kSuccess) {
207           std::cerr << "WriteImage fail.";
208           FreeWeight();
209           return lite::RET_ERROR;
210         }
211       } else {
212         weight_ptrs_.push_back(nullptr);
213       }
214     }
215 
216     int arg_idx = 3;
217     cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
218     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx, output_shape) != kSuccess) {
219       std::cerr << "Set kernel arg" << arg_idx << "failed.";
220       FreeWeight();
221       return lite::RET_ERROR;
222     }
223 
224     std::cout << kernel_name_ << " Init Done!" << std::endl;
225     return lite::RET_OK;
226   }
227 
228   // Execute is called to compute.
Execute()229   int Execute() override {
230     if (inputs_.size() != 2) {
231       return lite::RET_PARAM_INVALID;
232     }
233     PreProcess();
234     std::cout << this->name() << " Running!" << std::endl;
235     auto input_0_ptr = weight_ptrs_[0] == nullptr ? inputs_[0].MutableData() : weight_ptrs_[0];
236     auto input_1_ptr = weight_ptrs_[1] == nullptr ? inputs_[1].MutableData() : weight_ptrs_[1];
237     int arg_idx = 0;
238     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, input_0_ptr) != kSuccess) {
239       std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
240       return lite::RET_ERROR;
241     }
242     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, input_1_ptr) != kSuccess) {
243       std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
244       return lite::RET_ERROR;
245     }
246     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, outputs_[0].MutableData()) != kSuccess) {
247       std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
248       return lite::RET_ERROR;
249     }
250     if (opencl_runtime_.RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != kSuccess) {
251       std::cerr << "Run kernel failed.";
252       return lite::RET_ERROR;
253     }
254 
255     return lite::RET_OK;
256   }
257 
CheckSpecs()258   int CheckSpecs() {
259     for (auto &tensor : inputs_) {
260       if (tensor.DataType() != DataType::kNumberTypeFloat32 && tensor.DataType() != DataType::kNumberTypeFloat16) {
261         std::cerr << "ArithmeticOpenCLKernel only support fp32/fp16 input";
262         return lite::RET_ERROR;
263       }
264     }
265     for (auto &tensor : outputs_) {
266       if (tensor.DataType() != DataType::kNumberTypeFloat32 && tensor.DataType() != DataType::kNumberTypeFloat16) {
267         std::cerr << "ArithmeticOpenCLKernel only support fp32/fp16 output";
268         return lite::RET_ERROR;
269       }
270     }
271 
272     if (inputs_.size() != 2 || outputs_.size() != 1) {
273       std::cerr << "in size: " << inputs_.size() << ", out size: " << outputs_.size();
274       return lite::RET_ERROR;
275     }
276 
277     return lite::RET_OK;
278   }
279 
280   // Resize is used to update some parameters if current node can change along with inputs.
ReSize()281   int ReSize() override {
282     if (CheckOutputs(outputs_) == lite::RET_OK) {
283       return lite::RET_OK;
284     }
285     auto status =
286       registry::RegisterKernelInterface::GetKernelInterface({}, primitive_)->Infer(&inputs_, &outputs_, primitive_);
287     if (status != kSuccess) {
288       std::cerr << "infer failed." << std::endl;
289       return lite::RET_ERROR;
290     }
291     auto ret = CheckSpecs();
292     if (ret != lite::RET_OK) {
293       std::cerr << "ReSize failed for check kernel specs!";
294       return ret;
295     }
296     ret = Prepare();
297     if (ret != lite::RET_OK) {
298       std::cerr << "ReSize failed for kernel prepare!";
299       return ret;
300     }
301     return lite::RET_OK;
302   }
303 
304  private:
305   std::string build_options_;
306   bool fp16_enable_;
307   cl::Kernel kernel_;
308   cl::Event event_;
309   cl::NDRange global_range_{cl::NullRange};
310   cl::NDRange local_range_{cl::NullRange};
311   std::vector<void *> weight_ptrs_;
312   registry::opencl::OpenCLRuntimeWrapper opencl_runtime_;
313 
PreProcess()314   int PreProcess() {
315     int ret;
316     ret = ReSize();
317     if (ret != lite::RET_OK) {
318       return ret;
319     }
320     for (auto i = 0; i < outputs_.size(); ++i) {
321       auto *output = &outputs_.at(i);
322       auto img_info = GpuTensorInfo(output, &opencl_runtime_);
323       auto allocator = output->allocator();
324       if (allocator == nullptr) {
325         std::cerr << "The output tensor of OpenCL kernel must have an allocator.";
326         return lite::RET_ERROR;
327       }
328       auto data_ptr = allocator->Malloc(img_info.width, img_info.height, output->DataType());
329       if (data_ptr == nullptr) {
330         std::cerr << "Malloc data failed";
331         return lite::RET_ERROR;
332       }
333       output->SetData(data_ptr);
334     }
335     return lite::RET_OK;
336   }
337 
CheckOutputs(const std::vector<mindspore::MSTensor> & outputs)338   int CheckOutputs(const std::vector<mindspore::MSTensor> &outputs) {
339     for (auto &output : outputs) {
340       auto output_shape = output.Shape();
341       if (std::find(output_shape.begin(), output_shape.end(), -1) != output_shape.end()) {
342         return lite::RET_INFER_INVALID;
343       }
344     }
345     return lite::RET_OK;
346   }
347 
PackNHWCToNHWC4(void * src,void * dst,bool src_is_fp16,bool dst_is_fp16,const GpuTensorInfo & tensor,mindspore::DataType data_type)348   void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
349                        mindspore::DataType data_type) {
350     auto src_fp16 = reinterpret_cast<float16_t *>(src);
351     auto src_fp32 = reinterpret_cast<float32_t *>(src);
352     auto src_int32 = reinterpret_cast<int32_t *>(src);
353     auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
354     auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
355     auto dst_int32 = reinterpret_cast<int32_t *>(dst);
356     for (int n = 0, src_idx = 0; n < tensor.N; n++) {
357       for (int h = 0; h < tensor.H; ++h) {
358         for (int w = 0; w < tensor.W; ++w) {
359           for (int c = 0; c < tensor.C; ++c, ++src_idx) {
360             int dst_idx = ((n * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
361             if (data_type == mindspore::DataType::kNumberTypeInt32) {
362               dst_int32[dst_idx] = src_int32[src_idx];
363             } else if (dst_is_fp16) {
364               dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
365             } else {
366               dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
367             }
368           }
369         }
370       }
371     }
372     // scalar
373     if (tensor.ElementsNum == 1) {
374       if (dst_is_fp16) {
375         dst_fp16[3] = dst_fp16[2] = dst_fp16[1] = dst_fp16[0];
376       } else {
377         dst_fp32[3] = dst_fp32[2] = dst_fp32[1] = dst_fp32[0];
378       }
379     }
380   }
381 
FreeWeight()382   void FreeWeight() {
383     auto allocator = opencl_runtime_.GetAllocator();
384     if (allocator == nullptr) {
385       std::cerr << "GetAllocator fail.";
386       return;
387     }
388     for (auto &weight_ptr : weight_ptrs_) {
389       if (weight_ptr != nullptr) {
390         allocator->Free(weight_ptr);
391         weight_ptr = nullptr;
392       }
393     }
394   }
395 };
396 
397 class CustomAddInfer : public kernel::KernelInterface {
398  public:
399   CustomAddInfer() = default;
400   ~CustomAddInfer() = default;
401 
Infer(std::vector<mindspore::MSTensor> * inputs,std::vector<mindspore::MSTensor> * outputs,const schema::Primitive * primitive)402   Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
403                const schema::Primitive *primitive) override {
404     (*outputs)[0].SetFormat((*inputs)[0].format());
405     (*outputs)[0].SetDataType((*inputs)[0].DataType());
406     (*outputs)[0].SetShape((*inputs)[0].Shape());
407     return kSuccess;
408   }
409 };
410 
411 namespace {
CustomAddCreator(const std::vector<MSTensor> & inputs,const std::vector<MSTensor> & outputs,const schema::Primitive * primitive,const mindspore::Context * ctx)412 std::shared_ptr<kernel::Kernel> CustomAddCreator(const std::vector<MSTensor> &inputs,
413                                                  const std::vector<MSTensor> &outputs,
414                                                  const schema::Primitive *primitive, const mindspore::Context *ctx) {
415   const std::string build_options = " -DFLT4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef ";
416   bool fp16_enable = false;
417 
418   std::cout << "using fp32 add.\n" << std::endl;
419   return std::make_shared<CustomAddKernel>(inputs, outputs, primitive, ctx, build_options, fp16_enable);
420 }
421 
CustomAddInferCreator()422 std::shared_ptr<kernel::KernelInterface> CustomAddInferCreator() { return std::make_shared<CustomAddInfer>(); }
423 }  // namespace
424 
425 REGISTER_CUSTOM_KERNEL_INTERFACE(BuiltInTest, Custom_Add, CustomAddInferCreator)
426 // Register custom “Custom_Add” operator
427 REGISTER_CUSTOM_KERNEL(GPU, BuiltInTest, kFloat32, Custom_Add, CustomAddCreator)
428 
429 class TestGPURegistryCustomOp : public mindspore::CommonTest {
430  public:
431   TestGPURegistryCustomOp() = default;
432 };
433 
TEST_F(TestGPURegistryCustomOp,TestGPUCustomAdd)434 TEST_F(TestGPURegistryCustomOp, TestGPUCustomAdd) {
435   auto meta_graph = std::make_shared<schema::MetaGraphT>();
436   meta_graph->name = "graph";
437 
438   auto node = std::make_unique<schema::CNodeT>();
439   node->inputIndex = {0, 1};
440   node->outputIndex = {2};
441   node->primitive = std::make_unique<schema::PrimitiveT>();
442   node->primitive->value.type = schema::PrimitiveType_Custom;
443   auto primitive = new schema::CustomT;
444   primitive->type = "Custom_Add";
445   node->primitive->value.value = primitive;
446   node->name = "Add";
447   meta_graph->nodes.emplace_back(std::move(node));
448   meta_graph->inputIndex = {0, 1};
449   meta_graph->outputIndex = {2};
450 
451   auto input0 = std::make_unique<schema::TensorT>();
452   input0->nodeType = lite::NodeType_ValueNode;
453   input0->format = schema::Format_NHWC;
454   input0->dataType = TypeId::kNumberTypeFloat32;
455   input0->dims = {1, 28, 28, 3};
456   input0->offset = -1;
457   meta_graph->allTensors.emplace_back(std::move(input0));
458 
459   auto weight = std::make_unique<schema::TensorT>();
460   weight->nodeType = lite::NodeType_ValueNode;
461   weight->format = schema::Format_NHWC;
462   weight->dataType = TypeId::kNumberTypeFloat32;
463   weight->dims = {1, 28, 28, 3};
464 
465   weight->offset = -1;
466   meta_graph->allTensors.emplace_back(std::move(weight));
467 
468   auto output = std::make_unique<schema::TensorT>();
469   output->nodeType = lite::NodeType_Parameter;
470   output->format = schema::Format_NHWC;
471   output->dataType = TypeId::kNumberTypeFloat32;
472   output->offset = -1;
473   meta_graph->allTensors.emplace_back(std::move(output));
474 
475   flatbuffers::FlatBufferBuilder builder(1024);
476   auto offset = schema::MetaGraph::Pack(builder, meta_graph.get());
477   builder.Finish(offset);
478   schema::FinishMetaGraphBuffer(builder, offset);
479   size_t size = builder.GetSize();
480   const char *content = reinterpret_cast<char *>(builder.GetBufferPointer());
481 
482   // create a context
483   auto context = std::make_shared<mindspore::Context>();
484   context->SetThreadNum(1);
485   context->SetEnableParallel(false);
486   context->SetThreadAffinity(lite::HIGHER_CPU);
487   auto &device_list = context->MutableDeviceInfo();
488 
489   std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
490   device_info->SetEnableFP16(false);
491   device_list.push_back(device_info);
492 
493   std::shared_ptr<GPUDeviceInfo> provider_gpu_device_info = std::make_shared<GPUDeviceInfo>();
494   provider_gpu_device_info->SetEnableFP16(false);
495   provider_gpu_device_info->SetProviderDevice("GPU");
496   provider_gpu_device_info->SetProvider("BuiltInTest");
497   device_list.push_back(provider_gpu_device_info);
498 
499   // build a model
500   auto model = std::make_shared<mindspore::Model>();
501   auto ret = model->Build(content, size, kFlatBuffer, context);
502   ASSERT_EQ(kSuccess, ret.StatusCode());
503   auto inputs = model->GetInputs();
504   ASSERT_EQ(inputs.size(), 2);
505   auto inTensor = inputs.front();
506   auto impl = inTensor.impl();
507   ASSERT_NE(nullptr, impl);
508   float *in0_data = static_cast<float *>(inTensor.MutableData());
509   in0_data[0] = 10.0f;
510   auto inTensor1 = inputs.back();
511   impl = inTensor1.impl();
512   ASSERT_NE(nullptr, impl);
513   float *in1_data = static_cast<float *>(inTensor1.MutableData());
514   in1_data[0] = 20.0f;
515   std::vector<mindspore::MSTensor> outputs;
516   ret = model->Predict(inputs, &outputs);
517   ASSERT_EQ(kSuccess, ret.StatusCode());
518   ASSERT_EQ(outputs.size(), 1);
519   impl = outputs.front().impl();
520   ASSERT_NE(nullptr, impl);
521   ASSERT_EQ(28 * 28 * 3, outputs.front().ElementNum());
522   ASSERT_EQ(DataType::kNumberTypeFloat32, outputs.front().DataType());
523   auto *outData = reinterpret_cast<const float *>(outputs.front().Data().get());
524   ASSERT_NE(nullptr, outData);
525   ASSERT_EQ(30.0f, outData[0]);
526   MS_LOG(INFO) << "Register add op test pass.";
527 }
528 }  // namespace mindspore
529