• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include <cmath>
17 #include <cstring>
18 #include <memory>
19 #include "schema/inner/model_generated.h"
20 #include "common/common_test.h"
21 #include "include/api/context.h"
22 #include "include/api/model.h"
23 #include "include/errorcode.h"
24 #include "src/common/log_adapter.h"
25 #include "src/litert/lite_session.h"
26 #include "include/registry/register_kernel_interface.h"
27 #include "include/registry/register_kernel.h"
28 #include "include/registry/opencl_runtime_wrapper.h"
29 #include "include/api/data_type.h"
30 
31 using mindspore::kernel::Kernel;
32 using mindspore::kernel::KernelInterface;
33 using mindspore::schema::PrimitiveType_AddFusion;
34 #define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
35 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
36 
37 namespace mindspore {
38 namespace {
39 constexpr int kDimIndex2 = 2;
40 constexpr int kDimIndex3 = 3;
41 constexpr int kDimIndex4 = 4;
42 constexpr size_t kDimSize2 = 2;
43 constexpr auto kFloat32 = DataType::kNumberTypeFloat32;
44 static const char *arithmetic_source =
45   "\n"
46   "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
47   "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
48   "\n"
49   "__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t "
50   "output,\n"
51   "                         const int2 output_shape) {\n"
52   "  int X = get_global_id(0);\n"
53   "  int Y = get_global_id(1);\n"
54   "  if (X >= output_shape.x || Y >= output_shape.y) {\n"
55   "    return;\n"
56   "  }\n"
57   "\n"
58   "  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));\n"
59   "  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));\n"
60   "  FLT4 result = a + b;\n"
61   "\n"
62   "  WRITE_IMAGE(output, (int2)(X, Y), result);\n"
63   "}\n";
64 
65 template <typename SrcT, typename DstT>
Broadcast2GpuShape(DstT * dst,const SrcT * src,int src_num)66 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
67   if (src == nullptr || src_num <= 0) {
68     return;
69   }
70   auto *N = dst;
71   auto *H = dst + 1;
72   auto *W = dst + kDimIndex2;
73   auto *C = dst + kDimIndex3;
74   if (src_num == 1) {  // 1 1 1 C
75     *C = src[0];
76   } else if (src_num == kDimIndex2) {  // N 1 1 C
77     *N = src[0];
78     *C = src[1];
79   } else if (src_num == kDimIndex3) {  // N 1 W C
80     *N = src[0];
81     *W = src[1];
82     *C = src[2];
83   } else if (src_num == kDimIndex4) {  // N H W C
84     *N = src[0];
85     *H = src[1];
86     *W = src[2];
87     *C = src[3];
88   } else if (src_num > 4) {
89     std::cerr << "GPU doesn't support ndim>=" << src_num;
90   }
91 }
92 
93 template <typename SrcT, typename DstT>
Broadcast2GpuShape(DstT * dst,const SrcT * src,int src_num,DstT default_value)94 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
95   for (int i = 0; i < 4; ++i) {
96     dst[i] = default_value;
97   }
98   if (src == nullptr || src_num <= 0) {
99     return;
100   }
101   Broadcast2GpuShape(dst, src, src_num);
102 }
103 
104 struct GpuTensorInfo {
105   GpuTensorInfo() = default;
GpuTensorInfomindspore::__anon6e20fdab0111::GpuTensorInfo106   explicit GpuTensorInfo(const MSTensor *tensor, registry::opencl::OpenCLRuntimeWrapper *opencl_run) {
107     if (tensor == nullptr) {
108       return;
109     }
110     auto shape_ori = tensor->Shape();
111     int64_t shape[kDimIndex4];
112     Broadcast2GpuShape(shape, shape_ori.data(), shape_ori.size(), 1l);
113     N = shape[0];
114     H = shape[1];
115     W = shape[kDimIndex2];
116     C = shape[kDimIndex3];
117     Slice = UP_DIV(C, C4NUM);
118     if (tensor->DataType() == mindspore::DataType::kNumberTypeFloat16) {
119       FLT_size = sizeof(cl_half);
120     } else {
121       FLT_size = sizeof(cl_float);
122     }
123     FLT4_size = FLT_size * 4;
124     if (W * Slice <= opencl_run->GetMaxImage2DWidth()) {
125       height = N * H;
126       width = W * Slice;
127     } else {
128       height = N * H * W;
129       width = Slice;
130       if (height > opencl_run->GetMaxImage2DHeight()) {
131         height = -1;
132         width = -1;
133       }
134     }
135 
136     ElementsNum = N * H * W * C;
137     Image2DSize = height * width * FLT4_size;
138   }
139   size_t N{1};
140   size_t H{1};
141   size_t W{1};
142   size_t C{1};
143   size_t Slice{};
144   size_t width{};
145   size_t height{};
146   size_t FLT_size{4};
147   size_t FLT4_size{16};
148   size_t ElementsNum{};
149   size_t Image2DSize{};
150 };
151 }  // namespace
152 
153 class CustomAddKernel : public kernel::Kernel {
154  public:
CustomAddKernel(const std::vector<MSTensor> & inputs,const std::vector<MSTensor> & outputs,const schema::Primitive * primitive,const mindspore::Context * ctx,bool fp16_enable)155   CustomAddKernel(const std::vector<MSTensor> &inputs, const std::vector<MSTensor> &outputs,
156                   const schema::Primitive *primitive, const mindspore::Context *ctx, bool fp16_enable)
157       : Kernel(inputs, outputs, primitive, ctx), fp16_enable_(fp16_enable) {}
~CustomAddKernel()158   ~CustomAddKernel() override { FreeWeight(); }
159 
CheckInputsDataTypes()160   int CheckInputsDataTypes() { return kSuccess; }
161 
162   // Prepare will be called during graph compilation
Prepare()163   int Prepare() override {
164     auto ret = CheckSpecs();
165     if (ret != kSuccess) {
166       std::cerr << "Prepare failed for check kernel specs!";
167       return ret;
168     }
169     const std::string kernel_name_ = "ElementAdd";
170     const std::string program_name = "Arithmetic";
171     std::string source = arithmetic_source;
172     if (opencl_runtime_.LoadSource(program_name, source) != kSuccess) {
173       std::cerr << "Load source failed.";
174       return kLiteError;
175     }
176     std::vector<std::string> build_options_ext = {"-cl-mad-enable -cl-fast-relaxed-math -Werror"};
177     if (fp16_enable_) {
178       build_options_ext.push_back(" -DFLT4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh");
179     } else {
180       build_options_ext.push_back(" -DFLT4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef");
181     }
182 
183     if (opencl_runtime_.BuildKernel(&kernel_, program_name, kernel_name_, build_options_ext) != kSuccess) {
184       std::cerr << "Build kernel failed.";
185       return kLiteError;
186     }
187 
188     auto out_shape = GpuTensorInfo(&outputs_[0], &opencl_runtime_);
189     local_range_ = cl::NullRange;
190     global_range_ = cl::NDRange(out_shape.width, out_shape.height);
191     for (int i = 0; i < inputs_.size(); ++i) {
192       auto &in_tensor = inputs_.at(i);
193       if (in_tensor.IsConst()) {
194         GpuTensorInfo in_shape = GpuTensorInfo(&in_tensor, &opencl_runtime_);
195         std::vector<char> weight(in_shape.Image2DSize, 0);
196         bool src_is_fp16 = in_tensor.DataType() == mindspore::DataType::kNumberTypeFloat16;
197         PackNHWCToNHWC4(in_tensor.MutableData(), weight.data(), src_is_fp16, fp16_enable_, in_shape,
198                         in_tensor.DataType());
199         DataType dtype =
200           fp16_enable_ ? mindspore::DataType::kNumberTypeFloat16 : mindspore::DataType::kNumberTypeFloat32;
201         auto allocator = opencl_runtime_.GetAllocator();
202         if (allocator == nullptr) {
203           std::cerr << "GetAllocator fail.";
204           FreeWeight();
205           return kLiteError;
206         }
207         auto weight_ptr = allocator->Malloc(in_shape.width, in_shape.height, dtype);
208         if (weight_ptr == nullptr) {
209           std::cerr << "Malloc fail.";
210           FreeWeight();
211           return kLiteError;
212         }
213         weight_ptrs_.push_back(weight_ptr);
214         if (opencl_runtime_.WriteImage(weight_ptr, weight.data()) != kSuccess) {
215           std::cerr << "WriteImage fail.";
216           FreeWeight();
217           return kLiteError;
218         }
219       } else {
220         weight_ptrs_.push_back(nullptr);
221       }
222     }
223 
224     int arg_idx = 3;
225     cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
226     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx, output_shape) != kSuccess) {
227       std::cerr << "Set kernel arg" << arg_idx << "failed.";
228       FreeWeight();
229       return kLiteError;
230     }
231 
232     std::cout << kernel_name_ << " Prepare Done!" << std::endl;
233     return kSuccess;
234   }
235 
236   // Execute is called to compute.
Execute()237   int Execute() override {
238     if (inputs_.size() != kDimSize2) {
239       return kLiteParamInvalid;
240     }
241     PreProcess();
242     std::cout << this->name() << " Running!" << std::endl;
243     auto input_0_ptr = weight_ptrs_[0] == nullptr ? inputs_[0].MutableData() : weight_ptrs_[0];
244     auto input_1_ptr = weight_ptrs_[1] == nullptr ? inputs_[1].MutableData() : weight_ptrs_[1];
245     int arg_idx = 0;
246     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, input_0_ptr) != kSuccess) {
247       std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
248       return kLiteError;
249     }
250     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, input_1_ptr) != kSuccess) {
251       std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
252       return kLiteError;
253     }
254     if (opencl_runtime_.SetKernelArg(kernel_, arg_idx++, outputs_[0].MutableData()) != kSuccess) {
255       std::cerr << "Set kernel arg" << arg_idx - 1 << "failed.";
256       return kLiteError;
257     }
258     if (opencl_runtime_.RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != kSuccess) {
259       std::cerr << "Run kernel failed.";
260       return kLiteError;
261     }
262 
263     return kSuccess;
264   }
265 
CheckSpecs()266   int CheckSpecs() {
267     for (auto &tensor : inputs_) {
268       if (tensor.DataType() != DataType::kNumberTypeFloat32 && tensor.DataType() != DataType::kNumberTypeFloat16) {
269         std::cerr << "ArithmeticOpenCLKernel only support fp32/fp16 input";
270         return kLiteError;
271       }
272     }
273     for (auto &tensor : outputs_) {
274       if (tensor.DataType() != DataType::kNumberTypeFloat32 && tensor.DataType() != DataType::kNumberTypeFloat16) {
275         std::cerr << "ArithmeticOpenCLKernel only support fp32/fp16 output";
276         return kLiteError;
277       }
278     }
279 
280     if (inputs_.size() != kDimSize2 || outputs_.size() != 1) {
281       std::cerr << "in size: " << inputs_.size() << ", out size: " << outputs_.size();
282       return kLiteError;
283     }
284 
285     for (int i = 0; i < inputs_.size(); ++i) {
286       auto &in_tensor = inputs_.at(i);
287       if (!in_tensor.IsConst()) {
288         if (fp16_enable_ && in_tensor.DataType() == mindspore::DataType::kNumberTypeFloat32) {
289           std::cerr << "Inputs data type error, expectation kNumberTypeFloat16 but kNumberTypeFloat32.";
290           return kLiteError;
291         } else if (!fp16_enable_ && in_tensor.DataType() == mindspore::DataType::kNumberTypeFloat16) {
292           std::cerr << "Inputs data type error, expectation kNumberTypeFloat32 but kNumberTypeFloat16.";
293           return kLiteError;
294         }
295       }
296     }
297 
298     return kSuccess;
299   }
300 
301   // Resize is used to update some parameters if current node can change along with inputs.
ReSize()302   int ReSize() override {
303     if (CheckOutputs(outputs_) == kSuccess) {
304       return kSuccess;
305     }
306     auto status =
307       registry::RegisterKernelInterface::GetKernelInterface("", primitive_)->Infer(&inputs_, &outputs_, primitive_);
308     if (status != kSuccess) {
309       std::cerr << "infer failed." << std::endl;
310       return kLiteError;
311     }
312     auto ret = Prepare();
313     if (ret != kSuccess) {
314       std::cerr << "ReSize failed for kernel prepare!";
315       return ret;
316     }
317     return kSuccess;
318   }
319 
320  private:
321   const bool fp16_enable_;
322   cl::Kernel kernel_;
323   cl::Event event_;
324   cl::NDRange global_range_{cl::NullRange};
325   cl::NDRange local_range_{cl::NullRange};
326   std::vector<void *> weight_ptrs_;
327   registry::opencl::OpenCLRuntimeWrapper opencl_runtime_;
328 
PreProcess()329   int PreProcess() {
330     int ret;
331     ret = ReSize();
332     if (ret != kSuccess) {
333       return ret;
334     }
335     for (auto i = 0; i < outputs_.size(); ++i) {
336       auto *output = &outputs_.at(i);
337       auto img_info = GpuTensorInfo(output, &opencl_runtime_);
338       auto allocator = output->allocator();
339       if (allocator == nullptr) {
340         std::cerr << "The output tensor of OpenCL kernel must have an allocator.";
341         return kLiteError;
342       }
343       auto data_ptr = allocator->Malloc(img_info.width, img_info.height, output->DataType());
344       if (data_ptr == nullptr) {
345         std::cerr << "Malloc data failed";
346         return kLiteError;
347       }
348       output->SetData(data_ptr);
349     }
350     return kSuccess;
351   }
352 
CheckOutputs(const std::vector<mindspore::MSTensor> & outputs)353   int CheckOutputs(const std::vector<mindspore::MSTensor> &outputs) {
354     for (auto &output : outputs) {
355       auto output_shape = output.Shape();
356       if (std::find(output_shape.begin(), output_shape.end(), -1) != output_shape.end()) {
357         return kLiteInferInvalid;
358       }
359     }
360     return kSuccess;
361   }
362 
PackNHWCToNHWC4(void * src,void * dst,bool src_is_fp16,bool dst_is_fp16,const GpuTensorInfo & tensor,mindspore::DataType data_type)363   void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
364                        mindspore::DataType data_type) {
365     auto src_fp16 = static_cast<float16_t *>(src);
366     auto src_fp32 = static_cast<float32_t *>(src);
367     auto src_int32 = static_cast<int32_t *>(src);
368     auto dst_fp16 = static_cast<float16_t *>(dst);
369     auto dst_fp32 = static_cast<float32_t *>(dst);
370     auto dst_int32 = static_cast<int32_t *>(dst);
371     for (int n = 0, src_idx = 0; n < tensor.N; n++) {
372       for (int h = 0; h < tensor.H; ++h) {
373         for (int w = 0; w < tensor.W; ++w) {
374           for (int c = 0; c < tensor.C; ++c, ++src_idx) {
375             int dst_idx = ((n * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
376             if (data_type == mindspore::DataType::kNumberTypeInt32) {
377               dst_int32[dst_idx] = src_int32[src_idx];
378             } else if (dst_is_fp16) {
379               dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
380             } else {
381               dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
382             }
383           }
384         }
385       }
386     }
387     // scalar
388     if (tensor.ElementsNum == 1) {
389       if (dst_is_fp16) {
390         dst_fp16[kDimIndex3] = dst_fp16[kDimIndex2] = dst_fp16[1] = dst_fp16[0];
391       } else {
392         dst_fp32[kDimIndex3] = dst_fp32[kDimIndex2] = dst_fp32[1] = dst_fp32[0];
393       }
394     }
395   }
396 
FreeWeight()397   void FreeWeight() {
398     auto allocator = opencl_runtime_.GetAllocator();
399     if (allocator == nullptr) {
400       std::cerr << "GetAllocator fail.";
401       return;
402     }
403     for (auto &weight_ptr : weight_ptrs_) {
404       if (weight_ptr != nullptr) {
405         allocator->Free(weight_ptr);
406         weight_ptr = nullptr;
407       }
408     }
409   }
410 };
411 
412 class CustomAddInfer : public kernel::KernelInterface {
413  public:
414   CustomAddInfer() = default;
415   ~CustomAddInfer() = default;
416 
Infer(std::vector<mindspore::MSTensor> * inputs,std::vector<mindspore::MSTensor> * outputs,const schema::Primitive * primitive)417   Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
418                const schema::Primitive *primitive) override {
419     (*outputs)[0].SetFormat((*inputs)[0].format());
420     (*outputs)[0].SetDataType((*inputs)[0].DataType());
421     (*outputs)[0].SetShape((*inputs)[0].Shape());
422     return kSuccess;
423   }
424 };
425 
426 namespace {
CustomAddCreator(const std::vector<MSTensor> & inputs,const std::vector<MSTensor> & outputs,const schema::Primitive * primitive,const mindspore::Context * ctx)427 std::shared_ptr<kernel::Kernel> CustomAddCreator(const std::vector<MSTensor> &inputs,
428                                                  const std::vector<MSTensor> &outputs,
429                                                  const schema::Primitive *primitive, const mindspore::Context *ctx) {
430   bool fp16_enable = false;
431 
432   std::cout << "using fp32 add.\n" << std::endl;
433   return std::make_shared<CustomAddKernel>(inputs, outputs, primitive, ctx, fp16_enable);
434 }
435 
CustomAddInferCreator()436 std::shared_ptr<kernel::KernelInterface> CustomAddInferCreator() { return std::make_shared<CustomAddInfer>(); }
437 }  // namespace
438 
439 REGISTER_CUSTOM_KERNEL_INTERFACE(BuiltInTest, Custom_Add, CustomAddInferCreator)
440 // Register custom “Custom_Add” operator
441 REGISTER_CUSTOM_KERNEL(GPU, BuiltInTest, kFloat32, Custom_Add, CustomAddCreator)
442 
443 class TestGPURegistryCustomOp : public mindspore::CommonTest {
444  public:
445   TestGPURegistryCustomOp() = default;
446 };
447 
TEST_F(TestGPURegistryCustomOp,TestGPUCustomAdd)448 TEST_F(TestGPURegistryCustomOp, TestGPUCustomAdd) {
449   auto meta_graph = std::make_shared<schema::MetaGraphT>();
450   meta_graph->name = "graph";
451 
452   auto node = std::make_unique<schema::CNodeT>();
453   node->inputIndex = {0, 1};
454   node->outputIndex = {2};
455   node->primitive = std::make_unique<schema::PrimitiveT>();
456   node->primitive->value.type = schema::PrimitiveType_Custom;
457   auto primitive = new schema::CustomT;
458   primitive->type = "Custom_Add";
459   node->primitive->value.value = primitive;
460   node->name = "Add";
461   meta_graph->nodes.emplace_back(std::move(node));
462   meta_graph->inputIndex = {0, 1};
463   meta_graph->outputIndex = {2};
464 
465   auto input0 = std::make_unique<schema::TensorT>();
466   input0->nodeType = lite::NodeType_Parameter;
467   input0->format = schema::Format_NHWC;
468   input0->dataType = TypeId::kNumberTypeFloat32;
469   input0->dims = {1, 28, 28, 3};
470   input0->offset = -1;
471   meta_graph->allTensors.emplace_back(std::move(input0));
472 
473   auto weight = std::make_unique<schema::TensorT>();
474   weight->nodeType = lite::NodeType_ValueNode;
475   weight->format = schema::Format_NHWC;
476   weight->dataType = TypeId::kNumberTypeFloat32;
477   weight->dims = {1, 28, 28, 3};
478 
479   weight->offset = -1;
480   meta_graph->allTensors.emplace_back(std::move(weight));
481 
482   auto output = std::make_unique<schema::TensorT>();
483   output->nodeType = lite::NodeType_Parameter;
484   output->format = schema::Format_NHWC;
485   output->dataType = TypeId::kNumberTypeFloat32;
486   output->offset = -1;
487   meta_graph->allTensors.emplace_back(std::move(output));
488 
489   flatbuffers::FlatBufferBuilder builder(1024);
490   auto offset = schema::MetaGraph::Pack(builder, meta_graph.get());
491   builder.Finish(offset);
492   schema::FinishMetaGraphBuffer(builder, offset);
493   size_t size = builder.GetSize();
494   const char *content = reinterpret_cast<char *>(builder.GetBufferPointer());
495 
496   // create a context
497   auto context = std::make_shared<mindspore::Context>();
498   context->SetThreadNum(1);
499   context->SetEnableParallel(false);
500   context->SetThreadAffinity(lite::HIGHER_CPU);
501   auto &device_list = context->MutableDeviceInfo();
502 
503   std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
504   device_info->SetEnableFP16(false);
505   device_list.push_back(device_info);
506 
507   std::shared_ptr<GPUDeviceInfo> provider_gpu_device_info = std::make_shared<GPUDeviceInfo>();
508   provider_gpu_device_info->SetEnableFP16(false);
509   provider_gpu_device_info->SetProviderDevice("GPU");
510   provider_gpu_device_info->SetProvider("BuiltInTest");
511   device_list.push_back(provider_gpu_device_info);
512 
513   // build a model
514   auto model = std::make_shared<mindspore::Model>();
515   auto ret = model->Build(content, size, kMindIR_Lite, context);
516   ASSERT_EQ(kSuccess, ret.StatusCode());
517   auto inputs = model->GetInputs();
518   ASSERT_EQ(inputs.size(), 2);
519   auto inTensor = inputs.front();
520   auto impl = inTensor.impl();
521   ASSERT_NE(nullptr, impl);
522   float *in0_data = static_cast<float *>(inTensor.MutableData());
523   in0_data[0] = 10.0f;
524   auto inTensor1 = inputs.back();
525   impl = inTensor1.impl();
526   ASSERT_NE(nullptr, impl);
527   float *in1_data = static_cast<float *>(inTensor1.MutableData());
528   in1_data[0] = 20.0f;
529   std::vector<mindspore::MSTensor> outputs;
530   ret = model->Predict(inputs, &outputs);
531   ASSERT_EQ(kSuccess, ret.StatusCode());
532   ASSERT_EQ(outputs.size(), 1);
533   impl = outputs.front().impl();
534   ASSERT_NE(nullptr, impl);
535   ASSERT_EQ(28 * 28 * 3, outputs.front().ElementNum());
536   ASSERT_EQ(DataType::kNumberTypeFloat32, outputs.front().DataType());
537   auto *outData = reinterpret_cast<const float *>(outputs.front().Data().get());
538   ASSERT_NE(nullptr, outData);
539   ASSERT_EQ(30.0f, outData[0]);
540   MS_LOG(INFO) << "Register add op test pass.";
541 }
542 }  // namespace mindspore
543