• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <set>
18 #include <string>
19 #include <map>
20 #include <algorithm>
21 #include "include/errorcode.h"
22 #include "src/kernel_registry.h"
23 #include "src/runtime/kernel/opencl/kernel/reduce.h"
24 #include "src/runtime/kernel/opencl/utils.h"
25 #include "src/runtime/kernel/opencl/cl/reduce.cl.inc"
26 
27 using mindspore::kernel::KERNEL_ARCH::kGPU;
28 using mindspore::lite::KernelRegistrar;
29 using mindspore::lite::RET_ERROR;
30 using mindspore::lite::RET_NULL_PTR;
31 using mindspore::lite::RET_OK;
32 using mindspore::lite::RET_PARAM_INVALID;
33 using mindspore::schema::PrimitiveType_ReduceFusion;
34 using mindspore::schema::ReduceMode;
35 using mindspore::schema::ReduceMode_ReduceMax;
36 using mindspore::schema::ReduceMode_ReduceMean;
37 using mindspore::schema::ReduceMode_ReduceMin;
38 using mindspore::schema::ReduceMode_ReduceProd;
39 using mindspore::schema::ReduceMode_ReduceSum;
40 using mindspore::schema::ReduceMode_ReduceSumSquare;
41 
42 namespace mindspore::kernel {
GetReduceTypeStr(int type)43 std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {
44   static const std::map<int, std::string> reduce_type2str{
45     {ReduceMode_ReduceMean, "Mean"}, {ReduceMode_ReduceSum, "Sum"},   {ReduceMode_ReduceMin, "Min"},
46     {ReduceMode_ReduceMax, "Max"},   {ReduceMode_ReduceProd, "Prod"}, {ReduceMode_ReduceSumSquare, "SumSquare"}};
47   auto result_iter = reduce_type2str.find(type);
48   if (result_iter != reduce_type2str.end()) {
49     return result_iter->second;
50   }
51   return "";
52 }
53 
GenC4Mask()54 cl_float4 ReduceOpenCLKernel::GenC4Mask() {
55   auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
56   int last_c4 = inShape.C % C4NUM;
57   if (last_c4 == 0) last_c4 = C4NUM;
58   static const std::map<int, float> reduce_type2init{
59     {ReduceMode_ReduceMean, 0.f},     {ReduceMode_ReduceSum, 0.f},  {ReduceMode_ReduceMin, 10000.f},
60     {ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}};
61   float init_float = reduce_type2init.find(reduce_param->mode_)->second;
62   cl_float4 mask = {0.f, 0.f, 0.f, 0.f};
63   for (int i = 0; i < last_c4; i++) {
64     mask.s[C4NUM - i - 1] = init_float;
65   }
66   return mask;
67 }
68 
// True when the reduction covers exactly the H and W axes of the
// 4-element NHWC axis-flag array (N and C untouched).
bool IsHWReduce(const bool *reduce_axes_) {
  const bool batch = reduce_axes_[0];
  const bool height = reduce_axes_[1];
  const bool width = reduce_axes_[2];
  const bool channel = reduce_axes_[3];
  return height && width && !batch && !channel;
}
72 
// True when the reduction covers exactly the W and C axes of the
// 4-element NHWC axis-flag array (N and H untouched).
bool IsWCReduce(const bool *reduce_axes_) {
  const bool batch = reduce_axes_[0];
  const bool height = reduce_axes_[1];
  const bool width = reduce_axes_[2];
  const bool channel = reduce_axes_[3];
  return width && channel && !batch && !height;
}
76 
// True when the reduction covers only the C axis of the 4-element NHWC
// axis-flag array.
bool IsCReduce(const bool *reduce_axes_) {
  const bool channel_only = reduce_axes_[3];
  const bool any_spatial_or_batch = reduce_axes_[0] || reduce_axes_[1] || reduce_axes_[2];
  return channel_only && !any_spatial_or_batch;
}
80 
SetAxes()81 int ReduceOpenCLKernel::SetAxes() {
82   // axes is input tensor
83   // get num_axes
84   int num_axes = 0;
85   auto *axes_tensor = in_tensors_.at(1);
86   if (axes_tensor->shape().size() != 1) {
87     MS_LOG(ERROR) << "in Reduce: axes tensor's ndim should be 1.";
88     return RET_ERROR;
89   } else {
90     num_axes = axes_tensor->shape().front();
91   }
92   // check axes tensor
93   if (CheckParamLikeTensor("Reduce", "axes", axes_tensor, kNumberTypeInt32, {num_axes}) != RET_OK) {
94     return RET_ERROR;
95   }
96   // copy axes from tensor to private var
97   CHECK_NULL_RETURN(axes_tensor->data());
98   for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) {
99     axes_[i] = reinterpret_cast<int *>(axes_tensor->data())[i];
100   }
101   if (num_axes > 2 || num_axes < 1) {
102     MS_LOG(ERROR) << "Unsupported reduce num axes " << num_axes;
103     return RET_PARAM_INVALID;
104   }
105 
106   for (int i = 0; i < num_axes; i++) {
107     int axis = axes_[i];
108     axis = inShape.AlignAxis(axis);
109     reduce_axes_[axis] = true;
110   }
111   if (num_axes == 1) {
112     if (reduce_axes_[1] && inShape.W == 1) {
113       reduce_axes_[2] = true;
114     } else if (reduce_axes_[2]) {
115       if (inShape.H == 1) {
116         reduce_axes_[1] = true;
117       } else if (inShape.C == 1) {
118         reduce_axes_[3] = true;
119       }
120     } else if (reduce_axes_[3] && inShape.W == 1) {
121       reduce_axes_[3] = true;
122     }
123   }
124   return RET_OK;
125 }
126 
CheckSpecs()127 int ReduceOpenCLKernel::CheckSpecs() {
128   if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 || out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
129     MS_LOG(WARNING) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
130     return RET_ERROR;
131   }
132   auto input = in_tensors_.at(0);
133   CHECK_NULL_RETURN(input);
134   if (input->shape()[0] > DIMENSION_1D) {
135     MS_LOG(WARNING) << "reduce op only support n = 1";
136     return RET_PARAM_INVALID;
137   }
138   inShape = GpuTensorInfo(in_tensors_[0]);
139   auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
140   CHECK_NULL_RETURN(reduce_param);
141   if (GetReduceTypeStr(reduce_param->mode_).empty()) {
142     MS_LOG(WARNING) << "not supported reduce type:" << reduce_param->mode_;
143     return RET_PARAM_INVALID;
144   }
145   auto ret = SetAxes();
146   if (ret != RET_OK) {
147     return ret;
148   }
149   hw_reduce_ = IsHWReduce(reduce_axes_);
150   wc_reduce_ = IsWCReduce(reduce_axes_);
151   c_reduce_ = IsCReduce(reduce_axes_);
152   if (!hw_reduce_ && !wc_reduce_ && !c_reduce_) {
153     MS_LOG(WARNING) << "Unsupported reduce axes";
154     return RET_PARAM_INVALID;
155   }
156   if ((c_reduce_ || wc_reduce_) && !reduce_param->keep_dims_) {
157     MS_LOG(WARNING) << "reduce axis (2,3) should keep dims";
158     return RET_PARAM_INVALID;
159   }
160   return RET_OK;
161 }
162 
Prepare()163 int ReduceOpenCLKernel::Prepare() {
164   auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
165   if (reduce_param == nullptr) {
166     return RET_NULL_PTR;
167   }
168 
169   std::string kernel_name;
170   use_local_ = false;
171   kernel_name = "Global";
172   if (wc_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) {
173     use_local_ = true;
174     kernel_name = "Local";
175   }
176   if (hw_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) {
177     use_local_ = true;
178     kernel_name = "Local";
179   }
180   if (wc_reduce_) {
181     kernel_name += "WC";
182   } else if (hw_reduce_) {
183     kernel_name += "HW";
184   } else if (c_reduce_) {
185     kernel_name += "C";
186   }
187   kernel_name += GetReduceTypeStr(reduce_param->mode_);
188   std::string source = reduce_source;
189   const std::string program_name = "Reduce";
190   if (!ocl_runtime_->LoadSource(program_name, source)) {
191     MS_LOG(ERROR) << "Load source failed.";
192     return RET_ERROR;
193   }
194   auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
195   auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
196   if (ret != RET_OK) {
197     MS_LOG(ERROR) << "Build kernel failed.";
198     return ret;
199   }
200   if (SetConstArgs() != RET_OK) {
201     MS_LOG(ERROR) << "SeConstArgs failed.";
202     return RET_ERROR;
203   }
204   SetGlobalLocal();
205   MS_LOG(DEBUG) << kernel_name << " Init Done!";
206   return RET_OK;
207 }
208 
SetConstArgs()209 int ReduceOpenCLKernel::SetConstArgs() {
210   int h = inShape.H;
211   int w = inShape.W;
212   int c = inShape.C;
213   int c4 = UP_DIV(c, C4NUM);
214   cl_int4 size = {h, w, c4, c};
215   int arg_idx = 2;
216   if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
217     MS_LOG(ERROR) << "SetKernelArg failed.";
218     return RET_ERROR;
219   }
220   if (wc_reduce_ || c_reduce_) {
221     if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()) != CL_SUCCESS) {
222       MS_LOG(ERROR) << "SetKernelArg failed.";
223       return RET_ERROR;
224     }
225   }
226   return RET_OK;
227 }
228 
SetGlobalLocal()229 void ReduceOpenCLKernel::SetGlobalLocal() {
230   int h = inShape.H;
231   int w = inShape.W;
232   int c4 = inShape.Slice;
233   local_size_ = {};
234   if (use_local_) {
235     local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
236   }
237   if (hw_reduce_) {
238     global_size_ = {static_cast<size_t>(c4), 1, 1};
239   } else if (wc_reduce_) {
240     global_size_ = {static_cast<size_t>(h), 1, 1};
241   } else if (c_reduce_ && !use_local_) {
242     global_size_ = {static_cast<size_t>(h), static_cast<size_t>(w)};
243   }
244   AlignGlobalLocal(global_size_, local_size_);
245 }
246 
Tune()247 int ReduceOpenCLKernel::Tune() {
248   if (use_local_) {
249     return RET_OK;
250   }
251   return OpenCLKernel::Tune();
252 }
253 
Run()254 int ReduceOpenCLKernel::Run() {
255   MS_LOG(DEBUG) << this->name() << " Running!";
256   int arg_idx = 0;
257   if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data()) != CL_SUCCESS) {
258     MS_LOG(ERROR) << "SetKernelArg failed.";
259     return RET_ERROR;
260   }
261   if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data()) != CL_SUCCESS) {
262     MS_LOG(ERROR) << "SetKernelArg failed.";
263     return RET_ERROR;
264   }
265   if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
266     MS_LOG(ERROR) << "RunKernel failed.";
267     return RET_ERROR;
268   }
269   return RET_OK;
270 }
271 
272 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
273 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
274 }  // namespace mindspore::kernel
275