• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <set>
18 #include <string>
19 #include <map>
20 #include <algorithm>
21 #include "include/errorcode.h"
22 #include "src/litert/kernel_registry.h"
23 #include "src/litert/kernel/opencl/kernel/reduce.h"
24 #include "src/litert/kernel/opencl/utils.h"
25 #include "src/litert/kernel/opencl/cl/reduce.cl.inc"
26 
27 using mindspore::kernel::KERNEL_ARCH::kGPU;
28 using mindspore::lite::KernelRegistrar;
29 using mindspore::lite::RET_ERROR;
30 using mindspore::lite::RET_NULL_PTR;
31 using mindspore::lite::RET_OK;
32 using mindspore::lite::RET_PARAM_INVALID;
33 using mindspore::schema::PrimitiveType_ReduceFusion;
34 using mindspore::schema::ReduceMode;
35 using mindspore::schema::ReduceMode_ReduceMax;
36 using mindspore::schema::ReduceMode_ReduceMean;
37 using mindspore::schema::ReduceMode_ReduceMin;
38 using mindspore::schema::ReduceMode_ReduceProd;
39 using mindspore::schema::ReduceMode_ReduceSum;
40 using mindspore::schema::ReduceMode_ReduceSumSquare;
41 
42 namespace mindspore::kernel {
GetReduceTypeStr(int type)43 std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {
44   static const std::map<int, std::string> reduce_type2str{
45     {ReduceMode_ReduceMean, "Mean"}, {ReduceMode_ReduceSum, "Sum"},   {ReduceMode_ReduceMin, "Min"},
46     {ReduceMode_ReduceMax, "Max"},   {ReduceMode_ReduceProd, "Prod"}, {ReduceMode_ReduceSumSquare, "SumSquare"}};
47   auto result_iter = reduce_type2str.find(type);
48   if (result_iter != reduce_type2str.end()) {
49     return result_iter->second;
50   }
51   return "";
52 }
53 
GenC4Mask()54 cl_float4 ReduceOpenCLKernel::GenC4Mask() {
55   auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
56   int last_c4 = inShape.C % C4NUM;
57   if (last_c4 == 0) last_c4 = C4NUM;
58   static const std::map<int, float> reduce_type2init{
59     {ReduceMode_ReduceMean, 0.f},     {ReduceMode_ReduceSum, 0.f},  {ReduceMode_ReduceMin, 10000.f},
60     {ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}};
61   float init_float = reduce_type2init.find(reduce_param->mode_)->second;
62   cl_float4 mask = {0.f, 0.f, 0.f, 0.f};
63   for (int i = 0; i < last_c4; i++) {
64     mask.s[C4NUM - i - 1] = init_float;
65   }
66   return mask;
67 }
68 
IsHWCReduce()69 bool ReduceOpenCLKernel::IsHWCReduce() {
70   return !reduce_axes_[kNHWC_N] && reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && reduce_axes_[kNHWC_C];
71 }
72 
IsHWReduce()73 bool ReduceOpenCLKernel::IsHWReduce() {
74   return !reduce_axes_[kNHWC_N] && reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && !reduce_axes_[kNHWC_C];
75 }
76 
IsWCReduce()77 bool ReduceOpenCLKernel::IsWCReduce() {
78   return !reduce_axes_[kNHWC_N] && !reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && reduce_axes_[kNHWC_C];
79 }
80 
IsHReduce()81 bool ReduceOpenCLKernel::IsHReduce() {
82   return !reduce_axes_[kNHWC_N] && reduce_axes_[kNHWC_H] && !reduce_axes_[kNHWC_W] && !reduce_axes_[kNHWC_C];
83 }
84 
IsWReduce()85 bool ReduceOpenCLKernel::IsWReduce() {
86   return !reduce_axes_[kNHWC_N] && !reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && !reduce_axes_[kNHWC_C];
87 }
88 
IsCReduce()89 bool ReduceOpenCLKernel::IsCReduce() {
90   return !reduce_axes_[kNHWC_N] && !reduce_axes_[kNHWC_H] && !reduce_axes_[kNHWC_W] && reduce_axes_[kNHWC_C];
91 }
92 
SetShapeSizeIs0Axes()93 int ReduceOpenCLKernel::SetShapeSizeIs0Axes() {
94   // axes is input tensor
95   auto *axes_tensor = in_tensors_.at(1);
96   auto input_shape_size = in_tensors_.at(0)->shape().size();
97   if (input_shape_size == 0) {
98     return RET_ERROR;
99   }
100 
101   CHECK_NULL_RETURN(axes_tensor->data());
102 
103   auto reduction_indices = reinterpret_cast<int *>(axes_tensor->data())[0];
104   if (reduction_indices == -1) {
105     reduce_axes_[kNHWC_H] = true;
106     reduce_axes_[kNHWC_W] = true;
107     reduce_axes_[kNHWC_C] = true;
108   } else if (reduction_indices == kNHWC_H || reduction_indices == kNHWC_W || reduction_indices == kNHWC_C) {
109     reduction_indices = reduction_indices + (C4NUM % input_shape_size);
110     reduce_axes_[reduction_indices] = true;
111   } else {
112     MS_LOG(WARNING) << "in Reduce: axes tensor's reduction_indices should be -1, 1, 2, 3";
113     return RET_ERROR;
114   }
115   return RET_OK;
116 }
117 
SetShapeSizeIs1Axes()118 int ReduceOpenCLKernel::SetShapeSizeIs1Axes() {
119   // axes is input tensor
120   // get num_axes
121   auto *axes_tensor = in_tensors_.at(1);
122   int num_axes = axes_tensor->shape().front();
123   // check axes tensor
124   if (CheckParamLikeTensor("Reduce", "axes", axes_tensor, kNumberTypeInt32, {num_axes}) != RET_OK) {
125     return RET_ERROR;
126   }
127   // copy axes from tensor to private var
128   CHECK_NULL_RETURN(axes_tensor->data());
129   for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) {
130     axes_[i] = reinterpret_cast<int *>(axes_tensor->data())[i];
131   }
132   if (num_axes > C2NUM || num_axes < C1NUM) {
133     MS_LOG(WARNING) << "Unsupported reduce num axes " << num_axes;
134     return RET_PARAM_INVALID;
135   }
136 
137   for (int i = 0; i < num_axes; i++) {
138     int axis = axes_[i];
139     axis = inShape.AlignAxis(axis);
140     reduce_axes_[axis] = true;
141   }
142   if (num_axes == 1) {
143     if (reduce_axes_[kNHWC_H] && inShape.W == 1) {
144       reduce_axes_[kNHWC_W] = true;
145     } else if (reduce_axes_[kNHWC_W]) {
146       if (inShape.H == 1) {
147         reduce_axes_[kNHWC_H] = true;
148       } else if (inShape.C == 1) {
149         reduce_axes_[kNHWC_C] = true;
150       }
151     } else if (reduce_axes_[kNHWC_C] && inShape.W == 1) {
152       reduce_axes_[kNHWC_C] = true;
153     }
154   }
155   return RET_OK;
156 }
157 
SetAxes()158 int ReduceOpenCLKernel::SetAxes() {
159   auto *axes_tensor = in_tensors_.at(1);
160 
161   if (axes_tensor->shape().size() == 0) {
162     return SetShapeSizeIs0Axes();
163   } else if (axes_tensor->shape().size() == 1) {
164     return SetShapeSizeIs1Axes();
165   } else {
166     MS_LOG(WARNING) << "in Reduce: axes tensor's ndim should be 0 or 1.";
167     return RET_ERROR;
168   }
169 
170   return RET_OK;
171 }
172 
IsReduceAxesSupport()173 int ReduceOpenCLKernel::IsReduceAxesSupport() {
174   if (!IsHWReduce() && !IsWCReduce() && !IsHReduce() && !IsWReduce() && !IsCReduce() && !IsHWCReduce()) {
175     MS_LOG(WARNING) << "Unsupported reduce axes";
176     return RET_PARAM_INVALID;
177   }
178   return RET_OK;
179 }
180 
CheckSpecs()181 int ReduceOpenCLKernel::CheckSpecs() {
182   if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 || out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
183     MS_LOG(WARNING) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
184     return RET_ERROR;
185   }
186   auto input = in_tensors_.at(0);
187   CHECK_NULL_RETURN(input);
188   if (input->shape()[0] > DIMENSION_1D) {
189     MS_LOG(WARNING) << "reduce op only support n = 1";
190     return RET_PARAM_INVALID;
191   }
192   inShape = GpuTensorInfo(in_tensors_[0]);
193   auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
194   CHECK_NULL_RETURN(reduce_param);
195   if (GetReduceTypeStr(reduce_param->mode_).empty()) {
196     MS_LOG(WARNING) << "not supported reduce type:" << reduce_param->mode_;
197     return RET_PARAM_INVALID;
198   }
199   auto ret = SetAxes();
200   if (ret != RET_OK) {
201     return ret;
202   }
203 
204   if (IsReduceAxesSupport() != RET_OK) {
205     return RET_PARAM_INVALID;
206   }
207   if (IsWCReduce() && !reduce_param->keep_dims_) {
208     MS_LOG(WARNING) << "reduce axis (2,3) should keep dims";
209     return RET_PARAM_INVALID;
210   }
211   return RET_OK;
212 }
213 
Prepare()214 int ReduceOpenCLKernel::Prepare() {
215   auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
216   if (reduce_param == nullptr) {
217     return RET_NULL_PTR;
218   }
219 
220   std::string kernel_name;
221   use_local_ = false;
222   kernel_name = "Global";
223   if (IsWCReduce() && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) {
224     use_local_ = true;
225     kernel_name = "Local";
226   }
227   if (IsHWReduce() && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) {
228     use_local_ = true;
229     kernel_name = "Local";
230   }
231 
232   if (IsHWCReduce()) {
233     kernel_name += "HWC";
234   } else if (IsWCReduce()) {
235     kernel_name += "WC";
236   } else if (IsHWReduce()) {
237     kernel_name += "HW";
238   } else if (IsHReduce()) {
239     kernel_name += "H";
240   } else if (IsWReduce()) {
241     kernel_name += "W";
242   } else if (IsCReduce()) {
243     kernel_name += "C";
244   }
245   kernel_name += GetReduceTypeStr(reduce_param->mode_);
246   std::string source = reduce_source;
247   const std::string program_name = "Reduce";
248   if (!ocl_runtime_->LoadSource(program_name, source)) {
249     MS_LOG(ERROR) << "Load source failed.";
250     return RET_ERROR;
251   }
252   auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
253   auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
254   if (ret != RET_OK) {
255     MS_LOG(ERROR) << "Build kernel failed.";
256     return ret;
257   }
258   if (SetConstArgs() != RET_OK) {
259     MS_LOG(ERROR) << "SeConstArgs failed.";
260     return RET_ERROR;
261   }
262   (void)SetGlobalLocal();
263   MS_LOG(DEBUG) << kernel_name << " Init Done!";
264   return RET_OK;
265 }
266 
SetConstArgs()267 int ReduceOpenCLKernel::SetConstArgs() {
268   int h = inShape.H;
269   int w = inShape.W;
270   int c = inShape.C;
271   int c4 = UP_DIV(c, C4NUM);
272   cl_int4 size = {h, w, c4, c};
273   int arg_idx = 2;
274   if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
275     MS_LOG(ERROR) << "SetKernelArg failed.";
276     return RET_ERROR;
277   }
278   if (IsWCReduce() || IsCReduce()) {
279     if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()) != CL_SUCCESS) {
280       MS_LOG(ERROR) << "SetKernelArg failed.";
281       return RET_ERROR;
282     }
283   }
284   return RET_OK;
285 }
286 
SetGlobalLocal()287 int ReduceOpenCLKernel::SetGlobalLocal() {
288   int h = inShape.H;
289   int w = inShape.W;
290   int c4 = inShape.Slice;
291   local_size_ = {};
292   if (use_local_) {
293     local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
294   }
295   if (IsHWCReduce()) {
296     global_size_ = {1, 1, 1};
297   } else if (IsHWReduce()) {
298     global_size_ = {static_cast<size_t>(c4), 1, 1};
299   } else if (IsWCReduce()) {
300     global_size_ = {static_cast<size_t>(h), 1, 1};
301   } else if (IsHReduce()) {
302     global_size_ = {static_cast<size_t>(w), static_cast<size_t>(c4)};
303   } else if (IsWReduce()) {
304     global_size_ = {static_cast<size_t>(h), static_cast<size_t>(c4)};
305   } else if (IsCReduce() && !use_local_) {
306     global_size_ = {static_cast<size_t>(h), static_cast<size_t>(w)};
307   } else {
308     global_size_ = {1, 1, 1};
309   }
310 
311   AlignGlobalLocal(global_size_, local_size_);
312 
313   return RET_OK;
314 }
315 
Tune()316 int ReduceOpenCLKernel::Tune() {
317   if (use_local_) {
318     return RET_OK;
319   }
320   return OpenCLKernel::Tune();
321 }
322 
Run()323 int ReduceOpenCLKernel::Run() {
324   MS_LOG(DEBUG) << this->name() << " Running!";
325   int arg_idx = 0;
326   if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data()) != CL_SUCCESS) {
327     MS_LOG(ERROR) << "SetKernelArg failed.";
328     return RET_ERROR;
329   }
330   if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data()) != CL_SUCCESS) {
331     MS_LOG(ERROR) << "SetKernelArg failed.";
332     return RET_ERROR;
333   }
334   if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
335     MS_LOG(ERROR) << "RunKernel failed.";
336     return RET_ERROR;
337   }
338   return RET_OK;
339 }
340 
// Register ReduceOpenCLKernel (via OpenCLKernelCreator) for the ReduceFusion
// primitive on the GPU arch, once for float32 and once for float16.
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
343 }  // namespace mindspore::kernel
344