1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <set>
18 #include <string>
19 #include <map>
20 #include <algorithm>
21 #include "include/errorcode.h"
22 #include "src/litert/kernel_registry.h"
23 #include "src/litert/kernel/opencl/kernel/reduce.h"
24 #include "src/litert/kernel/opencl/utils.h"
25 #include "src/litert/kernel/opencl/cl/reduce.cl.inc"
26
27 using mindspore::kernel::KERNEL_ARCH::kGPU;
28 using mindspore::lite::KernelRegistrar;
29 using mindspore::lite::RET_ERROR;
30 using mindspore::lite::RET_NULL_PTR;
31 using mindspore::lite::RET_OK;
32 using mindspore::lite::RET_PARAM_INVALID;
33 using mindspore::schema::PrimitiveType_ReduceFusion;
34 using mindspore::schema::ReduceMode;
35 using mindspore::schema::ReduceMode_ReduceMax;
36 using mindspore::schema::ReduceMode_ReduceMean;
37 using mindspore::schema::ReduceMode_ReduceMin;
38 using mindspore::schema::ReduceMode_ReduceProd;
39 using mindspore::schema::ReduceMode_ReduceSum;
40 using mindspore::schema::ReduceMode_ReduceSumSquare;
41
42 namespace mindspore::kernel {
GetReduceTypeStr(int type)43 std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {
44 static const std::map<int, std::string> reduce_type2str{
45 {ReduceMode_ReduceMean, "Mean"}, {ReduceMode_ReduceSum, "Sum"}, {ReduceMode_ReduceMin, "Min"},
46 {ReduceMode_ReduceMax, "Max"}, {ReduceMode_ReduceProd, "Prod"}, {ReduceMode_ReduceSumSquare, "SumSquare"}};
47 auto result_iter = reduce_type2str.find(type);
48 if (result_iter != reduce_type2str.end()) {
49 return result_iter->second;
50 }
51 return "";
52 }
53
GenC4Mask()54 cl_float4 ReduceOpenCLKernel::GenC4Mask() {
55 auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
56 int last_c4 = inShape.C % C4NUM;
57 if (last_c4 == 0) last_c4 = C4NUM;
58 static const std::map<int, float> reduce_type2init{
59 {ReduceMode_ReduceMean, 0.f}, {ReduceMode_ReduceSum, 0.f}, {ReduceMode_ReduceMin, 10000.f},
60 {ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}};
61 float init_float = reduce_type2init.find(reduce_param->mode_)->second;
62 cl_float4 mask = {0.f, 0.f, 0.f, 0.f};
63 for (int i = 0; i < last_c4; i++) {
64 mask.s[C4NUM - i - 1] = init_float;
65 }
66 return mask;
67 }
68
IsHWCReduce()69 bool ReduceOpenCLKernel::IsHWCReduce() {
70 return !reduce_axes_[kNHWC_N] && reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && reduce_axes_[kNHWC_C];
71 }
72
IsHWReduce()73 bool ReduceOpenCLKernel::IsHWReduce() {
74 return !reduce_axes_[kNHWC_N] && reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && !reduce_axes_[kNHWC_C];
75 }
76
IsWCReduce()77 bool ReduceOpenCLKernel::IsWCReduce() {
78 return !reduce_axes_[kNHWC_N] && !reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && reduce_axes_[kNHWC_C];
79 }
80
IsHReduce()81 bool ReduceOpenCLKernel::IsHReduce() {
82 return !reduce_axes_[kNHWC_N] && reduce_axes_[kNHWC_H] && !reduce_axes_[kNHWC_W] && !reduce_axes_[kNHWC_C];
83 }
84
IsWReduce()85 bool ReduceOpenCLKernel::IsWReduce() {
86 return !reduce_axes_[kNHWC_N] && !reduce_axes_[kNHWC_H] && reduce_axes_[kNHWC_W] && !reduce_axes_[kNHWC_C];
87 }
88
IsCReduce()89 bool ReduceOpenCLKernel::IsCReduce() {
90 return !reduce_axes_[kNHWC_N] && !reduce_axes_[kNHWC_H] && !reduce_axes_[kNHWC_W] && reduce_axes_[kNHWC_C];
91 }
92
SetShapeSizeIs0Axes()93 int ReduceOpenCLKernel::SetShapeSizeIs0Axes() {
94 // axes is input tensor
95 auto *axes_tensor = in_tensors_.at(1);
96 auto input_shape_size = in_tensors_.at(0)->shape().size();
97 if (input_shape_size == 0) {
98 return RET_ERROR;
99 }
100
101 CHECK_NULL_RETURN(axes_tensor->data());
102
103 auto reduction_indices = reinterpret_cast<int *>(axes_tensor->data())[0];
104 if (reduction_indices == -1) {
105 reduce_axes_[kNHWC_H] = true;
106 reduce_axes_[kNHWC_W] = true;
107 reduce_axes_[kNHWC_C] = true;
108 } else if (reduction_indices == kNHWC_H || reduction_indices == kNHWC_W || reduction_indices == kNHWC_C) {
109 reduction_indices = reduction_indices + (C4NUM % input_shape_size);
110 reduce_axes_[reduction_indices] = true;
111 } else {
112 MS_LOG(WARNING) << "in Reduce: axes tensor's reduction_indices should be -1, 1, 2, 3";
113 return RET_ERROR;
114 }
115 return RET_OK;
116 }
117
SetShapeSizeIs1Axes()118 int ReduceOpenCLKernel::SetShapeSizeIs1Axes() {
119 // axes is input tensor
120 // get num_axes
121 auto *axes_tensor = in_tensors_.at(1);
122 int num_axes = axes_tensor->shape().front();
123 // check axes tensor
124 if (CheckParamLikeTensor("Reduce", "axes", axes_tensor, kNumberTypeInt32, {num_axes}) != RET_OK) {
125 return RET_ERROR;
126 }
127 // copy axes from tensor to private var
128 CHECK_NULL_RETURN(axes_tensor->data());
129 for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) {
130 axes_[i] = reinterpret_cast<int *>(axes_tensor->data())[i];
131 }
132 if (num_axes > C2NUM || num_axes < C1NUM) {
133 MS_LOG(WARNING) << "Unsupported reduce num axes " << num_axes;
134 return RET_PARAM_INVALID;
135 }
136
137 for (int i = 0; i < num_axes; i++) {
138 int axis = axes_[i];
139 axis = inShape.AlignAxis(axis);
140 reduce_axes_[axis] = true;
141 }
142 if (num_axes == 1) {
143 if (reduce_axes_[kNHWC_H] && inShape.W == 1) {
144 reduce_axes_[kNHWC_W] = true;
145 } else if (reduce_axes_[kNHWC_W]) {
146 if (inShape.H == 1) {
147 reduce_axes_[kNHWC_H] = true;
148 } else if (inShape.C == 1) {
149 reduce_axes_[kNHWC_C] = true;
150 }
151 } else if (reduce_axes_[kNHWC_C] && inShape.W == 1) {
152 reduce_axes_[kNHWC_C] = true;
153 }
154 }
155 return RET_OK;
156 }
157
SetAxes()158 int ReduceOpenCLKernel::SetAxes() {
159 auto *axes_tensor = in_tensors_.at(1);
160
161 if (axes_tensor->shape().size() == 0) {
162 return SetShapeSizeIs0Axes();
163 } else if (axes_tensor->shape().size() == 1) {
164 return SetShapeSizeIs1Axes();
165 } else {
166 MS_LOG(WARNING) << "in Reduce: axes tensor's ndim should be 0 or 1.";
167 return RET_ERROR;
168 }
169
170 return RET_OK;
171 }
172
IsReduceAxesSupport()173 int ReduceOpenCLKernel::IsReduceAxesSupport() {
174 if (!IsHWReduce() && !IsWCReduce() && !IsHReduce() && !IsWReduce() && !IsCReduce() && !IsHWCReduce()) {
175 MS_LOG(WARNING) << "Unsupported reduce axes";
176 return RET_PARAM_INVALID;
177 }
178 return RET_OK;
179 }
180
CheckSpecs()181 int ReduceOpenCLKernel::CheckSpecs() {
182 if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 || out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
183 MS_LOG(WARNING) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
184 return RET_ERROR;
185 }
186 auto input = in_tensors_.at(0);
187 CHECK_NULL_RETURN(input);
188 if (input->shape()[0] > DIMENSION_1D) {
189 MS_LOG(WARNING) << "reduce op only support n = 1";
190 return RET_PARAM_INVALID;
191 }
192 inShape = GpuTensorInfo(in_tensors_[0]);
193 auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
194 CHECK_NULL_RETURN(reduce_param);
195 if (GetReduceTypeStr(reduce_param->mode_).empty()) {
196 MS_LOG(WARNING) << "not supported reduce type:" << reduce_param->mode_;
197 return RET_PARAM_INVALID;
198 }
199 auto ret = SetAxes();
200 if (ret != RET_OK) {
201 return ret;
202 }
203
204 if (IsReduceAxesSupport() != RET_OK) {
205 return RET_PARAM_INVALID;
206 }
207 if (IsWCReduce() && !reduce_param->keep_dims_) {
208 MS_LOG(WARNING) << "reduce axis (2,3) should keep dims";
209 return RET_PARAM_INVALID;
210 }
211 return RET_OK;
212 }
213
Prepare()214 int ReduceOpenCLKernel::Prepare() {
215 auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
216 if (reduce_param == nullptr) {
217 return RET_NULL_PTR;
218 }
219
220 std::string kernel_name;
221 use_local_ = false;
222 kernel_name = "Global";
223 if (IsWCReduce() && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) {
224 use_local_ = true;
225 kernel_name = "Local";
226 }
227 if (IsHWReduce() && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) {
228 use_local_ = true;
229 kernel_name = "Local";
230 }
231
232 if (IsHWCReduce()) {
233 kernel_name += "HWC";
234 } else if (IsWCReduce()) {
235 kernel_name += "WC";
236 } else if (IsHWReduce()) {
237 kernel_name += "HW";
238 } else if (IsHReduce()) {
239 kernel_name += "H";
240 } else if (IsWReduce()) {
241 kernel_name += "W";
242 } else if (IsCReduce()) {
243 kernel_name += "C";
244 }
245 kernel_name += GetReduceTypeStr(reduce_param->mode_);
246 std::string source = reduce_source;
247 const std::string program_name = "Reduce";
248 if (!ocl_runtime_->LoadSource(program_name, source)) {
249 MS_LOG(ERROR) << "Load source failed.";
250 return RET_ERROR;
251 }
252 auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
253 auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
254 if (ret != RET_OK) {
255 MS_LOG(ERROR) << "Build kernel failed.";
256 return ret;
257 }
258 if (SetConstArgs() != RET_OK) {
259 MS_LOG(ERROR) << "SeConstArgs failed.";
260 return RET_ERROR;
261 }
262 (void)SetGlobalLocal();
263 MS_LOG(DEBUG) << kernel_name << " Init Done!";
264 return RET_OK;
265 }
266
SetConstArgs()267 int ReduceOpenCLKernel::SetConstArgs() {
268 int h = inShape.H;
269 int w = inShape.W;
270 int c = inShape.C;
271 int c4 = UP_DIV(c, C4NUM);
272 cl_int4 size = {h, w, c4, c};
273 int arg_idx = 2;
274 if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
275 MS_LOG(ERROR) << "SetKernelArg failed.";
276 return RET_ERROR;
277 }
278 if (IsWCReduce() || IsCReduce()) {
279 if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()) != CL_SUCCESS) {
280 MS_LOG(ERROR) << "SetKernelArg failed.";
281 return RET_ERROR;
282 }
283 }
284 return RET_OK;
285 }
286
SetGlobalLocal()287 int ReduceOpenCLKernel::SetGlobalLocal() {
288 int h = inShape.H;
289 int w = inShape.W;
290 int c4 = inShape.Slice;
291 local_size_ = {};
292 if (use_local_) {
293 local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
294 }
295 if (IsHWCReduce()) {
296 global_size_ = {1, 1, 1};
297 } else if (IsHWReduce()) {
298 global_size_ = {static_cast<size_t>(c4), 1, 1};
299 } else if (IsWCReduce()) {
300 global_size_ = {static_cast<size_t>(h), 1, 1};
301 } else if (IsHReduce()) {
302 global_size_ = {static_cast<size_t>(w), static_cast<size_t>(c4)};
303 } else if (IsWReduce()) {
304 global_size_ = {static_cast<size_t>(h), static_cast<size_t>(c4)};
305 } else if (IsCReduce() && !use_local_) {
306 global_size_ = {static_cast<size_t>(h), static_cast<size_t>(w)};
307 } else {
308 global_size_ = {1, 1, 1};
309 }
310
311 AlignGlobalLocal(global_size_, local_size_);
312
313 return RET_OK;
314 }
315
Tune()316 int ReduceOpenCLKernel::Tune() {
317 if (use_local_) {
318 return RET_OK;
319 }
320 return OpenCLKernel::Tune();
321 }
322
Run()323 int ReduceOpenCLKernel::Run() {
324 MS_LOG(DEBUG) << this->name() << " Running!";
325 int arg_idx = 0;
326 if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data()) != CL_SUCCESS) {
327 MS_LOG(ERROR) << "SetKernelArg failed.";
328 return RET_ERROR;
329 }
330 if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data()) != CL_SUCCESS) {
331 MS_LOG(ERROR) << "SetKernelArg failed.";
332 return RET_ERROR;
333 }
334 if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
335 MS_LOG(ERROR) << "RunKernel failed.";
336 return RET_ERROR;
337 }
338 return RET_OK;
339 }
340
// Register ReduceOpenCLKernel (created through OpenCLKernelCreator) as the kGPU implementation of
// PrimitiveType_ReduceFusion for both float32 and float16 data types.
341 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
342 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
343 } // namespace mindspore::kernel
344