/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <set>
#include <string>
#include <map>
#include <algorithm>
#include "include/errorcode.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/kernel/reduce.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/reduce.cl.inc"

using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::lite::RET_PARAM_INVALID;
using mindspore::schema::PrimitiveType_ReduceFusion;
using mindspore::schema::ReduceMode;
using mindspore::schema::ReduceMode_ReduceMax;
using mindspore::schema::ReduceMode_ReduceMean;
using mindspore::schema::ReduceMode_ReduceMin;
using mindspore::schema::ReduceMode_ReduceProd;
using mindspore::schema::ReduceMode_ReduceSum;
using mindspore::schema::ReduceMode_ReduceSumSquare;

namespace mindspore::kernel {
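// Maps a schema::ReduceMode enum value to the suffix used in the OpenCL
// kernel name (e.g. ReduceMode_ReduceMean -> "Mean"). Returns an empty
// string for unsupported modes, which CheckSpecs() treats as an error.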
std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {
  static const std::map<int, std::string> reduce_type2str{
    {ReduceMode_ReduceMean, "Mean"}, {ReduceMode_ReduceSum, "Sum"},   {ReduceMode_ReduceMin, "Min"},
    {ReduceMode_ReduceMax, "Max"},   {ReduceMode_ReduceProd, "Prod"}, {ReduceMode_ReduceSumSquare, "SumSquare"}};
  auto result_iter = reduce_type2str.find(type);
  if (result_iter != reduce_type2str.end()) {
    return result_iter->second;
  }
  return "";
}

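// Writes the reduce mode's identity value (0 for Sum/Mean/SumSquare, 1 for
// Prod, +/-10000 sentinels for Min/Max) into the top last_c4 lanes of a
// float4, where last_c4 is the channel count of the last C4 slice (C % 4,
// or 4 when C is a multiple of 4). The mask is passed to the WC/C kernels
// to handle the partially filled last channel slice. For example, with
// C = 6, last_c4 = 2, so the loop sets mask.s[3] and mask.s[2], giving
// {0, 0, init, init}.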
cl_float4 ReduceOpenCLKernel::GenC4Mask() {
  auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
  int last_c4 = inShape.C % C4NUM;
  if (last_c4 == 0) last_c4 = C4NUM;
  static const std::map<int, float> reduce_type2init{
    {ReduceMode_ReduceMean, 0.f},     {ReduceMode_ReduceSum, 0.f},  {ReduceMode_ReduceMin, 10000.f},
    {ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}};
  float init_float = reduce_type2init.find(reduce_param->mode_)->second;
  cl_float4 mask = {0.f, 0.f, 0.f, 0.f};
  for (int i = 0; i < last_c4; i++) {
    mask.s[C4NUM - i - 1] = init_float;
  }
  return mask;
}

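// reduce_axes_ is indexed in NHWC order: [0] = N, [1] = H, [2] = W, [3] = C.
// The three predicates below select which specialized kernel family is used.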
bool IsHWReduce(const bool *reduce_axes_) {
  return !reduce_axes_[0] && reduce_axes_[1] && reduce_axes_[2] && !reduce_axes_[3];
}

bool IsWCReduce(const bool *reduce_axes_) {
  return !reduce_axes_[0] && !reduce_axes_[1] && reduce_axes_[2] && reduce_axes_[3];
}

bool IsCReduce(const bool *reduce_axes_) {
  return !reduce_axes_[0] && !reduce_axes_[1] && !reduce_axes_[2] && reduce_axes_[3];
}

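// Reads the reduce axes from the second input tensor, normalizes them to
// NHWC positions via inShape.AlignAxis, and records them in reduce_axes_.
// A single-axis reduce over a degenerate neighbor dimension (size 1) is
// widened to a two-axis reduce so that it matches one of the supported
// HW/WC/C kernel variants.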
int ReduceOpenCLKernel::SetAxes() {
  // the axes arrive as the second input tensor
  // get num_axes
  int num_axes = 0;
  auto *axes_tensor = in_tensors_.at(1);
  if (axes_tensor->shape().size() != 1) {
    MS_LOG(ERROR) << "in Reduce: axes tensor's ndim should be 1.";
    return RET_ERROR;
  } else {
    num_axes = axes_tensor->shape().front();
  }
  // check axes tensor
  if (CheckParamLikeTensor("Reduce", "axes", axes_tensor, kNumberTypeInt32, {num_axes}) != RET_OK) {
    return RET_ERROR;
  }
  // copy axes from tensor to private var
  CHECK_NULL_RETURN(axes_tensor->data());
  for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) {
    axes_[i] = reinterpret_cast<int *>(axes_tensor->data())[i];
  }
  if (num_axes > 2 || num_axes < 1) {
    MS_LOG(ERROR) << "Unsupported reduce num axes " << num_axes;
    return RET_PARAM_INVALID;
  }

  for (int i = 0; i < num_axes; i++) {
    int axis = axes_[i];
    axis = inShape.AlignAxis(axis);
    reduce_axes_[axis] = true;
  }
  if (num_axes == 1) {
    if (reduce_axes_[1] && inShape.W == 1) {
      reduce_axes_[2] = true;
    } else if (reduce_axes_[2]) {
      if (inShape.H == 1) {
        reduce_axes_[1] = true;
      } else if (inShape.C == 1) {
        reduce_axes_[3] = true;
      }
    } else if (reduce_axes_[3] && inShape.W == 1) {
      // widen a C reduce to a WC reduce; W == 1 makes the extra axis a no-op
      reduce_axes_[2] = true;
    }
  }
  return RET_OK;
}

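// Validates tensor counts, batch size, reduce mode, and the axis pattern
// before the kernel is built. Only N == 1 inputs and HW/WC/C axis
// combinations are supported by the OpenCL implementation.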
int ReduceOpenCLKernel::CheckSpecs() {
  if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 || out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
    MS_LOG(WARNING) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
    return RET_ERROR;
  }
  auto input = in_tensors_.at(0);
  CHECK_NULL_RETURN(input);
  if (input->shape()[0] > DIMENSION_1D) {
    MS_LOG(WARNING) << "reduce op only supports N == 1";
    return RET_PARAM_INVALID;
  }
  inShape = GpuTensorInfo(in_tensors_[0]);
  auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
  CHECK_NULL_RETURN(reduce_param);
  if (GetReduceTypeStr(reduce_param->mode_).empty()) {
    MS_LOG(WARNING) << "unsupported reduce type: " << reduce_param->mode_;
    return RET_PARAM_INVALID;
  }
  auto ret = SetAxes();
  if (ret != RET_OK) {
    return ret;
  }
  hw_reduce_ = IsHWReduce(reduce_axes_);
  wc_reduce_ = IsWCReduce(reduce_axes_);
  c_reduce_ = IsCReduce(reduce_axes_);
  if (!hw_reduce_ && !wc_reduce_ && !c_reduce_) {
    MS_LOG(WARNING) << "Unsupported reduce axes";
    return RET_PARAM_INVALID;
  }
  if ((c_reduce_ || wc_reduce_) && !reduce_param->keep_dims_) {
    MS_LOG(WARNING) << "reduce over axes (2,3) requires keep_dims";
    return RET_PARAM_INVALID;
  }
  return RET_OK;
}

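// Assembles the kernel name from three parts: "Global"/"Local" (whether the
// reduction cooperates through local memory), the axis pattern
// ("HW"/"WC"/"C"), and the reduce-type suffix, e.g. "LocalWCMean". It then
// compiles the kernel and binds its constant arguments.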
int ReduceOpenCLKernel::Prepare() {
  auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
  if (reduce_param == nullptr) {
    return RET_NULL_PTR;
  }

  use_local_ = false;
  std::string kernel_name = "Global";
  if (wc_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) {
    use_local_ = true;
    kernel_name = "Local";
  }
  if (hw_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) {
    use_local_ = true;
    kernel_name = "Local";
  }
  if (wc_reduce_) {
    kernel_name += "WC";
  } else if (hw_reduce_) {
    kernel_name += "HW";
  } else if (c_reduce_) {
    kernel_name += "C";
  }
  kernel_name += GetReduceTypeStr(reduce_param->mode_);
  std::string source = reduce_source;
  const std::string program_name = "Reduce";
  if (!ocl_runtime_->LoadSource(program_name, source)) {
    MS_LOG(ERROR) << "Load source failed.";
    return RET_ERROR;
  }
  auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
  auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Build kernel failed.";
    return ret;
  }
  if (SetConstArgs() != RET_OK) {
    MS_LOG(ERROR) << "SetConstArgs failed.";
    return RET_ERROR;
  }
  SetGlobalLocal();
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
}

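// Sets the shape vector (and, for WC/C reduces, the C4 mask) as constant
// kernel arguments. Indices 0 and 1 are reserved for the input and output
// buffers, which are bound per invocation in Run().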
int ReduceOpenCLKernel::SetConstArgs() {
  int h = inShape.H;
  int w = inShape.W;
  int c = inShape.C;
  int c4 = UP_DIV(c, C4NUM);
  cl_int4 size = {h, w, c4, c};
  int arg_idx = 2;  // args 0 and 1 are the input/output buffers, set in Run()
  if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (wc_reduce_ || c_reduce_) {
    if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()) != CL_SUCCESS) {
      MS_LOG(ERROR) << "SetKernelArg failed.";
      return RET_ERROR;
    }
  }
  return RET_OK;
}

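// Picks the NDRange: c4 work-groups for HW reduces, h work-groups for WC
// reduces, and an h x w grid for non-local C reduces. Local variants add a
// LOCAL_CACHE_THREAD x LOCAL_CACHE_THREAD cooperative tile.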
void ReduceOpenCLKernel::SetGlobalLocal() {
  int h = inShape.H;
  int w = inShape.W;
  int c4 = inShape.Slice;
  local_size_ = {};
  if (use_local_) {
    local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
  }
  if (hw_reduce_) {
    global_size_ = {static_cast<size_t>(c4), 1, 1};
  } else if (wc_reduce_) {
    global_size_ = {static_cast<size_t>(h), 1, 1};
  } else if (c_reduce_ && !use_local_) {
    global_size_ = {static_cast<size_t>(h), static_cast<size_t>(w)};
  }
  AlignGlobalLocal(global_size_, local_size_);
}

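// Work-group tuning is skipped for the local-memory variants, which depend
// on the fixed local size chosen in SetGlobalLocal().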
int ReduceOpenCLKernel::Tune() {
  if (use_local_) {
    return RET_OK;
  }
  return OpenCLKernel::Tune();
}

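// Binds the input and output buffers to kernel args 0 and 1, then enqueues
// the kernel with the ranges computed in SetGlobalLocal().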
int ReduceOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  int arg_idx = 0;
  if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data()) != CL_SUCCESS) {
    MS_LOG(ERROR) << "SetKernelArg failed.";
    return RET_ERROR;
  }
  if (ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_) != RET_OK) {
    MS_LOG(ERROR) << "RunKernel failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ReduceFusion, OpenCLKernelCreator<ReduceOpenCLKernel>)
}  // namespace mindspore::kernel