/**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "backend/kernel_compiler/cpu/cpu_kernel.h"

#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <thread>
#include <utility>

#include "common/thread_pool.h"
#include "utils/profile.h"
namespace mindspore {
namespace kernel {
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    (void)input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    (void)output_size_list_.emplace_back(tensor_size);
  }
}
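
// Worked example (illustrative comment only, not part of the build): a float32
// input with device shape {2, 3} runs std::accumulate over {2, 3} with the
// 4-byte type size as the initial value, i.e. 4 * 2 * 3 = 24 bytes. Using
// type_size as the init value also means scalar (empty-shape) tensors occupy
// one element rather than zero bytes.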

void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}

void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  MS_EXCEPTION_IF_NULL(shape);
  auto len = shape->size();
  if (len < 4) {
    for (size_t i = 0; i < 4 - len; ++i) {
      (void)shape->insert(shape->begin(), 1);
    }
  }
}

size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3;
  return offset;
}
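
// Worked example (illustrative comment only): for a 4-D row-major shape
// {N, C, H, W} = {2, 3, 4, 5}, the element at (n, c, h, w) = (1, 2, 3, 4) sits
// at offset 1 * 3 * 4 * 5 + 2 * 4 * 5 + 3 * 5 + 4 = 119. The shape is assumed
// to have been expanded to 4-D already (see ExpandDimsTo4 above).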

size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t result = 1;
  for (int j = 3; j > axis; --j) {
    result *= shape[j];
  }
  return result;
}
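
// Worked example (illustrative comment only): the fixed loop bound j = 3 means
// this helper assumes a 4-D shape, matching ExpandDimsTo4. For shape
// {2, 3, 4, 5} and axis = 1 it returns the element count below axis 1,
// i.e. 4 * 5 = 20.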

void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  MS_EXCEPTION_IF_NULL(element_num);
  (void)element_num->emplace_back(1);
  for (size_t i = shape.size() - 1; i > 0; --i) {
    accumulation *= shape[i];
    (void)element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}
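
// Worked example (illustrative comment only): for shape {2, 3, 4} the vector
// filled in is {12, 4, 1}, i.e. the row-major stride of each dimension.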

void CPUKernelUtils::ParallelFor(const CTask &task, size_t count, float block_size) {
  if (count == 0) {
    // Nothing to do; also avoids a divide-by-zero when computing once_compute_size.
    return;
  }
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    (void)tasks.emplace_back(block);
    start += once_compute_size;
  }
  (void)common::ThreadPool::GetInstance().SyncRun(tasks);
}
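
// Usage sketch (illustrative comment only, assuming a caller-owned buffer):
//   std::vector<float> out(n);
//   CPUKernelUtils::ParallelFor(
//     [&out](size_t start, size_t end) {
//       for (size_t i = start; i < end; ++i) {
//         out[i] = static_cast<float>(i) * 2.0f;
//       }
//     },
//     n, 1024.0f);  // aim for blocks of roughly 1024 elements each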

// Search for the best block_size to get the best thread num: 1 2 4 8 16 32
// (the last step is capped by the pool's thread limit, e.g. 23 threads).
// Each block_size runs 5 times to get an average cpu kernel cost time.
// If block_size[i] is slower than block_size[i-2], then we assume that
// block_size[i-2] is the best block_size.
void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info) {
  const size_t MAX_POW = 6;
  const size_t AVG_COUNT = 5;
  MS_EXCEPTION_IF_NULL(parallel_search_info);
  size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
  if (current_pow < MAX_POW) {
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      parallel_search_info->tmp_sum_cost_time = 0;
    }
    float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
    double start_time = GetTime();
    ParallelFor(task, count, block_size);
    double cost_time = GetTime() - start_time;
    parallel_search_info->tmp_sum_cost_time += cost_time;
    parallel_search_info->search_count++;
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
      if (parallel_search_info->min_cost_time > avg_time) {
        parallel_search_info->min_cost_time = avg_time;
        parallel_search_info->best_block_size = block_size;
        parallel_search_info->best_pow = current_pow;
      } else if (current_pow - parallel_search_info->best_pow >= 2) {
        parallel_search_info->search_count = AVG_COUNT * MAX_POW;
      }
    }
  } else {
    ParallelFor(task, count, parallel_search_info->best_block_size);
  }
}
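
// Usage sketch (illustrative comment only): the caller keeps one
// ParallelSearchInfo per kernel instance so the tuning state persists
// across launches.
//   ParallelSearchInfo search_info;  // member of the kernel, reused each step
//   for (int step = 0; step < num_steps; ++step) {
//     CPUKernelUtils::ParallelForAutoSearch(compute_task, n, &search_info);
//   }
// The first launches (at most 6 pows x 5 repeats = 30) probe block sizes of
// count / 2^pow; after that every launch reuses best_block_size.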

ActorThreadPool *GetActorMgrInnerThreadPool() {
  auto actor_manager = ActorMgr::GetActorMgrRef();
  MS_EXCEPTION_IF_NULL(actor_manager);
  auto thread_pool = actor_manager->GetActorThreadPool();
  // Initialize thread_pool if the environment is Windows or Ascend, since it may not have
  // been initialized in graph_scheduler.
  if (thread_pool == nullptr) {
    const size_t kMaxThreadNum = 23;
    size_t max_thread_num = std::thread::hardware_concurrency() - 1;
#if ENABLE_D || ENABLE_GPU
    const size_t kDeviceNum = 8;
    max_thread_num /= kDeviceNum;
#endif
    if (max_thread_num < 1) {
      max_thread_num = 1;
    }
    max_thread_num = max_thread_num < kMaxThreadNum ? max_thread_num : kMaxThreadNum;
    (void)actor_manager->Initialize(true, 0, max_thread_num);
    thread_pool = actor_manager->GetActorThreadPool();
    MS_EXCEPTION_IF_NULL(thread_pool);
  }
  return thread_pool;
}

// Use the thread pool of mindrt.
void ParallelLaunch(const CTask &task, size_t count, float block_size, Content content) {
  auto thread_pool = GetActorMgrInnerThreadPool();
  size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
  if (kernel_thread_num == 0) {
    MS_LOG(EXCEPTION) << "Actor inner pool has been initialized, but the kernel thread number is 0!";
  }
  if (count == 0) {
    // Nothing to do; also avoids a divide-by-zero when computing once_compute_size.
    return;
  }

  size_t thread_num = count < block_size * kernel_thread_num ? std::ceil(count / block_size) : kernel_thread_num;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  size_t task_num = count / once_compute_size;
  if (count % once_compute_size != 0) {
    task_num += 1;
  }
  auto func = [&](void *, int task_id, float, float) {
    size_t start = task_id * once_compute_size;
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    task(start, end);
    return common::SUCCESS;
  };
  (void)thread_pool->ParallelLaunch(func, content, task_num);
}
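
// Usage sketch (illustrative comment only): same shape as the ParallelFor
// sketch above, but dispatched on the mindrt actor pool. The content argument
// is forwarded to the pool untouched; passing nullptr here assumes Content is
// a plain pointer type with a null default, which callers should verify
// against cpu_kernel.h.
//   ParallelLaunch(
//     [&out](size_t start, size_t end) {
//       for (size_t i = start; i < end; ++i) out[i] += 1.0f;
//     },
//     n, 4096.0f, nullptr);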

void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
                              ParallelSearchInfo *parallel_search_info) {
  const size_t MAX_POW = 6;
  const size_t AVG_COUNT = 5;
  MS_EXCEPTION_IF_NULL(parallel_search_info);
  size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
  if (current_pow < MAX_POW) {
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      parallel_search_info->tmp_sum_cost_time = 0;
    }
    float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
    double start_time = GetTime();
    ParallelLaunch(task, count, block_size, content);
    double cost_time = GetTime() - start_time;
    parallel_search_info->tmp_sum_cost_time += cost_time;
    parallel_search_info->search_count++;
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
      if (parallel_search_info->min_cost_time > avg_time) {
        parallel_search_info->min_cost_time = avg_time;
        parallel_search_info->best_block_size = block_size;
        parallel_search_info->best_pow = current_pow;
      } else if (current_pow - parallel_search_info->best_pow >= 2) {
        parallel_search_info->search_count = AVG_COUNT * MAX_POW;
      }
    }
  } else {
    ParallelLaunch(task, count, parallel_search_info->best_block_size, content);
  }
}

std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t dim_row = 1;
  size_t dim_col = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (SizeToInt(i) < axis) {
      dim_row *= shape[i];
    } else {
      dim_col *= shape[i];
    }
  }
  // Thanks to copy elision (https://en.cppreference.com/w/cpp/language/copy_elision),
  // returning the vector by value constructs it in place; no extra copy or move occurs.
  return std::vector<size_t>{dim_row, dim_col};
}
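
// Worked example (illustrative comment only): shape {2, 3, 4} with axis = 1
// flattens to {2, 12}; with axis = -1 (normalized to 2) it flattens to {6, 4}.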

BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}

void BroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
    pos /= output_shape_[i];
  }
}

void BroadcastIterator::GenNextPos() {
  // Calculate the next output coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;
    }
  }
}

void BroadcastIterator::BroadcastShape() {
  int input_dimension_a = SizeToInt(input_shape_a_.size());
  if (input_dimension_a < output_dimension_) {
    (void)input_shape_a_.insert(input_shape_a_.begin(), IntToSize(output_dimension_ - input_dimension_a), 1);
  }

  int input_dimension_b = SizeToInt(input_shape_b_.size());
  if (input_dimension_b < output_dimension_) {
    (void)input_shape_b_.insert(input_shape_b_.begin(), IntToSize(output_dimension_ - input_dimension_b), 1);
  }
}

void BroadcastIterator::InitStrides() {
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }

  // Update strides for broadcast
  // When the axis value is 1, the stride is 0
  (void)std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(),
                       input_strides_a_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  (void)std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(),
                       input_strides_b_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}
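
// Usage sketch (illustrative comment only): element-wise add with broadcasting,
// assuming in_a, in_b and out are caller-owned buffers and that the
// GetInputPosA/GetInputPosB accessors are declared in cpu_kernel.h.
//   BroadcastIterator iter(shape_a, shape_b, shape_out);
//   iter.SetPos(0);
//   for (size_t i = 0; i < out_size; ++i) {
//     out[i] = in_a[iter.GetInputPosA()] + in_b[iter.GetInputPosB()];
//     iter.GenNextPos();
//   }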

TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate strides
  dimension_ = SizeToInt(shape_.size());
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }

  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }

  // Calculate coordinate by pos
  coordinates_.resize(dimension_);
}

void TransposeIterator::SetPos(size_t pos) {
  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % shape_[i];
    pos_ += coordinates_[i] * strides_[i];
    pos /= shape_[i];
  }
}

void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;
    }
  }
}
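
// Usage sketch (illustrative comment only): gathering transposed elements,
// assuming a GetPos() accessor returning the current input offset pos_ is
// declared in cpu_kernel.h.
//   TransposeIterator it(output_shape, axes, input_shape);
//   it.SetPos(0);
//   for (size_t i = 0; i < output_size; ++i) {
//     out[i] = in[it.GetPos()];
//     it.GenNextPos();
//   }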

std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  std::vector<size_t> broadcast_shape;
  std::vector<size_t> broadcast_shape_back;
  for (int i = -SizeToInt(length); i < 0; ++i) {
    if (x[x_len + i] == 1) {
      broadcast_shape_back.push_back(y[y_len + i]);
    } else if (y[y_len + i] == 1) {
      broadcast_shape_back.push_back(x[x_len + i]);
    } else if (x[x_len + i] == y[y_len + i]) {
      broadcast_shape_back.push_back(x[x_len + i]);
    }
  }
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  for (size_t i = 0; i < length; ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}
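
// Worked example (illustrative comment only): x = {3, 1, 5} and y = {4, 1}
// align from the trailing dimensions, giving {3, 4, 5}. The function assumes
// the shapes are broadcast-compatible: a pair of mismatched non-1 dimensions
// is silently skipped here and must be rejected by the caller beforehand.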

void AxisIterator::Init(const std::vector<size_t> &input_shape, size_t axis) {
  outer_size_ = 1;
  for (size_t i = 0; i < axis; i++) {
    outer_size_ *= input_shape[i];
  }

  axis_size_ = input_shape[axis];

  inner_size_ = 1;
  for (size_t i = axis + 1; i < input_shape.size(); ++i) {
    inner_size_ *= input_shape[i];
  }
}
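
// Worked example (illustrative comment only): input_shape {2, 3, 4} with
// axis = 1 gives outer_size_ = 2, axis_size_ = 3 and inner_size_ = 4, so the
// element at (outer, axis, inner) lives at offset (outer * 3 + axis) * 4 + inner.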
}  // namespace kernel
}  // namespace mindspore