/**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "backend/kernel_compiler/cpu/cpu_kernel.h"

#include <algorithm>
#include <utility>
#include <cmath>
#include <functional>
#include <numeric>
#include <thread>
#include <vector>

#include "common/thread_pool.h"
#include "utils/profile.h"

namespace mindspore {
namespace kernel {
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    (void)input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    (void)output_size_list_.emplace_back(tensor_size);
  }
}
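
// Illustrative example (not from the original source): for a float32 input
// with device shape {2, 3, 4}, type_size is 4, so tensor_size accumulates to
// 2 * 3 * 4 * 4 = 96 bytes, which is what gets pushed into input_size_list_.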

void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}

void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  MS_EXCEPTION_IF_NULL(shape);
  auto len = shape->size();
  if (len < 4) {
    for (size_t i = 0; i < 4 - len; ++i) {
      (void)shape->insert(shape->begin(), 1);
    }
  }
}
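
// Illustrative example (not from the original source): a shape of {3, 4} is
// left-padded to {1, 1, 3, 4}; a shape that already has 4+ dims is untouched.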

size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3;
  return offset;
}
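
// Illustrative example (not from the original source): for a 4-D shape of
// {2, 3, 4, 5}, CalcOffset(shape, 1, 2, 3, 4) = 1*60 + 2*20 + 3*5 + 4 = 119,
// i.e. the flat index of element (1, 2, 3, 4) in row-major order.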

size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t result = 1;
  for (int j = 3; j > axis; --j) {
    result *= shape[j];
  }
  return result;
}
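
// Illustrative example (not from the original source; note the loop assumes a
// 4-D shape): for shape {2, 3, 4, 5} and axis = 1, the result is
// shape[3] * shape[2] = 5 * 4 = 20, the element count below that axis.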

void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  MS_EXCEPTION_IF_NULL(element_num);
  (void)element_num->emplace_back(1);
  for (size_t i = shape.size() - 1; i > 0; --i) {
    accumulation *= shape[i];
    (void)element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}
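
// Illustrative example (not from the original source): for shape {2, 3, 4} the
// resulting element_num is {12, 4, 1}; stepping one unit along dim i jumps
// element_num[i] flat elements.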

void CPUKernelUtils::ParallelFor(const CTask &task, size_t count, float block_size) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  if (thread_num == 0) {
    // count == 0 yields thread_num == 0; return early to avoid a divide-by-zero below.
    return;
  }
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    (void)tasks.emplace_back(block);
    start += once_compute_size;
  }
  (void)common::ThreadPool::GetInstance().SyncRun(tasks);
}
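
// Minimal usage sketch (illustrative only; `out`, `a`, `b`, and `n` are
// hypothetical contiguous float buffers and their length, and 128.0f is an
// arbitrary block size, not a documented default):
//   CTask task = [&](size_t start, size_t end) {
//     for (size_t i = start; i < end; ++i) {
//       out[i] = a[i] + b[i];
//     }
//   };
//   CPUKernelUtils::ParallelFor(task, n, 128.0f);
// The [start, end) chunks partition [0, count) across the sync thread pool.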

// Search for the best block_size to get the best thread num: 1 2 4 8 16 32 (capped at the pool maximum, e.g. 23).
// Each block_size runs 5 times to get an average cpu kernel cost time.
// If block_size[i] is slower than block_size[i-2], then we
// assume that block_size[i-2] is the best block_size.
void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info) {
  const size_t MAX_POW = 6;
  const size_t AVG_COUNT = 5;
  MS_EXCEPTION_IF_NULL(parallel_search_info);
  size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
  if (current_pow < MAX_POW) {
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      parallel_search_info->tmp_sum_cost_time = 0;
    }
    float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
    double start_time = GetTime();
    ParallelFor(task, count, block_size);
    double cost_time = GetTime() - start_time;
    parallel_search_info->tmp_sum_cost_time += cost_time;
    parallel_search_info->search_count++;
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
      if (parallel_search_info->min_cost_time > avg_time) {
        parallel_search_info->min_cost_time = avg_time;
        parallel_search_info->best_block_size = block_size;
        parallel_search_info->best_pow = current_pow;
      } else if (current_pow - parallel_search_info->best_pow >= 2) {
        parallel_search_info->search_count = AVG_COUNT * MAX_POW;
      }
    }
  } else {
    ParallelFor(task, count, parallel_search_info->best_block_size);
  }
}

ActorThreadPool *GetActorMgrInnerThreadPool() {
  auto actor_manager = ActorMgr::GetActorMgrRef();
  MS_EXCEPTION_IF_NULL(actor_manager);
  auto thread_pool = actor_manager->GetActorThreadPool();
  // Initialize thread_pool when the environment is Windows or Ascend, since it may not be initialized in
  // graph_scheduler.
  if (thread_pool == nullptr) {
    const size_t kMaxThreadNum = 23;
    size_t max_thread_num = std::thread::hardware_concurrency() - 1;
#if ENABLE_D || ENABLE_GPU
    const size_t kDeviceNum = 8;
    max_thread_num /= kDeviceNum;
#endif
    if (max_thread_num < 1) {
      max_thread_num = 1;
    }
    max_thread_num = max_thread_num < kMaxThreadNum ? max_thread_num : kMaxThreadNum;
    (void)actor_manager->Initialize(true, 0, max_thread_num);
    thread_pool = actor_manager->GetActorThreadPool();
    MS_EXCEPTION_IF_NULL(thread_pool);
  }
  return thread_pool;
}

// Use the mindrt thread pool.
void ParallelLaunch(const CTask &task, size_t count, float block_size, Content content) {
  auto thread_pool = GetActorMgrInnerThreadPool();
  size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
  if (kernel_thread_num == 0) {
    MS_LOG(EXCEPTION) << "Actor inner pool has been initialized, but the kernel thread number is 0!";
  }

  size_t thread_num = count < block_size * kernel_thread_num ? std::ceil(count / block_size) : kernel_thread_num;
  if (thread_num == 0) {
    // count == 0 yields thread_num == 0; return early to avoid a divide-by-zero below.
    return;
  }
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  size_t task_num = count / once_compute_size;
  if (count % once_compute_size != 0) {
    task_num += 1;
  }
  auto func = [&](void *, int task_id, float, float) {
    size_t start = task_id * once_compute_size;
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    task(start, end);
    return common::SUCCESS;
  };
  (void)thread_pool->ParallelLaunch(func, content, task_num);
}

void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
                              ParallelSearchInfo *parallel_search_info) {
  const size_t MAX_POW = 6;
  const size_t AVG_COUNT = 5;
  MS_EXCEPTION_IF_NULL(parallel_search_info);
  size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
  if (current_pow < MAX_POW) {
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      parallel_search_info->tmp_sum_cost_time = 0;
    }
    float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
    double start_time = GetTime();
    ParallelLaunch(task, count, block_size, content);
    double cost_time = GetTime() - start_time;
    parallel_search_info->tmp_sum_cost_time += cost_time;
    parallel_search_info->search_count++;
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
      if (parallel_search_info->min_cost_time > avg_time) {
        parallel_search_info->min_cost_time = avg_time;
        parallel_search_info->best_block_size = block_size;
        parallel_search_info->best_pow = current_pow;
      } else if (current_pow - parallel_search_info->best_pow >= 2) {
        parallel_search_info->search_count = AVG_COUNT * MAX_POW;
      }
    }
  } else {
    ParallelLaunch(task, count, parallel_search_info->best_block_size, content);
  }
}

std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t dim_row = 1;
  size_t dim_col = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (SizeToInt(i) < axis) {
      dim_row *= shape[i];
    } else {
      dim_col *= shape[i];
    }
  }
  // Thanks to copy elision (https://en.cppreference.com/w/cpp/language/copy_elision),
  // returning the vector by value constructs it in place: no extra copy or move.
  return std::vector<size_t>{dim_row, dim_col};
}
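
// Illustrative example (not from the original source): for shape {2, 3, 4, 5}
// and axis = 2, the result is {2*3, 4*5} = {6, 20}; dims before the axis are
// folded into the row count, the rest into the column count.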

BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for the iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}

void BroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
    pos /= output_shape_[i];
  }
}

void BroadcastIterator::GenNextPos() {
  // Calculate the next output coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;
    }
  }
}
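
// Minimal usage sketch (illustrative only; the GetInputPosA/GetInputPosB
// accessors are assumed to exist in the class declaration in cpu_kernel.h):
//   BroadcastIterator iter(shape_a, shape_b, out_shape);
//   iter.SetPos(start);
//   for (size_t i = start; i < end; ++i) {
//     out[i] = a[iter.GetInputPosA()] + b[iter.GetInputPosB()];
//     iter.GenNextPos();
//   }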

void BroadcastIterator::BroadcastShape() {
  int input_dimension_a = SizeToInt(input_shape_a_.size());
  if (input_dimension_a < output_dimension_) {
    (void)input_shape_a_.insert(input_shape_a_.begin(), IntToSize(output_dimension_ - input_dimension_a), 1);
  }

  int input_dimension_b = SizeToInt(input_shape_b_.size());
  if (input_dimension_b < output_dimension_) {
    (void)input_shape_b_.insert(input_shape_b_.begin(), IntToSize(output_dimension_ - input_dimension_b), 1);
  }
}

void BroadcastIterator::InitStrides() {
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }

  // Update strides for broadcast:
  // where an axis has length 1, its stride is set to 0.
  (void)std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(),
                       input_strides_a_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  (void)std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(),
                       input_strides_b_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}

TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate strides
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }

  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }

  // Calculate coordinate by pos
  coordinates_.resize(dimension_);
}

void TransposeIterator::SetPos(size_t pos) {
  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % shape_[i];
    pos_ += coordinates_[i] * strides_[i];
    pos /= shape_[i];
  }
}

void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;
    }
  }
}
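
// Minimal usage sketch (illustrative only; the GetPos accessor is assumed to
// exist in the class declaration in cpu_kernel.h):
//   TransposeIterator iter(out_shape, axes, in_shape);
//   iter.SetPos(start);
//   for (size_t i = start; i < end; ++i) {
//     out[i] = in[iter.GetPos()];
//     iter.GenNextPos();
//   }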

std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  std::vector<size_t> broadcast_shape;
  std::vector<size_t> broadcast_shape_back;
  // Convert through SizeToInt before negating: unary minus on the unsigned length would wrap around.
  for (int i = -SizeToInt(length); i < 0; ++i) {
    if (x[x_len + i] == 1) {
      broadcast_shape_back.push_back(y[y_len + i]);
    } else if (y[y_len + i] == 1) {
      broadcast_shape_back.push_back(x[x_len + i]);
    } else if (x[x_len + i] == y[y_len + i]) {
      broadcast_shape_back.push_back(x[x_len + i]);
    }
  }
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  for (size_t i = 0; i < length; ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}
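
// Illustrative example (not from the original source): x = {3, 1, 5} and
// y = {4, 3, 4, 5} broadcast to {4, 3, 4, 5}; trailing dims are matched
// right-to-left and the longer shape contributes the leading dims.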

void AxisIterator::Init(const std::vector<size_t> &input_shape, size_t axis) {
  outer_size_ = 1;
  for (size_t i = 0; i < axis; i++) {
    outer_size_ *= input_shape[i];
  }

  axis_size_ = input_shape[axis];

  inner_size_ = 1;
  for (size_t i = axis + 1; i < input_shape.size(); ++i) {
    inner_size_ *= input_shape[i];
  }
}
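
// Illustrative example (not from the original source): for input shape
// {2, 3, 4} and axis = 1, outer_size_ = 2, axis_size_ = 3, inner_size_ = 4,
// so the tensor is viewed as outer x axis x inner slices.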
}  // namespace kernel
}  // namespace mindspore