• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "plugin/device/cpu/kernel/cpu_kernel.h"
18 
19 #include <algorithm>
20 #include <utility>
21 #include <cmath>
22 #include <map>
23 #include <set>
24 #include <numeric>
25 #include "kernel/oplib/oplib.h"
26 #include "utils/profile.h"
27 #include "runtime/graph_scheduler/actor/actor_common.h"
28 #include "kernel/common_utils.h"
29 
30 namespace mindspore {
31 namespace kernel {
GetAllSupportedList(const std::string & kernel_name)32 std::vector<KernelAttr> NativeCpuKernelMod::GetAllSupportedList(const std::string &kernel_name) {
33   auto iter = support_map_.find(kernel_name);
34   if (iter == support_map_.end()) {
35     std::vector<KernelAttr> kernel_attrs;
36     auto kernel_support = GetOpSupport();
37     (void)kernel_attrs.insert(kernel_attrs.end(), kernel_support.begin(), kernel_support.end());
38     if (!kernel_attrs.empty() && kernel_attrs[0].GetSkipCheck()) {
39       (void)support_map_.emplace(kernel_name, kernel_attrs);
40       return support_map_[kernel_name];
41     }
42     if (kernel_attrs.empty()) {
43       auto oplib_support = GetSupportFromOpLib(kernel_name);
44       (void)kernel_attrs.insert(kernel_attrs.end(), oplib_support.begin(), oplib_support.end());
45     }
46     (void)support_map_.emplace(kernel_name, kernel_attrs);
47   }
48 
49   return support_map_[kernel_name];
50 }
51 
GetSupportFromOpLib(const std::string & kernel_name) const52 std::vector<KernelAttr> NativeCpuKernelMod::GetSupportFromOpLib(const std::string &kernel_name) const {
53   static std::set<std::string> same_op_name = {"Pack",   "Stack", "Split",        "Transpose",
54                                                "Unpack", "AddN",  "ConcatOffset", "DynamicStitch"};
55   std::vector<KernelAttr> support_kernel_attrs;
56   auto op_info = mindspore::kernel::OpLib::FindOp(kernel_name, kernel::OpImplyType::kImplyCPU);
57   if (op_info == nullptr) {
58     MS_LOG(WARNING) << "Not find op[" << kernel_name << "] in cpu. For more details, "
59                     << "please refer to the list of supported cpu operations at https://www.mindspore.cn.";
60     return support_kernel_attrs;
61   }
62 
63   auto inputs_ptr = op_info->inputs_ptr();
64   auto outputs_ptr = op_info->outputs_ptr();
65   if (outputs_ptr.empty()) {
66     MS_LOG(WARNING) << "The output dimension of operator '" << kernel_name << "' can not be zero.";
67     return support_kernel_attrs;
68   }
69 
70   auto support_size = outputs_ptr[0]->dtypes().size();
71   for (size_t i = 0; i < support_size; i++) {
72     KernelAttr kernel_attr;
73     for (size_t j = 0; j < inputs_ptr.size(); j++) {
74       auto input_dtypes = inputs_ptr[j]->dtypes();
75       auto input_formats = inputs_ptr[j]->formats();
76       (void)kernel_attr.AddInputAttr(kernel::DtypeToTypeId(input_dtypes[i]), input_formats[i]);
77     }
78     for (size_t j = 0; j < outputs_ptr.size(); j++) {
79       auto output_dtypes = outputs_ptr[j]->dtypes();
80       auto output_formats = outputs_ptr[j]->formats();
81       (void)kernel_attr.AddOutputAttr(kernel::DtypeToTypeId(output_dtypes[i]), output_formats[i]);
82     }
83     if (same_op_name.count(op_info->op_name()) != 0) {
84       (void)kernel_attr.AddAllSameAttr(true);
85     }
86     support_kernel_attrs.push_back(kernel_attr);
87   }
88 
89   return support_kernel_attrs;
90 }
91 
ExpandDimsTo4(ShapeVector * shape)92 void CPUKernelUtils::ExpandDimsTo4(ShapeVector *shape) {
93   MS_EXCEPTION_IF_NULL(shape);
94   auto len = shape->size();
95   const size_t expect_dims = 4;
96   if (len < expect_dims) {
97     for (size_t i = 0; i < expect_dims - len; ++i) {
98       (void)shape->insert(shape->begin(), 1);
99     }
100   }
101 }
102 
CalcOffset(const ShapeVector & shape,size_t dim0,size_t dim1,size_t dim2,size_t dim3)103 size_t CPUKernelUtils::CalcOffset(const ShapeVector &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3) {
104   size_t offset = dim0 * LongToSize(shape[1]) * LongToSize(shape[2]) * LongToSize(shape[3]) +
105                   dim1 * LongToSize(shape[2]) * LongToSize(shape[3]) + dim2 * LongToSize(shape[3]) + dim3;
106   return offset;
107 }
108 
GetElementNumOnAxis(const ShapeVector & shape,int axis)109 size_t CPUKernelUtils::GetElementNumOnAxis(const ShapeVector &shape, int axis) {
110   if (axis < 0) {
111     axis = axis + SizeToInt(shape.size());
112   }
113   int64_t result = 1;
114   for (int j = 3; j > axis; --j) {
115     result *= shape[j];
116   }
117   return LongToSize(result);
118 }
119 
GetElementNumEveryDim(const ShapeVector & shape,std::vector<size_t> * element_num)120 void CPUKernelUtils::GetElementNumEveryDim(const ShapeVector &shape, std::vector<size_t> *element_num) {
121   size_t accumulation = 1;
122   MS_EXCEPTION_IF_NULL(element_num);
123   (void)element_num->emplace_back(1);
124   for (size_t i = shape.size() - 1; i > 0; --i) {
125     accumulation *= LongToSizeClipNeg(shape[i]);
126     (void)element_num->emplace_back(accumulation);
127   }
128   std::reverse(element_num->begin(), element_num->end());
129 }
130 
ParallelFor(const CTask & task,size_t count,float block_size)131 void CPUKernelUtils::ParallelFor(const CTask &task, size_t count, float block_size) {
132   auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
133   size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
134   std::vector<common::Task> tasks;
135   size_t start = 0;
136   size_t once_compute_size = (count + thread_num - 1) / thread_num;
137   while (start < count) {
138     size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
139     auto block = [&, start, end]() {
140       task(start, end);
141       return common::SUCCESS;
142     };
143     (void)tasks.emplace_back(block);
144     start += once_compute_size;
145   }
146   ParallelLaunch(tasks);
147 }
148 
149 // Search for best block_size to get best thread num : 1 2 4 8 16 23(32)
150 // Each block_size runs 5 times to get an average cpu kernel cost time.
151 // If the speed of block_size[i] is slower than block_size[i-2], than we
152 // assume that  block_size[i-2] is the best block_size.
ParallelForAutoSearch(const CTask & task,size_t count,ParallelSearchInfo * parallel_search_info)153 void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info) {
154   const size_t MAX_POW = 6;
155   const size_t AVG_COUNT = 5;
156   MS_EXCEPTION_IF_NULL(parallel_search_info);
157   size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
158   if (current_pow < MAX_POW) {
159     if (parallel_search_info->search_count % AVG_COUNT == 0) {
160       parallel_search_info->tmp_sum_cost_time = 0;
161     }
162     float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
163     double start_time = GetTime();
164     ParallelFor(task, count, block_size);
165     double cost_time = GetTime() - start_time;
166     parallel_search_info->tmp_sum_cost_time += cost_time;
167     parallel_search_info->search_count++;
168     if (parallel_search_info->search_count % AVG_COUNT == 0) {
169       double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
170       if (parallel_search_info->min_cost_time > avg_time) {
171         parallel_search_info->min_cost_time = avg_time;
172         parallel_search_info->best_block_size = block_size;
173         parallel_search_info->best_pow = current_pow;
174       } else if (current_pow - parallel_search_info->best_pow >= 2) {
175         parallel_search_info->search_count = AVG_COUNT * MAX_POW;
176       }
177     }
178   } else {
179     ParallelFor(task, count, parallel_search_info->best_block_size);
180   }
181 }
182 
GetActorMgrInnerThreadPool()183 ActorThreadPool *GetActorMgrInnerThreadPool() {
184   auto actor_manager = ActorMgr::GetActorMgrRef();
185   MS_EXCEPTION_IF_NULL(actor_manager);
186   auto thread_pool = actor_manager->GetActorThreadPool();
187   // Init thread_pool if env is windows or ascend, in case that it won't be init in graph_scheduler.
188   if (thread_pool == nullptr) {
189     size_t actor_thread_num = 0;
190     size_t actor_and_kernel_thread_num = 0;
191     runtime::ComputeThreadNums(&actor_thread_num, &actor_and_kernel_thread_num);
192     size_t actor_queue_size = 81920;
193     (void)actor_manager->Initialize(true, actor_thread_num, actor_and_kernel_thread_num, actor_queue_size);
194     thread_pool = actor_manager->GetActorThreadPool();
195     MS_EXCEPTION_IF_NULL(thread_pool);
196   }
197   thread_pool->SetKernelThreadMaxSpinCount(kDefaultKernelSpinCount);
198   return thread_pool;
199 }
200 
201 // Use threadpool of mindrt
ParallelLaunch(const CTask & task,size_t count,float block_size,Content content,ThreadPool * pool)202 void ParallelLaunch(const CTask &task, size_t count, float block_size, Content content, ThreadPool *pool) {
203   if (count == 0) {
204     return;
205   }
206   auto thread_pool = pool == nullptr ? GetActorMgrInnerThreadPool() : pool;
207   size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
208   if (kernel_thread_num == 0) {
209     MS_LOG(EXCEPTION) << "Actor inner pool has been init, but kernel thread is 0!";
210   }
211 
212   size_t thread_num = count < block_size * kernel_thread_num ? std::ceil(count / block_size) : kernel_thread_num;
213   size_t once_compute_size = (count + thread_num - 1) / thread_num;
214   size_t task_num = count / once_compute_size;
215   if (count % once_compute_size != 0) {
216     task_num += 1;
217   }
218   auto func = [&](void *, int task_id, float, float) {
219     size_t start = IntToSize(task_id) * once_compute_size;
220     size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
221     task(start, end);
222     return common::SUCCESS;
223   };
224   (void)thread_pool->ParallelLaunch(func, content, task_num);
225 }
226 
ParallelLaunch(const std::vector<common::Task> & tasks,Content content,ThreadPool * pool)227 void ParallelLaunch(const std::vector<common::Task> &tasks, Content content, ThreadPool *pool) {
228   size_t count = tasks.size();
229   if (count == 0) {
230     return;
231   }
232   auto thread_pool = pool == nullptr ? GetActorMgrInnerThreadPool() : pool;
233   size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
234   if (kernel_thread_num == 0) {
235     MS_LOG(EXCEPTION) << "Actor inner pool has been init, but kernel thread is 0!";
236   }
237 
238   size_t thread_num = count < kernel_thread_num ? count : kernel_thread_num;
239   size_t once_compute_size = (count + thread_num - 1) / thread_num;
240   size_t task_num = count / once_compute_size;
241   if (count % once_compute_size != 0) {
242     task_num += 1;
243   }
244   auto func = [&](void *, int task_id, float, float) {
245     size_t start = IntToSize(task_id) * once_compute_size;
246     size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
247     for (size_t i = start; i < end; ++i) {
248       (void)tasks[i]();
249     }
250     return common::SUCCESS;
251   };
252   (void)thread_pool->ParallelLaunch(func, content, task_num);
253 }
254 
ParallelLaunchAutoSearch(const CTask & task,size_t count,Content content,ParallelSearchInfo * parallel_search_info,ThreadPool * pool)255 void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
256                               ParallelSearchInfo *parallel_search_info, ThreadPool *pool) {
257   if (!parallel_search_info->kernel_thread_num_set) {
258     auto thread_pool = pool == nullptr ? GetActorMgrInnerThreadPool() : pool;
259     size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
260     if (kernel_thread_num == 0) {
261       MS_LOG(EXCEPTION) << "Actor inner pool has been init, but kernel thread is 0!";
262     }
263     size_t max_pow_current = parallel_search_info->max_pow - 1;
264     while (std::pow(2.0f, max_pow_current) <= static_cast<float>(kernel_thread_num)) {
265       max_pow_current++;
266     }
267     parallel_search_info->max_pow = max_pow_current + 1;
268     parallel_search_info->kernel_thread_num_set = true;
269   }
270   const size_t AVG_COUNT = 5;
271   size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
272   if (current_pow < parallel_search_info->max_pow) {
273     if (parallel_search_info->search_count % AVG_COUNT == 0) {
274       parallel_search_info->tmp_sum_cost_time = 0;
275     }
276     float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
277     double start_time = GetTime();
278     ParallelLaunch(task, count, block_size, content, pool);
279     double cost_time = GetTime() - start_time;
280     parallel_search_info->tmp_sum_cost_time += cost_time;
281     parallel_search_info->search_count++;
282     if (parallel_search_info->search_count % AVG_COUNT == 0) {
283       double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
284       if (parallel_search_info->min_cost_time > avg_time) {
285         parallel_search_info->min_cost_time = avg_time;
286         parallel_search_info->best_block_size = block_size;
287         parallel_search_info->best_pow = current_pow;
288       } else if (current_pow - parallel_search_info->best_pow >= 2) {
289         parallel_search_info->search_count = AVG_COUNT * parallel_search_info->max_pow;
290       }
291     }
292   } else {
293     ParallelLaunch(task, count, parallel_search_info->best_block_size, content, pool);
294   }
295 }
296 
FlatShapeByAxis(const ShapeVector & shape,int axis)297 ShapeVector CPUKernelUtils::FlatShapeByAxis(const ShapeVector &shape, int axis) {
298   if (axis < 0) {
299     axis = axis + SizeToInt(shape.size());
300   }
301   int64_t dim_row = 1;
302   int64_t dim_col = 1;
303   for (size_t i = 0; i < shape.size(); ++i) {
304     if (SizeToInt(i) < axis) {
305       dim_row *= shape[i];
306     } else {
307       dim_col *= shape[i];
308     }
309   }
310   // referred to Copy elision https://en.cppreference.com/w/cpp/language/copy_elision
311   // returning a vector won't cause extra vector constructed or moved
312   return ShapeVector{dim_row, dim_col};
313 }
314 
BroadcastIterator(ShapeVector input_shape_a,ShapeVector input_shape_b,ShapeVector output_shape)315 BroadcastIterator::BroadcastIterator(ShapeVector input_shape_a, ShapeVector input_shape_b, ShapeVector output_shape)
316     : input_shape_a_(std::move(input_shape_a)),
317       input_shape_b_(std::move(input_shape_b)),
318       output_shape_(std::move(output_shape)) {
319   output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
320   BroadcastShape();
321   // Allocate strides memory
322   input_strides_a_.resize(output_dimension_);
323   input_strides_b_.resize(output_dimension_);
324   input_back_strides_a_.resize(output_dimension_);
325   input_back_strides_b_.resize(output_dimension_);
326   coordinates_.resize(output_dimension_);
327   InitStrides();
328 }
329 
SetPos(size_t pos)330 void BroadcastIterator::SetPos(size_t pos) {
331   for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
332     coordinates_[i] = pos % output_shape_[i];
333     input_pos_[0] += coordinates_[i] * input_strides_a_[i];
334     input_pos_[1] += coordinates_[i] * input_strides_b_[i];
335     pos /= output_shape_[i];
336   }
337 }
338 
GenNextPos()339 void BroadcastIterator::GenNextPos() {
340   // Calculate output next coordinate
341   for (int i = output_dimension_ - 1; i >= 0; --i) {
342     if (coordinates_[i] + 1 == output_shape_[i]) {
343       coordinates_[i] = 0;
344       input_pos_[0] -= input_back_strides_a_[i];
345       input_pos_[1] -= input_back_strides_b_[i];
346     } else {
347       ++coordinates_[i];
348       input_pos_[0] += input_strides_a_[i];
349       input_pos_[1] += input_strides_b_[i];
350       break;
351     }
352   }
353 }
354 
BroadcastShape()355 void BroadcastIterator::BroadcastShape() {
356   int input_dimension_a = input_shape_a_.size();
357   if (input_dimension_a < output_dimension_) {
358     (void)input_shape_a_.insert(input_shape_a_.begin(), IntToSize(output_dimension_ - input_dimension_a), 1);
359   }
360 
361   int input_dimension_b = input_shape_b_.size();
362   if (input_dimension_b < output_dimension_) {
363     (void)input_shape_b_.insert(input_shape_b_.begin(), IntToSize(output_dimension_ - input_dimension_b), 1);
364   }
365 }
366 
InitStrides()367 void BroadcastIterator::InitStrides() {
368   if (output_dimension_ <= 0) {
369     return;
370   }
371   input_strides_a_[output_dimension_ - 1] = 1;
372   input_strides_b_[output_dimension_ - 1] = 1;
373   for (int i = output_dimension_ - 2; i >= 0; --i) {
374     input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
375     input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
376     input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
377     input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
378   }
379 
380   // Update strides for broadcast
381   // While the axis value is 1, the stride is 0
382   (void)std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(),
383                        input_strides_a_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
384   (void)std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(),
385                        input_strides_b_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
386 }
387 
MultipleBroadcastIterator(std::vector<shape_info> multi_inputs,shape_info output_shape)388 MultipleBroadcastIterator::MultipleBroadcastIterator(std::vector<shape_info> multi_inputs, shape_info output_shape)
389     : multi_inputs_(std::move(multi_inputs)), output_shape_(std::move(output_shape)) {
390   output_dimension_ = SizeToInt(output_shape_.size());
391   // Assign dimension to int for iterator
392   BroadcastShape();
393   input_pos_.resize(multi_inputs_.size());
394   // Allocate strides memory
395   multi_inputs_strides_.resize(multi_inputs_.size(), std::vector<int64_t>(output_dimension_, 0));
396   multi_inputs_back_strides_.resize(multi_inputs_.size(), std::vector<int64_t>(output_dimension_, 0));
397   coordinates_.resize(output_dimension_);
398   InitStrides();
399 }
400 
SetPos(size_t pos)401 void MultipleBroadcastIterator::SetPos(size_t pos) {
402   for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
403     coordinates_[i] = pos % output_shape_[i];
404     for (size_t j = 0; j < input_pos_.size(); ++j) {
405       input_pos_[j] += coordinates_[i] * multi_inputs_strides_[j][i];
406     }
407     pos /= output_shape_[i];
408   }
409 }
410 
GenNextPos()411 void MultipleBroadcastIterator::GenNextPos() {
412   // Calculate output next coordinate
413   for (int i = output_dimension_ - 1; i >= 0; --i) {
414     if (coordinates_[i] + 1 == output_shape_[i]) {
415       coordinates_[i] = 0;
416       for (size_t j = 0; j < input_pos_.size(); ++j) {
417         input_pos_[j] -= multi_inputs_back_strides_[j][i];
418       }
419     } else {
420       ++coordinates_[i];
421       for (size_t j = 0; j < input_pos_.size(); ++j) {
422         input_pos_[j] += multi_inputs_strides_[j][i];
423       }
424       break;
425     }
426   }
427 }
428 
BroadcastShape()429 void MultipleBroadcastIterator::BroadcastShape() {
430   for (auto &multi_input : multi_inputs_) {
431     int input_dimension = SizeToInt(multi_input.size());
432     if (input_dimension < output_dimension_) {
433       (void)multi_input.insert(multi_input.begin(), IntToSize(output_dimension_ - input_dimension), 1);
434     }
435   }
436 }
437 
InitStrides()438 void MultipleBroadcastIterator::InitStrides() {
439   for (size_t i = 0; i < multi_inputs_.size(); ++i) {
440     multi_inputs_strides_[i][output_dimension_ - 1] = 1;
441     for (int j = output_dimension_ - 2; j >= 0; --j) {
442       multi_inputs_strides_[i][j] = multi_inputs_[i][j + 1] * multi_inputs_strides_[i][j + 1];
443       multi_inputs_back_strides_[i][j + 1] = (multi_inputs_[i][j + 1] - 1) * multi_inputs_strides_[i][j + 1];
444     }
445     // Update strides for broadcast
446     // While the axis value is 1, the stride is 0
447     (void)std::transform(multi_inputs_strides_[i].begin(), multi_inputs_strides_[i].end(), multi_inputs_[i].begin(),
448                          multi_inputs_strides_[i].begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
449   }
450 }
451 
TransposeIterator(ShapeVector output_shape,std::vector<size_t> axes,const ShapeVector & input_shape)452 TransposeIterator::TransposeIterator(ShapeVector output_shape, std::vector<size_t> axes, const ShapeVector &input_shape)
453     : shape_(std::move(output_shape)), axes_(std::move(axes)) {
454   // Calculate strides
455   dimension_ = shape_.size();
456   std::vector<uint32_t> strides(dimension_, 1);
457   for (int i = dimension_ - 2; i >= 0; --i) {
458     strides[i] = input_shape[i + 1] * strides[i + 1];
459   }
460 
461   // Swap shape ans strides and calculate back strides
462   strides_.resize(dimension_);
463   back_strides_.resize(dimension_);
464   for (int i = dimension_ - 1; i >= 0; --i) {
465     strides_[i] = strides[axes_[i]];
466     back_strides_[i] = (shape_[i] - 1) * strides_[i];
467   }
468 
469   // Calculate coordinate by pos
470   coordinates_.resize(dimension_);
471 }
472 
SetPos(size_t pos)473 void TransposeIterator::SetPos(size_t pos) {
474   for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
475     coordinates_[i] = pos % shape_[i];
476     pos_ += coordinates_[i] * strides_[i];
477     pos /= shape_[i];
478   }
479 }
480 
GenNextPos()481 void TransposeIterator::GenNextPos() {
482   for (int i = dimension_ - 1; i >= 0; --i) {
483     if (coordinates_[i] + 1 == shape_[i]) {
484       coordinates_[i] = 0;
485       pos_ -= back_strides_[i];
486     } else {
487       coordinates_[i]++;
488       pos_ += strides_[i];
489       break;
490     }
491   }
492 }
493 
GetBroadcastShape(const ShapeVector & x,const ShapeVector & y)494 ShapeVector CPUKernelUtils::GetBroadcastShape(const ShapeVector &x, const ShapeVector &y) {
495   size_t x_len = x.size();
496   size_t y_len = y.size();
497   size_t length = x_len < y_len ? x_len : y_len;
498   ShapeVector broadcast_shape;
499   ShapeVector broadcast_shape_back;
500   for (int i = -SizeToInt(length); i < 0; ++i) {
501     if (x[x_len + i] == 1) {
502       broadcast_shape_back.push_back(y[y_len + i]);
503     } else if (y[y_len + i] == 1) {
504       broadcast_shape_back.push_back(x[x_len + i]);
505     } else if (x[x_len + i] == y[y_len + i]) {
506       broadcast_shape_back.push_back(x[x_len + i]);
507     }
508   }
509   if (length == x_len) {
510     for (size_t i = 0; i < y_len - length; ++i) {
511       broadcast_shape.push_back(y[i]);
512     }
513   } else {
514     for (size_t i = 0; i < x_len - length; ++i) {
515       broadcast_shape.push_back(x[i]);
516     }
517   }
518   for (size_t i = 0; i < length; ++i) {
519     broadcast_shape.push_back(broadcast_shape_back[i]);
520   }
521   return broadcast_shape;
522 }
523 
Init(const ShapeVector & input_shape,size_t axis)524 void AxisIterator::Init(const ShapeVector &input_shape, size_t axis) {
525   outer_size_ = 1;
526   for (size_t i = 0; i < axis; i++) {
527     outer_size_ *= LongToSize(input_shape[i]);
528   }
529 
530   axis_size_ = LongToSize(input_shape[axis]);
531 
532   inner_size_ = 1;
533   for (size_t i = axis + 1; i < input_shape.size(); ++i) {
534     inner_size_ *= LongToSize(input_shape[i]);
535   }
536 }
537 
Sign(float x)538 int Sign(float x) {
539   if (x > 0) {
540     return 1;
541   }
542   if (x < 0) {
543     return -1;
544   }
545   return 0;
546 }
GetBroadCastIndex(const std::vector<size_t> & unaligned_input_shape,const std::vector<size_t> & output_shape,std::vector<size_t> * index_list)547 void GetBroadCastIndex(const std::vector<size_t> &unaligned_input_shape, const std::vector<size_t> &output_shape,
548                        std::vector<size_t> *index_list) {
549   // Given unaligned input shape and output shape, this function returns the mapping
550   // from indices of output (logical) to correspondingly real input indices (physical).
551   // The return will write to index_list, whose size is equal to total elements of output.
552   constexpr size_t kIsCloseMaxDim = 10;
553   size_t logical_shape[kIsCloseMaxDim];
554   size_t physical_shape[kIsCloseMaxDim];
555   size_t size = 0, output_size = 1;
556   // Align input shape to output shape by filling one into the outermost dimension.
557   std::vector<size_t> input_shape(output_shape.size());
558   for (size_t i = 0, j = output_shape.size() - unaligned_input_shape.size(); i < output_shape.size(); i++) {
559     input_shape[i] = i < j ? 1 : unaligned_input_shape[i - j];
560   }
561   // Get logical shape and physical shape of input. Moreover, we will merge the dimensions with same
562   // (logical or physical) property.
563   for (size_t i = output_shape.size(); i > 0;) {
564     size_t stride = 1;
565     bool change = false, is_valid = false;
566     while (i > 0 && input_shape[i - 1] == output_shape[i - 1]) {
567       stride *= output_shape[i - 1];
568       change = is_valid = true;
569       --i;
570     }
571     if (change) {
572       output_size *= stride;
573       logical_shape[size] = physical_shape[size] = stride;
574       size++;
575     }
576     change = false;
577     stride = 1;
578     while (i > 0 && input_shape[i - 1] == 1) {
579       stride *= output_shape[i - 1];
580       change = is_valid = true;
581       --i;
582     }
583     if (change) {
584       output_size *= stride;
585       logical_shape[size] = 1;
586       physical_shape[size] = stride;
587       size++;
588     }
589     if (!is_valid) {
590       MS_LOG(EXCEPTION) << "Both shape are not able to broadcast, input shape is " << unaligned_input_shape
591                         << " and output shape is " << output_shape;
592     }
593   }
594 
595   // while input address is null, whose shape like (4, 0, 5), then the output size is zero
596   if (output_size < 1) {
597     return;
598   }
599 
600   // Get the flatten input indices according to "logical_shape" and "physical_shape".
601   size_t offset = 1;
602   size_t stride = 1;
603   index_list->resize(output_size);
604   (*index_list)[0] = 0;  // First element is set to 0.
605   for (size_t i = 0; i < size; ++i) {
606     size_t increment = (logical_shape[i] == physical_shape[i] ? stride : 0);
607     for (size_t j = 0; j + offset < physical_shape[i] * offset; ++j) {
608       (*index_list)[offset + j] = (*index_list)[j] + increment;
609     }
610     offset *= physical_shape[i];
611     stride *= logical_shape[i];
612   }
613 }
614 }  // namespace kernel
615 }  // namespace mindspore
616