/**
 * Copyright 2019-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/cpu/kernel/cpu_kernel.h"

#include <algorithm>
#include <utility>
#include <cmath>
#include <map>
#include <set>
#include <numeric>
#include "kernel/oplib/oplib.h"
#include "utils/profile.h"
#include "runtime/graph_scheduler/actor/actor_common.h"
#include "kernel/common_utils.h"

namespace mindspore {
namespace kernel {
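// Returns every KernelAttr supported by the kernel named `kernel_name`, caching the result in
// support_map_. Attrs come from the kernel's own GetOpSupport(); only when that list is empty
// do we fall back to the operator library (see GetSupportFromOpLib below).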
std::vector<KernelAttr> NativeCpuKernelMod::GetAllSupportedList(const std::string &kernel_name) {
  auto iter = support_map_.find(kernel_name);
  if (iter == support_map_.end()) {
    std::vector<KernelAttr> kernel_attrs;
    auto kernel_support = GetOpSupport();
    (void)kernel_attrs.insert(kernel_attrs.end(), kernel_support.begin(), kernel_support.end());
    if (!kernel_attrs.empty() && kernel_attrs[0].GetSkipCheck()) {
      (void)support_map_.emplace(kernel_name, kernel_attrs);
      return support_map_[kernel_name];
    }
    if (kernel_attrs.empty()) {
      auto oplib_support = GetSupportFromOpLib(kernel_name);
      (void)kernel_attrs.insert(kernel_attrs.end(), oplib_support.begin(), oplib_support.end());
    }
    (void)support_map_.emplace(kernel_name, kernel_attrs);
  }

  return support_map_[kernel_name];
}

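// Builds the supported KernelAttr list for `kernel_name` from its registered OpLib entry: one
// KernelAttr per supported dtype combination. Variadic operators listed in `same_op_name`
// (e.g. AddN, Stack) are additionally marked with AddAllSameAttr(true).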
std::vector<KernelAttr> NativeCpuKernelMod::GetSupportFromOpLib(const std::string &kernel_name) const {
  static std::set<std::string> same_op_name = {"Pack", "Stack", "Split", "Transpose",
                                               "Unpack", "AddN", "ConcatOffset", "DynamicStitch"};
  std::vector<KernelAttr> support_kernel_attrs;
  auto op_info = mindspore::kernel::OpLib::FindOp(kernel_name, kernel::OpImplyType::kImplyCPU);
  if (op_info == nullptr) {
    MS_LOG(WARNING) << "Cannot find op[" << kernel_name << "] on cpu. For more details, "
                    << "please refer to the list of supported cpu operations at https://www.mindspore.cn.";
    return support_kernel_attrs;
  }

  auto inputs_ptr = op_info->inputs_ptr();
  auto outputs_ptr = op_info->outputs_ptr();
  if (outputs_ptr.empty()) {
    MS_LOG(WARNING) << "The output number of operator '" << kernel_name << "' cannot be zero.";
    return support_kernel_attrs;
  }

  auto support_size = outputs_ptr[0]->dtypes().size();
  for (size_t i = 0; i < support_size; i++) {
    KernelAttr kernel_attr;
    for (size_t j = 0; j < inputs_ptr.size(); j++) {
      auto input_dtypes = inputs_ptr[j]->dtypes();
      auto input_formats = inputs_ptr[j]->formats();
      (void)kernel_attr.AddInputAttr(kernel::DtypeToTypeId(input_dtypes[i]), input_formats[i]);
    }
    for (size_t j = 0; j < outputs_ptr.size(); j++) {
      auto output_dtypes = outputs_ptr[j]->dtypes();
      auto output_formats = outputs_ptr[j]->formats();
      (void)kernel_attr.AddOutputAttr(kernel::DtypeToTypeId(output_dtypes[i]), output_formats[i]);
    }
    if (same_op_name.count(op_info->op_name()) != 0) {
      (void)kernel_attr.AddAllSameAttr(true);
    }
    support_kernel_attrs.push_back(kernel_attr);
  }

  return support_kernel_attrs;
}

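// Left-pads `shape` with ones until it has 4 dimensions, e.g. (3, 5) -> (1, 1, 3, 5).
// Shapes with 4 or more dimensions are left unchanged.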
void CPUKernelUtils::ExpandDimsTo4(ShapeVector *shape) {
  MS_EXCEPTION_IF_NULL(shape);
  auto len = shape->size();
  const size_t expect_dims = 4;
  if (len < expect_dims) {
    for (size_t i = 0; i < expect_dims - len; ++i) {
      (void)shape->insert(shape->begin(), 1);
    }
  }
}

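// Computes the flat row-major offset of element (dim0, dim1, dim2, dim3) in a 4-D tensor,
// i.e. ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3. The shape is expected
// to have been expanded to 4 dimensions beforehand (see ExpandDimsTo4).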
size_t CPUKernelUtils::CalcOffset(const ShapeVector &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3) {
  size_t offset = dim0 * LongToSize(shape[1]) * LongToSize(shape[2]) * LongToSize(shape[3]) +
                  dim1 * LongToSize(shape[2]) * LongToSize(shape[3]) + dim2 * LongToSize(shape[3]) + dim3;
  return offset;
}

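// Returns the number of elements covered by a single step along `axis`, i.e. the product of
// the dimensions behind it. The hard-coded start index (j = 3) assumes a 4-D shape.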
size_t CPUKernelUtils::GetElementNumOnAxis(const ShapeVector &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  int64_t result = 1;
  for (int j = 3; j > axis; --j) {
    result *= shape[j];
  }
  return LongToSize(result);
}

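// Fills `element_num` with the row-major stride of each dimension, i.e. the element count of
// one slice along that dimension. For shape (2, 3, 4) the result is {12, 4, 1}.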
void CPUKernelUtils::GetElementNumEveryDim(const ShapeVector &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  MS_EXCEPTION_IF_NULL(element_num);
  (void)element_num->emplace_back(1);
  for (size_t i = shape.size() - 1; i > 0; --i) {
    accumulation *= LongToSizeClipNeg(shape[i]);
    (void)element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}

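// Splits [0, count) into roughly equal chunks and runs `task` on each chunk in the common
// thread pool. The thread number is reduced whenever `count` is too small for every thread to
// receive at least `block_size` elements.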
void CPUKernelUtils::ParallelFor(const CTask &task, size_t count, float block_size) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    (void)tasks.emplace_back(block);
    start += once_compute_size;
  }
  ParallelLaunch(tasks);
}

// Search for the best block_size to get the best thread num: 1 2 4 8 16 32 (capped by the
// maximum thread number). Each block_size runs 5 times to get an average cpu kernel cost time.
// If the speed of block_size[i] is slower than block_size[i-2], then we
// assume that block_size[i-2] is the best block_size.
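// For example, with count = 1024 the candidate block sizes are 1024, 512, 256, 128, 64 and 32,
// corresponding to 1, 2, 4, 8, 16 and 32 threads respectively.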
void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info) {
  const size_t MAX_POW = 6;
  const size_t AVG_COUNT = 5;
  MS_EXCEPTION_IF_NULL(parallel_search_info);
  size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
  if (current_pow < MAX_POW) {
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      parallel_search_info->tmp_sum_cost_time = 0;
    }
    float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
    double start_time = GetTime();
    ParallelFor(task, count, block_size);
    double cost_time = GetTime() - start_time;
    parallel_search_info->tmp_sum_cost_time += cost_time;
    parallel_search_info->search_count++;
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
      if (parallel_search_info->min_cost_time > avg_time) {
        parallel_search_info->min_cost_time = avg_time;
        parallel_search_info->best_block_size = block_size;
        parallel_search_info->best_pow = current_pow;
      } else if (current_pow - parallel_search_info->best_pow >= 2) {
        parallel_search_info->search_count = AVG_COUNT * MAX_POW;
      }
    }
  } else {
    ParallelFor(task, count, parallel_search_info->best_block_size);
  }
}

ActorThreadPool *GetActorMgrInnerThreadPool() {
  auto actor_manager = ActorMgr::GetActorMgrRef();
  MS_EXCEPTION_IF_NULL(actor_manager);
  auto thread_pool = actor_manager->GetActorThreadPool();
  // Initialize thread_pool if the platform is Windows or Ascend, since graph_scheduler may not have initialized it.
  if (thread_pool == nullptr) {
    size_t actor_thread_num = 0;
    size_t actor_and_kernel_thread_num = 0;
    runtime::ComputeThreadNums(&actor_thread_num, &actor_and_kernel_thread_num);
    size_t actor_queue_size = 81920;
    (void)actor_manager->Initialize(true, actor_thread_num, actor_and_kernel_thread_num, actor_queue_size);
    thread_pool = actor_manager->GetActorThreadPool();
    MS_EXCEPTION_IF_NULL(thread_pool);
  }
  thread_pool->SetKernelThreadMaxSpinCount(kDefaultKernelSpinCount);
  return thread_pool;
}

// Use the thread pool of mindrt.
void ParallelLaunch(const CTask &task, size_t count, float block_size, Content content, ThreadPool *pool) {
  if (count == 0) {
    return;
  }
  auto thread_pool = pool == nullptr ? GetActorMgrInnerThreadPool() : pool;
  size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
  if (kernel_thread_num == 0) {
    MS_LOG(EXCEPTION) << "The actor inner pool has been initialized, but the kernel thread number is 0!";
  }

  size_t thread_num = count < block_size * kernel_thread_num ? std::ceil(count / block_size) : kernel_thread_num;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  size_t task_num = count / once_compute_size;
  if (count % once_compute_size != 0) {
    task_num += 1;
  }
  auto func = [&](void *, int task_id, float, float) {
    size_t start = IntToSize(task_id) * once_compute_size;
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    task(start, end);
    return common::SUCCESS;
  };
  (void)thread_pool->ParallelLaunch(func, content, task_num);
}

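// Overload that distributes a pre-built task list over the kernel threads: each thread
// executes a contiguous slice of `tasks`.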
void ParallelLaunch(const std::vector<common::Task> &tasks, Content content, ThreadPool *pool) {
  size_t count = tasks.size();
  if (count == 0) {
    return;
  }
  auto thread_pool = pool == nullptr ? GetActorMgrInnerThreadPool() : pool;
  size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
  if (kernel_thread_num == 0) {
    MS_LOG(EXCEPTION) << "The actor inner pool has been initialized, but the kernel thread number is 0!";
  }

  size_t thread_num = count < kernel_thread_num ? count : kernel_thread_num;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  size_t task_num = count / once_compute_size;
  if (count % once_compute_size != 0) {
    task_num += 1;
  }
  auto func = [&](void *, int task_id, float, float) {
    size_t start = IntToSize(task_id) * once_compute_size;
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    for (size_t i = start; i < end; ++i) {
      (void)tasks[i]();
    }
    return common::SUCCESS;
  };
  (void)thread_pool->ParallelLaunch(func, content, task_num);
}

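// Same block-size auto-search as CPUKernelUtils::ParallelForAutoSearch, but dispatching
// through ParallelLaunch. On the first call, max_pow is enlarged until the finest searched
// split (2^(max_pow - 1) blocks) exceeds the kernel thread number.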
void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
                              ParallelSearchInfo *parallel_search_info, ThreadPool *pool) {
  if (!parallel_search_info->kernel_thread_num_set) {
    auto thread_pool = pool == nullptr ? GetActorMgrInnerThreadPool() : pool;
    size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
    if (kernel_thread_num == 0) {
      MS_LOG(EXCEPTION) << "The actor inner pool has been initialized, but the kernel thread number is 0!";
    }
    size_t max_pow_current = parallel_search_info->max_pow - 1;
    while (std::pow(2.0f, max_pow_current) <= static_cast<float>(kernel_thread_num)) {
      max_pow_current++;
    }
    parallel_search_info->max_pow = max_pow_current + 1;
    parallel_search_info->kernel_thread_num_set = true;
  }
  const size_t AVG_COUNT = 5;
  size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
  if (current_pow < parallel_search_info->max_pow) {
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      parallel_search_info->tmp_sum_cost_time = 0;
    }
    float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
    double start_time = GetTime();
    ParallelLaunch(task, count, block_size, content, pool);
    double cost_time = GetTime() - start_time;
    parallel_search_info->tmp_sum_cost_time += cost_time;
    parallel_search_info->search_count++;
    if (parallel_search_info->search_count % AVG_COUNT == 0) {
      double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
      if (parallel_search_info->min_cost_time > avg_time) {
        parallel_search_info->min_cost_time = avg_time;
        parallel_search_info->best_block_size = block_size;
        parallel_search_info->best_pow = current_pow;
      } else if (current_pow - parallel_search_info->best_pow >= 2) {
        parallel_search_info->search_count = AVG_COUNT * parallel_search_info->max_pow;
      }
    }
  } else {
    ParallelLaunch(task, count, parallel_search_info->best_block_size, content, pool);
  }
}

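// Collapses `shape` into a 2-D shape {row, col} split at `axis`: dimensions before `axis` are
// folded into the row count, the remaining ones into the column count. For example, shape
// (2, 3, 4) with axis = 1 becomes {2, 12}.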
ShapeVector CPUKernelUtils::FlatShapeByAxis(const ShapeVector &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  int64_t dim_row = 1;
  int64_t dim_col = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (SizeToInt(i) < axis) {
      dim_row *= shape[i];
    } else {
      dim_col *= shape[i];
    }
  }
  // Refer to copy elision: https://en.cppreference.com/w/cpp/language/copy_elision
  // Returning a vector does not cause an extra vector to be constructed or moved.
  return ShapeVector{dim_row, dim_col};
}

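// Iterates over the logical elements of a broadcast output while tracking the matching
// physical positions inside both inputs. A minimal usage sketch, assuming the accessors
// GetInputPosA()/GetInputPosB() from the header and hypothetical buffers a, b and out:
//
//   BroadcastIterator iter(shape_a, shape_b, shape_out);
//   iter.SetPos(start);
//   for (size_t i = start; i < end; ++i) {
//     out[i] = a[iter.GetInputPosA()] + b[iter.GetInputPosB()];
//     iter.GenNextPos();
//   }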
BroadcastIterator::BroadcastIterator(ShapeVector input_shape_a, ShapeVector input_shape_b, ShapeVector output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}

void BroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
    pos /= output_shape_[i];
  }
}

void BroadcastIterator::GenNextPos() {
  // Calculate the next output coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;
    }
  }
}

void BroadcastIterator::BroadcastShape() {
  int input_dimension_a = SizeToInt(input_shape_a_.size());
  if (input_dimension_a < output_dimension_) {
    (void)input_shape_a_.insert(input_shape_a_.begin(), IntToSize(output_dimension_ - input_dimension_a), 1);
  }

  int input_dimension_b = SizeToInt(input_shape_b_.size());
  if (input_dimension_b < output_dimension_) {
    (void)input_shape_b_.insert(input_shape_b_.begin(), IntToSize(output_dimension_ - input_dimension_b), 1);
  }
}

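// Computes row-major strides and back-strides for both inputs. Axes of size 1 get stride 0, so
// advancing along a broadcast output axis re-reads the same input element; e.g. an input of
// shape (1, 3) iterated against output shape (2, 3) reuses its single row for both output rows.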
void BroadcastIterator::InitStrides() {
  if (output_dimension_ <= 0) {
    return;
  }
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }

  // Update strides for broadcast:
  // when the axis size is 1, the stride is 0
  (void)std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(),
                       input_strides_a_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  (void)std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(),
                       input_strides_b_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}

MultipleBroadcastIterator::MultipleBroadcastIterator(std::vector<shape_info> multi_inputs, shape_info output_shape)
    : multi_inputs_(std::move(multi_inputs)), output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  input_pos_.resize(multi_inputs_.size());
  // Allocate strides memory
  multi_inputs_strides_.resize(multi_inputs_.size(), std::vector<int64_t>(output_dimension_, 0));
  multi_inputs_back_strides_.resize(multi_inputs_.size(), std::vector<int64_t>(output_dimension_, 0));
  coordinates_.resize(output_dimension_);
  InitStrides();
}

void MultipleBroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    for (size_t j = 0; j < input_pos_.size(); ++j) {
      input_pos_[j] += coordinates_[i] * multi_inputs_strides_[j][i];
    }
    pos /= output_shape_[i];
  }
}

void MultipleBroadcastIterator::GenNextPos() {
  // Calculate the next output coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      for (size_t j = 0; j < input_pos_.size(); ++j) {
        input_pos_[j] -= multi_inputs_back_strides_[j][i];
      }
    } else {
      ++coordinates_[i];
      for (size_t j = 0; j < input_pos_.size(); ++j) {
        input_pos_[j] += multi_inputs_strides_[j][i];
      }
      break;
    }
  }
}

void MultipleBroadcastIterator::BroadcastShape() {
  for (auto &multi_input : multi_inputs_) {
    int input_dimension = SizeToInt(multi_input.size());
    if (input_dimension < output_dimension_) {
      (void)multi_input.insert(multi_input.begin(), IntToSize(output_dimension_ - input_dimension), 1);
    }
  }
}

void MultipleBroadcastIterator::InitStrides() {
  // Nothing to do for a zero-dimensional output (mirrors BroadcastIterator::InitStrides).
  if (output_dimension_ <= 0) {
    return;
  }
  for (size_t i = 0; i < multi_inputs_.size(); ++i) {
    multi_inputs_strides_[i][output_dimension_ - 1] = 1;
    for (int j = output_dimension_ - 2; j >= 0; --j) {
      multi_inputs_strides_[i][j] = multi_inputs_[i][j + 1] * multi_inputs_strides_[i][j + 1];
      multi_inputs_back_strides_[i][j + 1] = (multi_inputs_[i][j + 1] - 1) * multi_inputs_strides_[i][j + 1];
    }
    // Update strides for broadcast:
    // when the axis size is 1, the stride is 0
    (void)std::transform(multi_inputs_strides_[i].begin(), multi_inputs_strides_[i].end(), multi_inputs_[i].begin(),
                         multi_inputs_strides_[i].begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  }
}

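// Walks an output tensor in logical order while producing the matching flat position in the
// untransposed input: output axis i maps to input axis axes[i]. For example, with input_shape
// (2, 3), axes (1, 0) and output_shape (3, 2), the successive positions are 0, 3, 1, 4, 2, 5.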
TransposeIterator::TransposeIterator(ShapeVector output_shape, std::vector<size_t> axes, const ShapeVector &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate strides
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }

  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }

  // Calculate coordinates by pos
  coordinates_.resize(dimension_);
}

void TransposeIterator::SetPos(size_t pos) {
  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % shape_[i];
    pos_ += coordinates_[i] * strides_[i];
    pos /= shape_[i];
  }
}

void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;
    }
  }
}

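// Computes the numpy-style broadcast shape of x and y: trailing dimensions must be equal or 1.
// For example, x = (2, 1, 3) and y = (4, 3) broadcast to (2, 4, 3). Incompatible dimensions
// are silently skipped here, so callers are expected to validate the shapes beforehand.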
ShapeVector CPUKernelUtils::GetBroadcastShape(const ShapeVector &x, const ShapeVector &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  ShapeVector broadcast_shape;
  ShapeVector broadcast_shape_back;
  for (int i = -SizeToInt(length); i < 0; ++i) {
    if (x[x_len + i] == 1) {
      broadcast_shape_back.push_back(y[y_len + i]);
    } else if (y[y_len + i] == 1) {
      broadcast_shape_back.push_back(x[x_len + i]);
    } else if (x[x_len + i] == y[y_len + i]) {
      broadcast_shape_back.push_back(x[x_len + i]);
    }
  }
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  for (size_t i = 0; i < length; ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}

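// Decomposes `input_shape` around `axis` into outer_size x axis_size x inner_size, so that
// element (outer, k, inner) lives at flat index (outer * axis_size + k) * inner_size + inner.
// For shape (2, 3, 4) and axis = 1: outer_size = 2, axis_size = 3 and inner_size = 4.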
void AxisIterator::Init(const ShapeVector &input_shape, size_t axis) {
  outer_size_ = 1;
  for (size_t i = 0; i < axis; i++) {
    outer_size_ *= LongToSize(input_shape[i]);
  }

  axis_size_ = LongToSize(input_shape[axis]);

  inner_size_ = 1;
  for (size_t i = axis + 1; i < input_shape.size(); ++i) {
    inner_size_ *= LongToSize(input_shape[i]);
  }
}

int Sign(float x) {
  if (x > 0) {
    return 1;
  }
  if (x < 0) {
    return -1;
  }
  return 0;
}

void GetBroadCastIndex(const std::vector<size_t> &unaligned_input_shape, const std::vector<size_t> &output_shape,
                       std::vector<size_t> *index_list) {
  // Given an unaligned input shape and an output shape, this function computes the mapping from
  // logical output indices to the corresponding physical input indices. The result is written
  // to index_list, whose size equals the total element count of the output.
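  // For example, broadcasting an input of shape (1, 3) to an output of shape (2, 3) yields
  // index_list = {0, 1, 2, 0, 1, 2}: both output rows read the same physical input row.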
  constexpr size_t kIsCloseMaxDim = 10;
  size_t logical_shape[kIsCloseMaxDim];
  size_t physical_shape[kIsCloseMaxDim];
  size_t size = 0, output_size = 1;
  // Align the input shape to the output shape by filling ones into the outermost dimensions.
  std::vector<size_t> input_shape(output_shape.size());
  for (size_t i = 0, j = output_shape.size() - unaligned_input_shape.size(); i < output_shape.size(); i++) {
    input_shape[i] = i < j ? 1 : unaligned_input_shape[i - j];
  }
  // Get the logical shape and physical shape of the input. Moreover, merge adjacent dimensions
  // that share the same (logical or physical) property.
  for (size_t i = output_shape.size(); i > 0;) {
    size_t stride = 1;
    bool change = false, is_valid = false;
    while (i > 0 && input_shape[i - 1] == output_shape[i - 1]) {
      stride *= output_shape[i - 1];
      change = is_valid = true;
      --i;
    }
    if (change) {
      output_size *= stride;
      logical_shape[size] = physical_shape[size] = stride;
      size++;
    }
    change = false;
    stride = 1;
    while (i > 0 && input_shape[i - 1] == 1) {
      stride *= output_shape[i - 1];
      change = is_valid = true;
      --i;
    }
    if (change) {
      output_size *= stride;
      logical_shape[size] = 1;
      physical_shape[size] = stride;
      size++;
    }
    if (!is_valid) {
      MS_LOG(EXCEPTION) << "The shapes cannot be broadcast together: input shape is " << unaligned_input_shape
                        << " and output shape is " << output_shape;
    }
  }

  // If the input has a zero dimension (its shape looks like (4, 0, 5) and its data address is
  // null), the output size is zero and there is nothing to map.
  if (output_size < 1) {
    return;
  }

  // Compute the flattened input indices according to "logical_shape" and "physical_shape".
  size_t offset = 1;
  size_t stride = 1;
  index_list->resize(output_size);
  (*index_list)[0] = 0;  // First element is set to 0.
  for (size_t i = 0; i < size; ++i) {
    size_t increment = (logical_shape[i] == physical_shape[i] ? stride : 0);
    for (size_t j = 0; j + offset < physical_shape[i] * offset; ++j) {
      (*index_list)[offset + j] = (*index_list)[j] + increment;
    }
    offset *= physical_shape[i];
    stride *= logical_shape[i];
  }
}
}  // namespace kernel
}  // namespace mindspore