• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
18 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
19 
#include <array>
#include <cfloat>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include "kernel/kernel.h"
#include "plugin/factory/ms_factory.h"
#include "plugin/device/cpu/kernel/cpu_kernel_mod.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "kernel/common_utils.h"
#include "ir/anf.h"
#include "actor/actormgr.h"
#include "include/common/thread_pool.h"
#include "include/backend/visible.h"
40 
41 using mindspore::kernel::Address;
42 using mindspore::kernel::AddressPtr;
43 using mindspore::kernel::KernelTensor;
44 using CTask = std::function<void(size_t, size_t)>;
45 namespace mindspore {
46 namespace kernel {
47 constexpr char KERNEL_SIZE[] = "kernel_size";
48 constexpr char VALIDATE_INDICES[] = "validate_indices";
49 constexpr char STRIDE[] = "stride";
50 constexpr char STRIDES[] = "strides";
51 constexpr char DILATION[] = "dilation";
52 constexpr char DILATIONS[] = "dilations";
53 constexpr char FORMAT[] = "format";
54 constexpr char PAD[] = "pad";
55 constexpr char PAD_LIST[] = "pad_list";
56 constexpr char PAD_MODE[] = "pad_mode";
57 constexpr char PAD_MODE_LOWER_SAME[] = "same";
58 constexpr char PAD_MODE_LOWER_VALID[] = "valid";
59 constexpr char PAD_MODE_LOWER_PAD[] = "pad";
60 constexpr char PAD_MODE_UPPER_SAME[] = "SAME";
61 constexpr char PAD_MODE_UPPER_VALID[] = "VALID";
62 constexpr char PAD_MODE_UPPER_PAD[] = "PAD";
63 constexpr char COUNT_INCLUDE_PAD[] = "count_include_pad";
64 constexpr char CEIL_MODE[] = "ceil_mode";
65 constexpr char DIVISOR_OVERRIDE[] = "divisor_override";
66 constexpr char TRANSPOSE_A[] = "transpose_a";
67 constexpr char TRANSPOSE_B[] = "transpose_b";
68 constexpr char IS_GRAD[] = "is_grad";
69 constexpr char TRANSPOSE_NO = 'N';
70 constexpr char TRANSPOSE_YES = 'T';
71 constexpr char AXIS[] = "axis";
72 constexpr char DIM[] = "dim";
73 constexpr char NUM[] = "num";
74 constexpr char BEGIN[] = "begin";
75 constexpr char END[] = "end";
76 constexpr char SIZE[] = "size";
77 constexpr char USE_NESTEROV[] = "use_nesterov";
78 constexpr char GROUP[] = "group";
79 constexpr char START[] = "start";
80 constexpr char LIMIT[] = "limit";
81 constexpr char DELTA[] = "delta";
82 constexpr char SORTED[] = "sorted";
83 constexpr char ADJ_ST[] = "adjoint_st";
84 constexpr char ADJ_dT[] = "adjoint_dt";
85 constexpr char REDUCTION[] = "reduction";
86 constexpr char NONE[] = "none";
87 constexpr char SUM[] = "sum";
88 constexpr char MEAN[] = "mean";
89 constexpr char BETA[] = "beta";
90 constexpr char EXCLUSIVE[] = "exclusive";
91 constexpr char REVERSE[] = "reverse";
92 constexpr char PCR[] = "preprocess_collapse_repeated";
93 constexpr char CTR[] = "ctc_merge_repeated";
94 constexpr char ILOTI[] = "ignore_longer_outputs_than_inputs";
95 constexpr char MOMENTUM[] = "momentum";
96 constexpr char RHO[] = "rho";
97 constexpr char EPSILON[] = "epsilon";
98 constexpr char ALIGN_CORNERS[] = "align_corners";
99 constexpr char PERIODS[] = "periods";
100 constexpr char WINDOW[] = "window";
101 constexpr char MIN_PERIODS[] = "min_periods";
102 constexpr char CENTER[] = "center";
103 constexpr char METHOD[] = "method";
104 constexpr char CLOSED[] = "closed";
105 constexpr char NA_OPTION[] = "na_option";
106 constexpr char ASCENDING[] = "ascending";
107 constexpr char PCT[] = "pct";
108 constexpr char LOWER[] = "lower";
109 constexpr char CLEAN[] = "clean";
110 constexpr char TRANS[] = "trans";
111 constexpr char MODE[] = "mode";
112 constexpr char UNIT_DIAGONAL[] = "unit_diagonal";
113 constexpr char C_EIEH_VECTOR[] = "compute_eigenvectors";
114 constexpr char COMPUTE_V[] = "compute_v";
115 constexpr char ADJOINT[] = "adjoint";
116 constexpr char ALIGNMENT[] = "alignment";
117 constexpr char NCHW[] = "NCHW";
118 constexpr char NCDHW[] = "NCDHW";
119 constexpr char USE_LOCKING[] = "use_locking";
120 constexpr char OP[] = "op";
121 constexpr char SET_OPERATION[] = "set_operation";
122 
123 constexpr size_t NC_LEN = 2;
124 constexpr size_t SHAPE_4D = 4;
125 constexpr size_t SHAPE_5D = 5;
126 constexpr size_t N_INDEX = 0;
127 constexpr size_t C_INDEX = 1;
128 constexpr size_t D_INDEX = 2;
129 constexpr size_t H_INDEX = 3;
130 constexpr size_t W_INDEX = 4;
131 
132 struct ParallelSearchInfo {
133   double min_cost_time{DBL_MAX};
134   double tmp_sum_cost_time{0.f};
135   float best_block_size{0.f};
136   size_t best_pow{0};
137   size_t search_count{0};
138   bool kernel_thread_num_set{false};
139   size_t max_pow{6};
140 };
141 
142 class BACKEND_EXPORT NativeCpuKernelMod : public CpuKernelMod {
143  public:
144   NativeCpuKernelMod() = default;
145   ~NativeCpuKernelMod() override = default;
Launch(const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & workspace,const std::vector<KernelTensor * > & outputs,void *)146   bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace,
147               const std::vector<KernelTensor *> &outputs, void * /*stream_ptr*/) override {
148     return Launch(inputs, workspace, outputs);
149   }
Launch(const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & workspace,const std::vector<KernelTensor * > & outputs)150   virtual bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace,
151                       const std::vector<KernelTensor *> &outputs) {
152     return true;
153   }
154   // Must be called before Init.
SetThreadPool(ThreadPool * pool)155   void SetThreadPool(ThreadPool *pool) { pool_ = pool; }
156 
GetCpuSupportedList(const std::string & kernel_name)157   static std::vector<KernelAttr> GetCpuSupportedList(const std::string &kernel_name) {
158     auto temp_mod = kernel::Factory<NativeCpuKernelMod>::Instance().Create(kernel_name);
159     if (temp_mod == nullptr) {
160       MS_LOG(INFO) << "Not register CPU kernel of operator: " << kernel_name;
161       return std::vector<KernelAttr>{};
162     }
163     return temp_mod->GetAllSupportedList(kernel_name);
164   }
165 
GetOpSupport()166   std::vector<KernelAttr> GetOpSupport() override { return {}; }
167 
GetKernelModType()168   enum KernelModType GetKernelModType() const override { return KernelModType::NativeCpuKernelMod; }
169 
170   ParallelSearchInfo parallel_search_info_;
171 
172  protected:
173   ThreadPool *pool_{nullptr};
174 
175  private:
176   std::vector<KernelAttr> GetAllSupportedList(const std::string &kernel_name);
177   std::vector<KernelAttr> GetSupportFromOpLib(const std::string &kernel_name) const;
178   inline static mindspore::HashMap<std::string, std::vector<KernelAttr>> support_map_;
179 };
180 
// Pluggable body of a CPU kernel: a kernel mod can delegate its
// Init/Resize/Run steps to a CpuKernelFunc implementation. Two interface
// generations coexist; all defaults are success no-ops so implementers
// override only what they need.
class CpuKernelFunc {
 public:
  CpuKernelFunc() = default;
  virtual ~CpuKernelFunc() = default;
  ///////////// new func ///////////////
  // One-time initialization hooks for the KernelTensor-based (new) interface.
  virtual void InitFunc(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {}
  virtual void InitFunc(const PrimitivePtr &primitive, const std::vector<KernelTensor *> &inputs,
                        const std::vector<KernelTensor *> &outputs) {}
  // Shape-dependent re-initialization; returns a KRET_* status code.
  virtual int Resize(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {
    return KRET_OK;
  }
  // Executes the kernel body; returns true on success.
  virtual bool RunFunc(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace,
                       const std::vector<KernelTensor *> &outputs) {
    return true;
  }
  ///////////// old func ///////////////
  // Legacy BaseOperator / AddressPtr-based hooks, kept for kernels that have
  // not migrated to the KernelTensor interface yet.
  virtual void InitFunc(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
                        const std::vector<KernelTensorPtr> &outputs) {}
  virtual int Resize(
    const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
    const std::vector<KernelTensorPtr> &outputs,
    const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost = std::map<uint32_t, tensor::TensorPtr>()) {
    return KRET_OK;
  }
  virtual bool RunFunc(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                       const std::vector<AddressPtr> &outputs) {
    return true;
  }
  ////////////////////////////////////
  // Per-functor state for the automatic parallel-block-size search.
  ParallelSearchInfo parallel_search_info_;
};
212 
213 class CPUKernelUtils {
214  public:
215   static void ExpandDimsTo4(ShapeVector *shape);
216   static size_t CalcOffset(const ShapeVector &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
217   static size_t GetElementNumOnAxis(const ShapeVector &shape, int axis);
218   static void GetElementNumEveryDim(const ShapeVector &shape, std::vector<size_t> *element_num);
219   static void ParallelFor(const CTask &task, size_t count, float block_size = 128.0);
220   static ShapeVector FlatShapeByAxis(const ShapeVector &shape, int axis);
221   static ShapeVector GetBroadcastShape(const std::vector<int64_t> &x, const std::vector<int64_t> &y);
222   static void ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info);
223   template <typename T>
CalcElementNum(const std::vector<T> & shape)224   inline static T CalcElementNum(const std::vector<T> &shape) {
225     T total = std::accumulate(shape.begin(), shape.end(), T(1), std::multiplies<T>());
226     return total;
227   }
228   template <typename T>
CalcSegmentIds(const T * segment_ids_data_addr,const size_t segment_ids_num)229   inline static std::vector<int64_t> CalcSegmentIds(const T *segment_ids_data_addr, const size_t segment_ids_num) {
230     std::vector<int64_t> segments;
231     int64_t seg_tmp = 1;
232     for (size_t i = 0; i < segment_ids_num - 1; ++i) {
233       if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
234         seg_tmp++;
235       } else {
236         segments.push_back(seg_tmp);
237         seg_tmp = 1;
238       }
239       const size_t last_loc = 2;
240       if (i == segment_ids_num - last_loc) {
241         segments.push_back(seg_tmp);
242       }
243     }
244     if (segment_ids_num == 1) {
245       segments.push_back(seg_tmp);
246     }
247     return segments;
248   }
249 };
250 
// Lock-step iterator over two inputs and their broadcast output: after
// SetPos(), GetInputPosA()/GetInputPosB() give the flat element positions in
// each input that correspond to the current output position. SetPos and
// GenNextPos are defined out of line.
class BroadcastIterator {
 public:
  BroadcastIterator(ShapeVector input_shape_a, ShapeVector input_shape_b, ShapeVector output_shape);
  virtual ~BroadcastIterator() = default;
  // Current flat position inside input a / input b.
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  void SetPos(size_t pos);
  void GenNextPos();

 private:
  void BroadcastShape();
  void InitStrides();

  ShapeVector coordinates_;  // current multi-index in the output
  ShapeVector input_shape_a_;
  ShapeVector input_shape_b_;
  ShapeVector output_shape_;
  ShapeVector input_strides_a_;
  ShapeVector input_strides_b_;
  // NOTE(review): presumably rewind deltas applied when an axis wraps during
  // GenNextPos — confirm against the definitions in the .cc.
  ShapeVector input_back_strides_a_;
  ShapeVector input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};  // [0] = pos in a, [1] = pos in b
  int output_dimension_{0};
};
275 
// Defined out of line; from the signature it presumably fills *index_list with
// the flat input position matching each output position under broadcasting —
// confirm against the implementation.
void GetBroadCastIndex(const std::vector<size_t> &unaligned_input_shape, const std::vector<size_t> &output_shape,
                       std::vector<size_t> *index_list);
278 
// Broadcast for multi_inputs and single output: generalizes BroadcastIterator
// to any number of inputs sharing one broadcast output shape. SetPos and
// GenNextPos are defined out of line.
class MultipleBroadcastIterator {
 public:
  using shape_info = ShapeVector;
  MultipleBroadcastIterator(std::vector<shape_info> multi_inputs, shape_info output_shape);
  virtual ~MultipleBroadcastIterator() = default;
  // Current flat position inside input `index`.
  inline size_t GetInputPos(size_t index) const { return LongToSize(input_pos_[index]); }
  void SetPos(size_t pos);
  void GenNextPos();

 private:
  void BroadcastShape();
  void InitStrides();

  shape_info coordinates_;  // current multi-index in the output
  std::vector<shape_info> multi_inputs_;
  shape_info output_shape_;
  std::vector<shape_info> multi_inputs_strides_;
  std::vector<shape_info> multi_inputs_back_strides_;
  shape_info input_pos_;  // one flat position per input
  int output_dimension_{0};
};
301 
// Iterator for walking a tensor under an axis permutation: GetPos() returns
// the current flat position (presumably into the original input buffer — the
// stride setup lives in the out-of-line constructor; confirm in the .cc).
class TransposeIterator {
 public:
  TransposeIterator(ShapeVector output_shape, std::vector<size_t> axes, const ShapeVector &input_shape);
  virtual ~TransposeIterator() = default;
  // Current flat position.
  inline size_t GetPos() const { return pos_; }
  void SetPos(size_t pos);
  void GenNextPos();

 private:
  int dimension_{0};
  ShapeVector coordinates_;  // current multi-index
  ShapeVector shape_;
  ShapeVector strides_;
  ShapeVector back_strides_;
  std::vector<size_t> axes_;  // axis permutation
  size_t pos_{0};
};
319 
// Shared thread pool owned by the actor manager (defined out of line).
ActorThreadPool *GetActorMgrInnerThreadPool();
// Runs `task` over [0, count) on the thread pool; given CTask's
// (start, end) signature, block_size presumably controls the chunk
// granularity — confirm in the implementation.
void ParallelLaunch(const CTask &task, size_t count, float block_size = 128.0, Content content = nullptr,
                    ThreadPool *pool = nullptr);
// Overload that launches a prepared list of tasks.
void ParallelLaunch(const std::vector<common::Task> &tasks, Content content = nullptr, ThreadPool *pool = nullptr);
// Like ParallelLaunch, but tunes the block size across calls using the state
// recorded in *parallel_search_info.
void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
                              ParallelSearchInfo *parallel_search_info, ThreadPool *pool = nullptr);
326 
327 // Deal with pytorch style axis iteration, to iterate every value on specific axis
328 class AxisIterator {
329  public:
330   AxisIterator() = default;
331   virtual ~AxisIterator() = default;
332   void Init(const ShapeVector &input_shape, size_t axis);
SetOffset(size_t index)333   inline void SetOffset(size_t index) {
334     size_t outer_index = index / inner_size_;
335     size_t inner_index = index % inner_size_;
336     axis_offset_ = outer_index * axis_size_ * inner_size_ + inner_index;
337   }
338 
SetOffset(size_t outer_index,size_t inner_index)339   inline void SetOffset(size_t outer_index, size_t inner_index) {
340     axis_offset_ = outer_index * axis_size_ * inner_size_ + inner_index;
341   }
GetPos(size_t i)342   inline size_t GetPos(size_t i) const { return axis_offset_ + i * inner_size_; }
RevertPos(size_t i)343   inline size_t RevertPos(size_t i) const { return (i - axis_offset_) / inner_size_; }
344 
OuterSize()345   inline size_t OuterSize() const { return outer_size_; }
AxisSize()346   inline size_t AxisSize() const { return axis_size_; }
InnerSize()347   inline size_t InnerSize() const { return inner_size_; }
348 
349  private:
350   size_t outer_size_{0};
351   size_t axis_size_{0};
352   size_t inner_size_{0};
353   size_t axis_offset_{0};
354 };
355 
356 template <size_t Ndim>
357 class NdTensorIterator {
358  public:
359   template <typename... Indexes>
NdTensorIterator(int64_t first_dim,Indexes...rest_dims)360   NdTensorIterator(int64_t first_dim, Indexes... rest_dims)
361       : dims_{{first_dim, rest_dims...}}, size_{(first_dim * ... * rest_dims)} {
362     static_assert(sizeof...(rest_dims) + 1 == Ndim, "Input dimensions should match Ndim");
363   }
364 
365   template <typename... Indexes>
operator()366   int64_t operator()(const Indexes... dims) const {
367     static_assert(sizeof...(dims) == Ndim, "Input dimensions should match Ndim");
368     return CalIndex(0, dims...);
369   }
370 
371   template <typename... Indexes>
at(const Indexes...dims)372   int64_t at(const Indexes... dims) const {
373     static_assert(sizeof...(dims) == Ndim, "Input dimensions should match Ndim");
374     const int64_t index = CalIndex<true>(0, dims...);
375     if (index > size_) {
376       MS_LOG(ERROR) << "Pos " << index << " is larger than array size " << size_;
377     }
378     return index;
379   }
380 
381  private:
382   template <bool CheckParam = false, typename... Indexes>
CalIndex(const int64_t sum,const int64_t first_dim,const Indexes...rest_dims)383   int64_t CalIndex(const int64_t sum, const int64_t first_dim, const Indexes... rest_dims) const {
384     constexpr auto n = Ndim - sizeof...(rest_dims);
385     if constexpr (CheckParam) {
386       if (first_dim >= std::get<n - 1>(dims_)) {
387         MS_LOG(ERROR) << "Error on index " << (n - 1) << ", " << first_dim << " should be lower than "
388                       << std::get<n - 1>(dims_);
389       }
390     }
391     return CalIndex<CheckParam>((sum + first_dim) * std::get<n>(dims_), rest_dims...);
392   }
393 
394   template <bool CheckParam = false>
CalIndex(const int64_t sum,const int64_t first_dim)395   int64_t CalIndex(const int64_t sum, const int64_t first_dim) const {
396     if constexpr (CheckParam) {
397       if (first_dim >= std::get<Ndim - 1>(dims_)) {
398         MS_LOG(ERROR) << "Error on index " << (Ndim - 1) << ", " << first_dim << " should be lower than "
399                       << std::get<Ndim - 1>(dims_);
400       }
401     }
402     return sum + first_dim;
403   }
404 
405   const std::array<int64_t, Ndim> dims_;
406   const int64_t size_;
407 };
// Defined out of line; presumably returns the sign of x (-1/0/+1) — confirm
// against the implementation.
int Sign(float x);
409 }  // namespace kernel
410 }  // namespace mindspore
411 
412 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
413