• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
18 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
19 
#include <array>
#include <cfloat>
#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
26 
27 #include "backend/kernel_compiler/kernel.h"
28 #include "backend/session/anf_runtime_algorithm.h"
29 #include "backend/kernel_compiler/common_utils.h"
30 #include "ir/anf.h"
31 #include "runtime/framework/graph_scheduler.h"
32 #include "actor/actormgr.h"
33 #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
34 #define PLATFORM_86
35 #endif
36 
37 using mindspore::kernel::Address;
38 using mindspore::kernel::AddressPtr;
39 using CTask = std::function<void(size_t, size_t)>;
40 namespace mindspore {
41 namespace kernel {
// Compile-time text constants shared by the CPU kernels. Each array holds exactly the
// literal shown on its line.
// NOTE(review): these look like operator attribute names / attribute values; confirm
// against the kernels that read them before relying on that.
constexpr char KERNEL_SIZE[]{"kernel_size"};
constexpr char STRIDE[]{"stride"};
constexpr char STRIDES[]{"strides"};
constexpr char DILATION[]{"dilation"};
constexpr char DILATIONS[]{"dilations"};
constexpr char FORMAT[]{"format"};
constexpr char PAD[]{"pad"};
constexpr char PAD_LIST[]{"pad_list"};
constexpr char PAD_MODE[]{"pad_mode"};
// The same two words are provided in both lower-case and upper-case spellings.
constexpr char PAD_MODE_LOWER_SAME[]{"same"};
constexpr char PAD_MODE_LOWER_VALID[]{"valid"};
constexpr char PAD_MODE_UPPER_SAME[]{"SAME"};
constexpr char PAD_MODE_UPPER_VALID[]{"VALID"};
constexpr char TRANSPOSE_A[]{"transpose_a"};
constexpr char TRANSPOSE_B[]{"transpose_b"};
constexpr char IS_GRAD[]{"is_grad"};
// Unlike their neighbours, these two are single characters, not strings.
constexpr char TRANSPOSE_NO{'N'};
constexpr char TRANSPOSE_YES{'T'};
constexpr char AXIS[]{"axis"};
constexpr char DIM[]{"dim"};
constexpr char NUM[]{"num"};
constexpr char BEGIN[]{"begin"};
constexpr char END[]{"end"};
constexpr char SIZE[]{"size"};
constexpr char USE_NESTEROV[]{"use_nesterov"};
constexpr char GROUP[]{"group"};
constexpr char START[]{"start"};
constexpr char LIMIT[]{"limit"};
constexpr char DELTA[]{"delta"};
constexpr char SORTED[]{"sorted"};
// Note the mixed-case identifier ADJ_dT; its value is the all-lower-case "adjoint_dt".
constexpr char ADJ_ST[]{"adjoint_st"};
constexpr char ADJ_dT[]{"adjoint_dt"};
constexpr char REDUCTION[]{"reduction"};
constexpr char NONE[]{"none"};
constexpr char SUM[]{"sum"};
constexpr char MEAN[]{"mean"};
constexpr char BETA[]{"beta"};
constexpr char EXCLUSIVE[]{"exclusive"};
constexpr char REVERSE[]{"reverse"};
// Abbreviated identifiers; the literal on each line spells out the full text.
constexpr char PCR[]{"preprocess_collapse_repeated"};
constexpr char CTR[]{"ctc_merge_repeated"};
constexpr char ILOTI[]{"ignore_longer_outputs_than_inputs"};
constexpr char MOMENTUM[]{"momentum"};
constexpr char RHO[]{"rho"};
constexpr char EPSILON[]{"epsilon"};
constexpr char ALIGN_CORNERS[]{"align_corners"};
constexpr char PERIODS[]{"periods"};
constexpr char WINDOW[]{"window"};
constexpr char MIN_PERIODS[]{"min_periods"};
constexpr char CENTER[]{"center"};
constexpr char METHOD[]{"method"};
constexpr char CLOSED[]{"closed"};
constexpr char NA_OPTION[]{"na_option"};
constexpr char ASCENDING[]{"ascending"};
constexpr char PCT[]{"pct"};
97 
// State record handed (by pointer) to CPUKernelUtils::ParallelForAutoSearch and
// ParallelLaunchAutoSearch; CPUKernel keeps one instance per kernel.
// Every field has a default so a freshly constructed record is fully defined.
// NOTE(review): the per-field meanings below are inferred from the names; the code that
// updates them is not in this header — confirm against the search implementation.
struct ParallelSearchInfo {
  // Starts at the largest finite double (same value as DBL_MAX).
  // Presumably the lowest cost observed so far, so any measurement replaces it.
  double min_cost_time{std::numeric_limits<double>::max()};
  // Presumably a running total of cost for the candidate currently being measured.
  double tmp_sum_cost_time{0};
  // Previously left uninitialised, so reading it before the search assigned it was
  // undefined behaviour; now starts at 0 like the other fields.
  float best_block_size{0};
  size_t best_pow{0};
  size_t search_count{0};
};
105 
106 class CPUKernel : public kernel::KernelMod {
107  public:
108   CPUKernel() = default;
109   ~CPUKernel() override = default;
110   virtual void Init(const CNodePtr &kernel_node);
111   virtual void InitKernel(const CNodePtr &kernel_node) = 0;
Launch(const std::vector<AddressPtr> & inputs,const std::vector<AddressPtr> & workspace,const std::vector<AddressPtr> & outputs,void *)112   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
113               const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
114     return Launch(inputs, workspace, outputs);
115   };
116   virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
117                       const std::vector<AddressPtr> &outputs) = 0;
GetInputSizeList()118   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
GetOutputSizeList()119   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
GetWorkspaceSizeList()120   const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
121 
122  protected:
123   virtual void InitInputOutputSize(const CNodePtr &kernel_node);
124   std::vector<size_t> input_size_list_;
125   std::vector<size_t> output_size_list_;
126   std::vector<size_t> workspace_size_list_;
127   ParallelSearchInfo parallel_search_info_;
128 
129   template <typename T>
GetDeviceAddress(const std::vector<AddressPtr> & addr_list,size_t index)130   inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
131     if (index >= addr_list.size()) {
132       MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
133     }
134 
135     if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
136       MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index;
137     }
138 
139     return reinterpret_cast<T *>(addr_list[index]->addr);
140   }
141 };
142 
143 class CPUKernelUtils {
144  public:
145   static void ExpandDimsTo4(std::vector<size_t> *shape);
146   static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
147   static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
148   static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
149   static void ParallelFor(const CTask &task, size_t count, float block_size = 128.0);
150   static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
151   static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
152   static void ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info);
153 };
154 
// Position tracker for two inputs (A and B) against one output shape.
// GetInputPosA() / GetInputPosB() expose the two stored positions; SetPos() and
// GenNextPos() are defined out of line.
// NOTE(review): presumably walks the output in linear order while applying broadcasting
// rules to A and B — confirm against the implementation.
class BroadcastIterator {
 public:
  // The three shapes are taken by value.
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;

  // Current position for input A (slot 0 of input_pos_).
  size_t GetInputPosA() const {
    return input_pos_[0];
  }
  // Current position for input B (slot 1 of input_pos_).
  size_t GetInputPosB() const {
    return input_pos_[1];
  }

  void SetPos(size_t pos);
  void GenNextPos();

 private:
  // Internal setup steps, defined out of line.
  void BroadcastShape();
  void InitStrides();

  std::vector<size_t> coordinates_;
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  // Slot 0 is input A, slot 1 is input B; both start at zero.
  std::array<size_t, 2> input_pos_{0, 0};
  int output_dimension_{0};
};
180 
// Position tracker built from an output shape, an axes list and an input shape.
// GetPos() exposes the stored position; SetPos() and GenNextPos() are defined out of line.
// NOTE(review): presumably maps successive output elements of a transpose to their
// input offsets — confirm against the implementation.
class TransposeIterator {
 public:
  // output_shape and axes are taken by value; input_shape by const reference.
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;

  // Current stored position (zero until changed by SetPos / GenNextPos).
  size_t GetPos() const {
    return pos_;
  }

  void SetPos(size_t pos);
  void GenNextPos();

 private:
  int dimension_{0};
  std::vector<size_t> coordinates_;
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<size_t> axes_;
  size_t pos_{0};
};
198 
199 ActorThreadPool *GetActorMgrInnerThreadPool();
200 void ParallelLaunch(const CTask &task, size_t count, float block_size = 128.0, Content content = nullptr);
201 void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
202                               ParallelSearchInfo *parallel_search_info);
203 
// Index helper that splits a flat buffer into (outer, axis, inner) extents.
// All extents and the offset are zero after default construction; Init() is defined out
// of line.
// NOTE(review): Init presumably derives the three extents from input_shape and axis —
// confirm against the implementation.
class AxisIterator {
 public:
  AxisIterator() = default;
  virtual ~AxisIterator() = default;
  void Init(const std::vector<size_t> &input_shape, size_t axis);

  // Stores the flat offset of element 0 along the axis for the given outer/inner pair:
  // outer_index * (axis_size_ * inner_size_) + inner_index.
  void SetOffset(size_t outer_index, size_t inner_index) {
    const size_t outer_stride = axis_size_ * inner_size_;
    axis_offset_ = outer_index * outer_stride + inner_index;
  }

  // Flat position of the i-th element along the axis, relative to the stored offset.
  size_t GetPos(size_t i) const {
    const size_t step = i * inner_size_;
    return axis_offset_ + step;
  }

  // Inverse of GetPos: converts a flat position back to an index along the axis.
  // Divides by inner_size_, so it must not be called while inner_size_ is still zero.
  size_t RevertPos(size_t i) const {
    const size_t distance = i - axis_offset_;
    return distance / inner_size_;
  }

  size_t OuterSize() const {
    return outer_size_;
  }
  size_t AxisSize() const {
    return axis_size_;
  }
  size_t InnerSize() const {
    return inner_size_;
  }

 private:
  size_t outer_size_{0};
  size_t axis_size_{0};
  size_t inner_size_{0};
  size_t axis_offset_{0};
};
226 }  // namespace kernel
227 }  // namespace mindspore
228 
229 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
230