1 /** 2 * Copyright 2019-2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ 18 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ 19 20 #include <functional> 21 #include <memory> 22 #include <numeric> 23 #include <string> 24 #include <thread> 25 #include <vector> 26 27 #include "backend/kernel_compiler/kernel.h" 28 #include "backend/session/anf_runtime_algorithm.h" 29 #include "backend/kernel_compiler/common_utils.h" 30 #include "ir/anf.h" 31 #include "runtime/framework/graph_scheduler.h" 32 #include "actor/actormgr.h" 33 #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) 34 #define PLATFORM_86 35 #endif 36 37 using mindspore::kernel::Address; 38 using mindspore::kernel::AddressPtr; 39 using CTask = std::function<void(size_t, size_t)>; 40 namespace mindspore { 41 namespace kernel { 42 constexpr char KERNEL_SIZE[] = "kernel_size"; 43 constexpr char STRIDE[] = "stride"; 44 constexpr char STRIDES[] = "strides"; 45 constexpr char DILATION[] = "dilation"; 46 constexpr char DILATIONS[] = "dilations"; 47 constexpr char FORMAT[] = "format"; 48 constexpr char PAD[] = "pad"; 49 constexpr char PAD_LIST[] = "pad_list"; 50 constexpr char PAD_MODE[] = "pad_mode"; 51 constexpr char PAD_MODE_LOWER_SAME[] = "same"; 52 constexpr char PAD_MODE_LOWER_VALID[] = "valid"; 53 constexpr char PAD_MODE_UPPER_SAME[] = "SAME"; 54 constexpr char PAD_MODE_UPPER_VALID[] = "VALID"; 55 constexpr char TRANSPOSE_A[] = "transpose_a"; 56 constexpr char TRANSPOSE_B[] = "transpose_b"; 57 constexpr char IS_GRAD[] = "is_grad"; 58 constexpr char TRANSPOSE_NO = 'N'; 59 constexpr char TRANSPOSE_YES = 'T'; 60 constexpr char AXIS[] = "axis"; 61 constexpr char DIM[] = "dim"; 62 constexpr char NUM[] = "num"; 63 constexpr char BEGIN[] = "begin"; 64 constexpr char END[] = "end"; 65 constexpr char SIZE[] = "size"; 66 constexpr char USE_NESTEROV[] = "use_nesterov"; 67 constexpr char GROUP[] = "group"; 68 constexpr char START[] = "start"; 69 constexpr char LIMIT[] = "limit"; 70 constexpr char DELTA[] = "delta"; 71 constexpr char SORTED[] = "sorted"; 72 constexpr char ADJ_ST[] = "adjoint_st"; 73 constexpr char ADJ_dT[] = "adjoint_dt"; 74 constexpr char REDUCTION[] = "reduction"; 75 constexpr char NONE[] = "none"; 76 constexpr char SUM[] = "sum"; 77 constexpr char MEAN[] = "mean"; 78 constexpr char BETA[] = "beta"; 79 constexpr char EXCLUSIVE[] = "exclusive"; 80 constexpr char REVERSE[] = "reverse"; 81 constexpr char PCR[] = "preprocess_collapse_repeated"; 82 constexpr char CTR[] = "ctc_merge_repeated"; 83 constexpr char ILOTI[] = "ignore_longer_outputs_than_inputs"; 84 constexpr char MOMENTUM[] = "momentum"; 85 constexpr char RHO[] = "rho"; 86 constexpr char EPSILON[] = "epsilon"; 87 constexpr char ALIGN_CORNERS[] = "align_corners"; 88 constexpr char PERIODS[] = "periods"; 89 constexpr char WINDOW[] = "window"; 90 constexpr char MIN_PERIODS[] = "min_periods"; 91 constexpr char CENTER[] = "center"; 92 constexpr char METHOD[] = "method"; 93 constexpr char CLOSED[] = "closed"; 94 constexpr char NA_OPTION[] = "na_option"; 95 constexpr char ASCENDING[] = "ascending"; 96 constexpr char PCT[] = "pct"; 97 98 struct ParallelSearchInfo { 99 double min_cost_time{DBL_MAX}; 100 double tmp_sum_cost_time{0}; 101 float best_block_size; 102 size_t best_pow{0}; 103 size_t search_count{0}; 104 }; 105 106 class CPUKernel : public kernel::KernelMod { 107 public: 108 CPUKernel() = default; 109 ~CPUKernel() override = default; 110 virtual void Init(const CNodePtr &kernel_node); 111 virtual void InitKernel(const CNodePtr &kernel_node) = 0; Launch(const std::vector<AddressPtr> & inputs,const std::vector<AddressPtr> & workspace,const std::vector<AddressPtr> & outputs,void *)112 bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, 113 const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override { 114 return Launch(inputs, workspace, outputs); 115 }; 116 virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, 117 const std::vector<AddressPtr> &outputs) = 0; GetInputSizeList()118 const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } GetOutputSizeList()119 const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } GetWorkspaceSizeList()120 const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } 121 122 protected: 123 virtual void InitInputOutputSize(const CNodePtr &kernel_node); 124 std::vector<size_t> input_size_list_; 125 std::vector<size_t> output_size_list_; 126 std::vector<size_t> workspace_size_list_; 127 ParallelSearchInfo parallel_search_info_; 128 129 template <typename T> GetDeviceAddress(const std::vector<AddressPtr> & addr_list,size_t index)130 inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) { 131 if (index >= addr_list.size()) { 132 MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")"; 133 } 134 135 if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) { 136 MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index; 137 } 138 139 return reinterpret_cast<T *>(addr_list[index]->addr); 140 } 141 }; 142 143 class CPUKernelUtils { 144 public: 145 static void ExpandDimsTo4(std::vector<size_t> *shape); 146 static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3); 147 static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis); 148 static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num); 149 static void ParallelFor(const CTask &task, size_t count, float block_size = 128.0); 150 static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis); 151 static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y); 152 static void ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info); 153 }; 154 155 class BroadcastIterator { 156 public: 157 BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b, 158 std::vector<size_t> output_shape); 159 virtual ~BroadcastIterator() = default; GetInputPosA()160 inline size_t GetInputPosA() const { return input_pos_[0]; } GetInputPosB()161 inline size_t GetInputPosB() const { return input_pos_[1]; } 162 void SetPos(size_t pos); 163 void GenNextPos(); 164 165 private: 166 void BroadcastShape(); 167 void InitStrides(); 168 169 std::vector<size_t> coordinates_; 170 std::vector<size_t> input_shape_a_; 171 std::vector<size_t> input_shape_b_; 172 std::vector<size_t> output_shape_; 173 std::vector<size_t> input_strides_a_; 174 std::vector<size_t> input_strides_b_; 175 std::vector<size_t> input_back_strides_a_; 176 std::vector<size_t> input_back_strides_b_; 177 std::array<size_t, 2> input_pos_{0}; 178 int output_dimension_{0}; 179 }; 180 181 class TransposeIterator { 182 public: 183 TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape); 184 virtual ~TransposeIterator() = default; GetPos()185 inline size_t GetPos() const { return pos_; } 186 void SetPos(size_t pos); 187 void GenNextPos(); 188 189 private: 190 int dimension_{0}; 191 std::vector<size_t> coordinates_; 192 std::vector<size_t> shape_; 193 std::vector<size_t> strides_; 194 std::vector<size_t> back_strides_; 195 std::vector<size_t> axes_; 196 size_t pos_{0}; 197 }; 198 199 ActorThreadPool *GetActorMgrInnerThreadPool(); 200 void ParallelLaunch(const CTask &task, size_t count, float block_size = 128.0, Content content = nullptr); 201 void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content, 202 ParallelSearchInfo *parallel_search_info); 203 204 class AxisIterator { 205 public: 206 AxisIterator() = default; 207 virtual ~AxisIterator() = default; 208 void Init(const std::vector<size_t> &input_shape, size_t axis); 209 SetOffset(size_t outer_index,size_t inner_index)210 inline void SetOffset(size_t outer_index, size_t inner_index) { 211 axis_offset_ = outer_index * axis_size_ * inner_size_ + inner_index; 212 } GetPos(size_t i)213 inline size_t GetPos(size_t i) const { return axis_offset_ + i * inner_size_; } RevertPos(size_t i)214 inline size_t RevertPos(size_t i) const { return (i - axis_offset_) / inner_size_; } 215 OuterSize()216 inline size_t OuterSize() const { return outer_size_; } AxisSize()217 inline size_t AxisSize() const { return axis_size_; } InnerSize()218 inline size_t InnerSize() const { return inner_size_; } 219 220 private: 221 size_t outer_size_{0}; 222 size_t axis_size_{0}; 223 size_t inner_size_{0}; 224 size_t axis_offset_{0}; 225 }; 226 } // namespace kernel 227 } // namespace mindspore 228 229 #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ 230