1 /** 2 * Copyright 2019-2022 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ 18 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ 19 20 #include <functional> 21 #include <memory> 22 #include <numeric> 23 #include <string> 24 #include <thread> 25 #include <utility> 26 #include <vector> 27 #include <map> 28 #include <set> 29 30 #include "kernel/kernel.h" 31 #include "plugin/factory/ms_factory.h" 32 #include "plugin/device/cpu/kernel/cpu_kernel_mod.h" 33 #include "include/backend/anf_runtime_algorithm.h" 34 #include "include/common/utils/anfalgo.h" 35 #include "kernel/common_utils.h" 36 #include "ir/anf.h" 37 #include "actor/actormgr.h" 38 #include "include/common/thread_pool.h" 39 #include "include/backend/visible.h" 40 41 using mindspore::kernel::Address; 42 using mindspore::kernel::AddressPtr; 43 using mindspore::kernel::KernelTensor; 44 using CTask = std::function<void(size_t, size_t)>; 45 namespace mindspore { 46 namespace kernel { 47 constexpr char KERNEL_SIZE[] = "kernel_size"; 48 constexpr char VALIDATE_INDICES[] = "validate_indices"; 49 constexpr char STRIDE[] = "stride"; 50 constexpr char STRIDES[] = "strides"; 51 constexpr char DILATION[] = "dilation"; 52 constexpr char DILATIONS[] = "dilations"; 53 constexpr char FORMAT[] = "format"; 54 constexpr char PAD[] = "pad"; 55 constexpr char PAD_LIST[] = "pad_list"; 56 constexpr char PAD_MODE[] = "pad_mode"; 57 constexpr char PAD_MODE_LOWER_SAME[] = "same"; 58 constexpr char PAD_MODE_LOWER_VALID[] = "valid"; 59 constexpr char PAD_MODE_LOWER_PAD[] = "pad"; 60 constexpr char PAD_MODE_UPPER_SAME[] = "SAME"; 61 constexpr char PAD_MODE_UPPER_VALID[] = "VALID"; 62 constexpr char PAD_MODE_UPPER_PAD[] = "PAD"; 63 constexpr char COUNT_INCLUDE_PAD[] = "count_include_pad"; 64 constexpr char CEIL_MODE[] = "ceil_mode"; 65 constexpr char DIVISOR_OVERRIDE[] = "divisor_override"; 66 constexpr char TRANSPOSE_A[] = "transpose_a"; 67 constexpr char TRANSPOSE_B[] = "transpose_b"; 68 constexpr char IS_GRAD[] = "is_grad"; 69 constexpr char TRANSPOSE_NO = 'N'; 70 constexpr char TRANSPOSE_YES = 'T'; 71 constexpr char AXIS[] = "axis"; 72 constexpr char DIM[] = "dim"; 73 constexpr char NUM[] = "num"; 74 constexpr char BEGIN[] = "begin"; 75 constexpr char END[] = "end"; 76 constexpr char SIZE[] = "size"; 77 constexpr char USE_NESTEROV[] = "use_nesterov"; 78 constexpr char GROUP[] = "group"; 79 constexpr char START[] = "start"; 80 constexpr char LIMIT[] = "limit"; 81 constexpr char DELTA[] = "delta"; 82 constexpr char SORTED[] = "sorted"; 83 constexpr char ADJ_ST[] = "adjoint_st"; 84 constexpr char ADJ_dT[] = "adjoint_dt"; 85 constexpr char REDUCTION[] = "reduction"; 86 constexpr char NONE[] = "none"; 87 constexpr char SUM[] = "sum"; 88 constexpr char MEAN[] = "mean"; 89 constexpr char BETA[] = "beta"; 90 constexpr char EXCLUSIVE[] = "exclusive"; 91 constexpr char REVERSE[] = "reverse"; 92 constexpr char PCR[] = "preprocess_collapse_repeated"; 93 constexpr char CTR[] = "ctc_merge_repeated"; 94 constexpr char ILOTI[] = "ignore_longer_outputs_than_inputs"; 95 constexpr char MOMENTUM[] = "momentum"; 96 constexpr char RHO[] = "rho"; 97 constexpr char EPSILON[] = "epsilon"; 98 constexpr char ALIGN_CORNERS[] = "align_corners"; 99 constexpr char PERIODS[] = "periods"; 100 constexpr char WINDOW[] = "window"; 101 constexpr char MIN_PERIODS[] = "min_periods"; 102 constexpr char CENTER[] = "center"; 103 constexpr char METHOD[] = "method"; 104 constexpr char CLOSED[] = "closed"; 105 constexpr char NA_OPTION[] = "na_option"; 106 constexpr char ASCENDING[] = "ascending"; 107 constexpr char PCT[] = "pct"; 108 constexpr char LOWER[] = "lower"; 109 constexpr char CLEAN[] = "clean"; 110 constexpr char TRANS[] = "trans"; 111 constexpr char MODE[] = "mode"; 112 constexpr char UNIT_DIAGONAL[] = "unit_diagonal"; 113 constexpr char C_EIEH_VECTOR[] = "compute_eigenvectors"; 114 constexpr char COMPUTE_V[] = "compute_v"; 115 constexpr char ADJOINT[] = "adjoint"; 116 constexpr char ALIGNMENT[] = "alignment"; 117 constexpr char NCHW[] = "NCHW"; 118 constexpr char NCDHW[] = "NCDHW"; 119 constexpr char USE_LOCKING[] = "use_locking"; 120 constexpr char OP[] = "op"; 121 constexpr char SET_OPERATION[] = "set_operation"; 122 123 constexpr size_t NC_LEN = 2; 124 constexpr size_t SHAPE_4D = 4; 125 constexpr size_t SHAPE_5D = 5; 126 constexpr size_t N_INDEX = 0; 127 constexpr size_t C_INDEX = 1; 128 constexpr size_t D_INDEX = 2; 129 constexpr size_t H_INDEX = 3; 130 constexpr size_t W_INDEX = 4; 131 132 struct ParallelSearchInfo { 133 double min_cost_time{DBL_MAX}; 134 double tmp_sum_cost_time{0.f}; 135 float best_block_size{0.f}; 136 size_t best_pow{0}; 137 size_t search_count{0}; 138 bool kernel_thread_num_set{false}; 139 size_t max_pow{6}; 140 }; 141 142 class BACKEND_EXPORT NativeCpuKernelMod : public CpuKernelMod { 143 public: 144 NativeCpuKernelMod() = default; 145 ~NativeCpuKernelMod() override = default; Launch(const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & workspace,const std::vector<KernelTensor * > & outputs,void *)146 bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace, 147 const std::vector<KernelTensor *> &outputs, void * /*stream_ptr*/) override { 148 return Launch(inputs, workspace, outputs); 149 } Launch(const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & workspace,const std::vector<KernelTensor * > & outputs)150 virtual bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace, 151 const std::vector<KernelTensor *> &outputs) { 152 return true; 153 } 154 // Must be called before Init. SetThreadPool(ThreadPool * pool)155 void SetThreadPool(ThreadPool *pool) { pool_ = pool; } 156 GetCpuSupportedList(const std::string & kernel_name)157 static std::vector<KernelAttr> GetCpuSupportedList(const std::string &kernel_name) { 158 auto temp_mod = kernel::Factory<NativeCpuKernelMod>::Instance().Create(kernel_name); 159 if (temp_mod == nullptr) { 160 MS_LOG(INFO) << "Not register CPU kernel of operator: " << kernel_name; 161 return std::vector<KernelAttr>{}; 162 } 163 return temp_mod->GetAllSupportedList(kernel_name); 164 } 165 GetOpSupport()166 std::vector<KernelAttr> GetOpSupport() override { return {}; } 167 GetKernelModType()168 enum KernelModType GetKernelModType() const override { return KernelModType::NativeCpuKernelMod; } 169 170 ParallelSearchInfo parallel_search_info_; 171 172 protected: 173 ThreadPool *pool_{nullptr}; 174 175 private: 176 std::vector<KernelAttr> GetAllSupportedList(const std::string &kernel_name); 177 std::vector<KernelAttr> GetSupportFromOpLib(const std::string &kernel_name) const; 178 inline static mindspore::HashMap<std::string, std::vector<KernelAttr>> support_map_; 179 }; 180 181 class CpuKernelFunc { 182 public: 183 CpuKernelFunc() = default; 184 virtual ~CpuKernelFunc() = default; 185 ///////////// new func /////////////// InitFunc(const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & outputs)186 virtual void InitFunc(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {} InitFunc(const PrimitivePtr & primitive,const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & outputs)187 virtual void InitFunc(const PrimitivePtr &primitive, const std::vector<KernelTensor *> &inputs, 188 const std::vector<KernelTensor *> &outputs) {} Resize(const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & outputs)189 virtual int Resize(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) { 190 return KRET_OK; 191 } RunFunc(const std::vector<KernelTensor * > & inputs,const std::vector<KernelTensor * > & workspace,const std::vector<KernelTensor * > & outputs)192 virtual bool RunFunc(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace, 193 const std::vector<KernelTensor *> &outputs) { 194 return true; 195 } 196 ///////////// old func /////////////// InitFunc(const BaseOperatorPtr & base_operator,const std::vector<KernelTensorPtr> & inputs,const std::vector<KernelTensorPtr> & outputs)197 virtual void InitFunc(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs, 198 const std::vector<KernelTensorPtr> &outputs) {} 199 virtual int Resize( 200 const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs, 201 const std::vector<KernelTensorPtr> &outputs, 202 const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost = std::map<uint32_t, tensor::TensorPtr>()) { 203 return KRET_OK; 204 } RunFunc(const std::vector<AddressPtr> & inputs,const std::vector<AddressPtr> & workspace,const std::vector<AddressPtr> & outputs)205 virtual bool RunFunc(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, 206 const std::vector<AddressPtr> &outputs) { 207 return true; 208 } 209 //////////////////////////////////// 210 ParallelSearchInfo parallel_search_info_; 211 }; 212 213 class CPUKernelUtils { 214 public: 215 static void ExpandDimsTo4(ShapeVector *shape); 216 static size_t CalcOffset(const ShapeVector &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3); 217 static size_t GetElementNumOnAxis(const ShapeVector &shape, int axis); 218 static void GetElementNumEveryDim(const ShapeVector &shape, std::vector<size_t> *element_num); 219 static void ParallelFor(const CTask &task, size_t count, float block_size = 128.0); 220 static ShapeVector FlatShapeByAxis(const ShapeVector &shape, int axis); 221 static ShapeVector GetBroadcastShape(const std::vector<int64_t> &x, const std::vector<int64_t> &y); 222 static void ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info); 223 template <typename T> CalcElementNum(const std::vector<T> & shape)224 inline static T CalcElementNum(const std::vector<T> &shape) { 225 T total = std::accumulate(shape.begin(), shape.end(), T(1), std::multiplies<T>()); 226 return total; 227 } 228 template <typename T> CalcSegmentIds(const T * segment_ids_data_addr,const size_t segment_ids_num)229 inline static std::vector<int64_t> CalcSegmentIds(const T *segment_ids_data_addr, const size_t segment_ids_num) { 230 std::vector<int64_t> segments; 231 int64_t seg_tmp = 1; 232 for (size_t i = 0; i < segment_ids_num - 1; ++i) { 233 if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) { 234 seg_tmp++; 235 } else { 236 segments.push_back(seg_tmp); 237 seg_tmp = 1; 238 } 239 const size_t last_loc = 2; 240 if (i == segment_ids_num - last_loc) { 241 segments.push_back(seg_tmp); 242 } 243 } 244 if (segment_ids_num == 1) { 245 segments.push_back(seg_tmp); 246 } 247 return segments; 248 } 249 }; 250 251 class BroadcastIterator { 252 public: 253 BroadcastIterator(ShapeVector input_shape_a, ShapeVector input_shape_b, ShapeVector output_shape); 254 virtual ~BroadcastIterator() = default; GetInputPosA()255 inline size_t GetInputPosA() const { return input_pos_[0]; } GetInputPosB()256 inline size_t GetInputPosB() const { return input_pos_[1]; } 257 void SetPos(size_t pos); 258 void GenNextPos(); 259 260 private: 261 void BroadcastShape(); 262 void InitStrides(); 263 264 ShapeVector coordinates_; 265 ShapeVector input_shape_a_; 266 ShapeVector input_shape_b_; 267 ShapeVector output_shape_; 268 ShapeVector input_strides_a_; 269 ShapeVector input_strides_b_; 270 ShapeVector input_back_strides_a_; 271 ShapeVector input_back_strides_b_; 272 std::array<size_t, 2> input_pos_{0}; 273 int output_dimension_{0}; 274 }; 275 276 void GetBroadCastIndex(const std::vector<size_t> &unaligned_input_shape, const std::vector<size_t> &output_shape, 277 std::vector<size_t> *index_list); 278 279 // Broadcast for multi_inputs and single output 280 class MultipleBroadcastIterator { 281 public: 282 using shape_info = ShapeVector; 283 MultipleBroadcastIterator(std::vector<shape_info> multi_inputs, shape_info output_shape); 284 virtual ~MultipleBroadcastIterator() = default; GetInputPos(size_t index)285 inline size_t GetInputPos(size_t index) const { return LongToSize(input_pos_[index]); } 286 void SetPos(size_t pos); 287 void GenNextPos(); 288 289 private: 290 void BroadcastShape(); 291 void InitStrides(); 292 293 shape_info coordinates_; 294 std::vector<shape_info> multi_inputs_; 295 shape_info output_shape_; 296 std::vector<shape_info> multi_inputs_strides_; 297 std::vector<shape_info> multi_inputs_back_strides_; 298 shape_info input_pos_; 299 int output_dimension_{0}; 300 }; 301 302 class TransposeIterator { 303 public: 304 TransposeIterator(ShapeVector output_shape, std::vector<size_t> axes, const ShapeVector &input_shape); 305 virtual ~TransposeIterator() = default; GetPos()306 inline size_t GetPos() const { return pos_; } 307 void SetPos(size_t pos); 308 void GenNextPos(); 309 310 private: 311 int dimension_{0}; 312 ShapeVector coordinates_; 313 ShapeVector shape_; 314 ShapeVector strides_; 315 ShapeVector back_strides_; 316 std::vector<size_t> axes_; 317 size_t pos_{0}; 318 }; 319 320 ActorThreadPool *GetActorMgrInnerThreadPool(); 321 void ParallelLaunch(const CTask &task, size_t count, float block_size = 128.0, Content content = nullptr, 322 ThreadPool *pool = nullptr); 323 void ParallelLaunch(const std::vector<common::Task> &tasks, Content content = nullptr, ThreadPool *pool = nullptr); 324 void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content, 325 ParallelSearchInfo *parallel_search_info, ThreadPool *pool = nullptr); 326 327 // Deal with pytorch style axis iteration, to iterate every value on specific axis 328 class AxisIterator { 329 public: 330 AxisIterator() = default; 331 virtual ~AxisIterator() = default; 332 void Init(const ShapeVector &input_shape, size_t axis); SetOffset(size_t index)333 inline void SetOffset(size_t index) { 334 size_t outer_index = index / inner_size_; 335 size_t inner_index = index % inner_size_; 336 axis_offset_ = outer_index * axis_size_ * inner_size_ + inner_index; 337 } 338 SetOffset(size_t outer_index,size_t inner_index)339 inline void SetOffset(size_t outer_index, size_t inner_index) { 340 axis_offset_ = outer_index * axis_size_ * inner_size_ + inner_index; 341 } GetPos(size_t i)342 inline size_t GetPos(size_t i) const { return axis_offset_ + i * inner_size_; } RevertPos(size_t i)343 inline size_t RevertPos(size_t i) const { return (i - axis_offset_) / inner_size_; } 344 OuterSize()345 inline size_t OuterSize() const { return outer_size_; } AxisSize()346 inline size_t AxisSize() const { return axis_size_; } InnerSize()347 inline size_t InnerSize() const { return inner_size_; } 348 349 private: 350 size_t outer_size_{0}; 351 size_t axis_size_{0}; 352 size_t inner_size_{0}; 353 size_t axis_offset_{0}; 354 }; 355 356 template <size_t Ndim> 357 class NdTensorIterator { 358 public: 359 template <typename... Indexes> NdTensorIterator(int64_t first_dim,Indexes...rest_dims)360 NdTensorIterator(int64_t first_dim, Indexes... rest_dims) 361 : dims_{{first_dim, rest_dims...}}, size_{(first_dim * ... * rest_dims)} { 362 static_assert(sizeof...(rest_dims) + 1 == Ndim, "Input dimensions should match Ndim"); 363 } 364 365 template <typename... Indexes> operator()366 int64_t operator()(const Indexes... dims) const { 367 static_assert(sizeof...(dims) == Ndim, "Input dimensions should match Ndim"); 368 return CalIndex(0, dims...); 369 } 370 371 template <typename... Indexes> at(const Indexes...dims)372 int64_t at(const Indexes... dims) const { 373 static_assert(sizeof...(dims) == Ndim, "Input dimensions should match Ndim"); 374 const int64_t index = CalIndex<true>(0, dims...); 375 if (index > size_) { 376 MS_LOG(ERROR) << "Pos " << index << " is larger than array size " << size_; 377 } 378 return index; 379 } 380 381 private: 382 template <bool CheckParam = false, typename... Indexes> CalIndex(const int64_t sum,const int64_t first_dim,const Indexes...rest_dims)383 int64_t CalIndex(const int64_t sum, const int64_t first_dim, const Indexes... rest_dims) const { 384 constexpr auto n = Ndim - sizeof...(rest_dims); 385 if constexpr (CheckParam) { 386 if (first_dim >= std::get<n - 1>(dims_)) { 387 MS_LOG(ERROR) << "Error on index " << (n - 1) << ", " << first_dim << " should be lower than " 388 << std::get<n - 1>(dims_); 389 } 390 } 391 return CalIndex<CheckParam>((sum + first_dim) * std::get<n>(dims_), rest_dims...); 392 } 393 394 template <bool CheckParam = false> CalIndex(const int64_t sum,const int64_t first_dim)395 int64_t CalIndex(const int64_t sum, const int64_t first_dim) const { 396 if constexpr (CheckParam) { 397 if (first_dim >= std::get<Ndim - 1>(dims_)) { 398 MS_LOG(ERROR) << "Error on index " << (Ndim - 1) << ", " << first_dim << " should be lower than " 399 << std::get<Ndim - 1>(dims_); 400 } 401 } 402 return sum + first_dim; 403 } 404 405 const std::array<int64_t, Ndim> dims_; 406 const int64_t size_; 407 }; 408 int Sign(float x); 409 } // namespace kernel 410 } // namespace mindspore 411 412 #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ 413