From a4c343574d6d6998a6f1b95f436401c8eb8a2c90 Mon Sep 17 00:00:00 2001
From: zhangyanhui <zhangyanhui17@huawei.com>
Date: Mon, 1 Jul 2024 21:12:15 +0800
Subject: [PATCH] auto-apply 0015-bugfix-for-cpu-kernel.patch

Replace the axis-based shape inference of the Where operator with a
generic two-pass broadcast inference, and let the fp32 Where kernel
broadcast condition/x/y to the output shape when their element counts
differ. Also propagate min_val_/max_val_ from ClipParameter in the clip
kernel, move the PRelu slope-size check from Prepare() to ReSize(), and
align the GLUFusion pass name with its class name.
---
 .../cpu/kernel/nnacl/infer/where_infer.c      | 66 ++++++-------
 .../device/cpu/kernel/nnacl/kernel/clip.c     |  2 +
 .../src/litert/kernel/cpu/fp32/prelu_fp32.cc  | 12 +--
 .../src/litert/kernel/cpu/fp32/where_fp32.cc  | 96 ++++++++++++++++---
 .../src/litert/kernel/cpu/fp32/where_fp32.h   |  2 +
 .../lite/tools/optimizer/fusion/glu_fusion.h  |  4 +-
 6 files changed, 124 insertions(+), 58 deletions(-)

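Note (placed in the notes area after the "---" separator, which patch tools
conventionally skip; not part of the change itself): the patched
WhereInferShape resolves the output shape in two passes -- it broadcasts the
condition shape against x, then broadcasts that intermediate shape against y.
The standalone sketch below only illustrates that composition; BroadcastPair
and the sample shapes are hypothetical stand-ins, not the nnacl
MakeUpInputShapes/BroadCastOutputShape APIs used by the patch.

/* Minimal, self-contained sketch of two-pass broadcast shape inference. */
#include <stdio.h>

#define MAX_DIMS 8

/* Right-align the two shapes and broadcast them dimension by dimension. */
static int BroadcastPair(const int *a, int a_size, const int *b, int b_size,
                         int *out, int *out_size) {
  int ndim = a_size > b_size ? a_size : b_size;
  if (ndim > MAX_DIMS) {
    return -1;
  }
  for (int i = 0; i < ndim; ++i) {
    int da = i < ndim - a_size ? 1 : a[i - (ndim - a_size)];
    int db = i < ndim - b_size ? 1 : b[i - (ndim - b_size)];
    if (da != db && da != 1 && db != 1) {
      return -1;  /* incompatible dimensions */
    }
    out[i] = da > db ? da : db;
  }
  *out_size = ndim;
  return 0;
}

int main(void) {
  int cond[] = {3, 1}, x[] = {1, 4}, y[] = {2, 3, 4};
  int tmp[MAX_DIMS], tmp_size, out[MAX_DIMS], out_size;
  /* Pass 1: condition vs x; pass 2: intermediate result vs y. */
  if (BroadcastPair(cond, 2, x, 2, tmp, &tmp_size) != 0 ||
      BroadcastPair(tmp, tmp_size, y, 3, out, &out_size) != 0) {
    fprintf(stderr, "shapes are not broadcastable\n");
    return 1;
  }
  for (int i = 0; i < out_size; ++i) {
    printf("%d ", out[i]);  /* prints: 2 3 4 */
  }
  printf("\n");
  return 0;
}
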
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c
index f6d4e1b2..c714627a 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c
@@ -17,18 +17,19 @@
 #include "nnacl/infer/where_infer.h"
 #include "nnacl/infer/infer_register.h"
 #include "nnacl/tensor_c_utils.h"
+#include "nnacl/infer/broadcast_to_infer.h"
 
-static size_t GetAxisout(const TensorC *input0, const TensorC *input1, const TensorC *input2, size_t index) {
-  if (input0->shape_[index] == input1->shape_[index] && input0->shape_[index] != input2->shape_[index]) {
-    return index;
+int WhereBroadCastInferShape(const int input_shape0_size, const int input_shape1_size, const int *input_shape0,
+                             const int *input_shape1, int *ndim, int *in_shape0, int *in_shape1, int *out_shape,
+                             bool *has_broad_cast) {
+  if (input_shape0_size > MAX_SHAPE_SIZE || input_shape1_size > MAX_SHAPE_SIZE) {
+    return NNACL_ERR;
   }
-  if (input0->shape_[index] == input2->shape_[index] && input0->shape_[index] != input1->shape_[index]) {
-    return index;
-  }
-  if (input1->shape_[index] == input2->shape_[index] && input0->shape_[index] != input1->shape_[index]) {
-    return index;
+  MakeUpInputShapes(input_shape0_size, input_shape1_size, input_shape0, input_shape1, ndim, in_shape0, in_shape1);
+  if (*ndim >= MAX_SHAPE_SIZE) {
+    return NNACL_INFER_INVALID;
   }
-  return MAX_SHAPE_SIZE + 1;
+  return BroadCastOutputShape(in_shape0, in_shape1, *ndim, out_shape, has_broad_cast);
 }
 
 int WhereInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
@@ -59,35 +60,28 @@ int WhereInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **
   if (!InferFlag(inputs, inputs_size)) {
     return NNACL_INFER_INVALID;
   }
-
-  int num = GetElementNum(input0);
-  int num1 = GetElementNum(input1);
-  int num2 = GetElementNum(input2);
-  int nummax = num > num1 ? num : (num1 > num2 ? num1 : num2);
-  size_t min_input_shape_size = input1->shape_size_ < input2->shape_size_ ? input1->shape_size_ : input2->shape_size_;
-  size_t axisout = MAX_SHAPE_SIZE + 1;
-  size_t temp = 0;
-  for (size_t j = 0; j < input0->shape_size_; j++) {
-    if (j >= MAX_SHAPE_SIZE) {
-      return NNACL_ERR;
-    }
-    if (j < min_input_shape_size) {
-      axisout = GetAxisout(input0, input1, input2, j);
-      if (axisout != MAX_SHAPE_SIZE + 1) {
-        break;
-      }
-    }
-    temp += 1;
-    if (temp == input0->shape_size_) {
-      SetShapeTensor(output, input);
-      return NNACL_OK;
-    }
+  int in_shape0[MAX_SHAPE_SIZE] = {0};
+  int in_shape1[MAX_SHAPE_SIZE] = {0};
+  int in_shape2[MAX_SHAPE_SIZE] = {0};
+  int output_shape[MAX_SHAPE_SIZE] = {0};
+  size_t input_shape0_size = input0->shape_size_;
+  size_t input_shape1_size = input1->shape_size_;
+  size_t input_shape2_size = input2->shape_size_;
+  const int *input_shape0 = input0->shape_;
+  const int *input_shape1 = input1->shape_;
+  const int *input_shape2 = input2->shape_;
+  int ndim = (int)input_shape0_size;
+  bool has_broad_cast_1 = false;
+  bool has_broad_cast_2 = false;
+  if (WhereBroadCastInferShape(input_shape0_size, input_shape1_size, input_shape0, input_shape1, &ndim, in_shape0,
+                               in_shape1, output_shape, &has_broad_cast_1) != NNACL_OK) {
+    return NNACL_ERR;
   }
-
-  ShapeSet(output->shape_, &output->shape_size_, input0->shape_, input0->shape_size_);
-  if (axisout != MAX_SHAPE_SIZE + 1) {
-    output->shape_[axisout] = nummax;
+  if (WhereBroadCastInferShape(ndim, input_shape2_size, output_shape, input_shape2, &ndim, in_shape0, in_shape2,
+                               output_shape, &has_broad_cast_2) != NNACL_OK) {
+    return NNACL_ERR;
   }
+  ShapeSet(output->shape_, &output->shape_size_, output_shape, ndim);
   return NNACL_OK;
 }
 
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/kernel/clip.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/kernel/clip.c
index ece0eff0..ae8ac5d8 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/kernel/clip.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/kernel/clip.c
@@ -81,6 +81,8 @@ int ClipCompute(struct KernelBase *self) {
   NNACL_CHECK_NULL_RETURN_ERR(clip);
   ClipParameter *param = (ClipParameter *)clip->base_.param_;
   NNACL_CHECK_NULL_RETURN_ERR(param);
+  clip->min_val_ = param->min_val_;
+  clip->max_val_ = param->max_val_;
 
   int ret = NNACL_OK;
   if (clip->base_.in_size_ > ONE_TENSOR) {
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/prelu_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/prelu_fp32.cc
index cae491f5..74639503 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/prelu_fp32.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/prelu_fp32.cc
@@ -44,12 +44,6 @@ int PReluCPUKernel::Prepare() {
   CHECK_NULL_RETURN(in_tensors_[kInputIndex]);
   CHECK_NULL_RETURN(in_tensors_[kSlopeIndex]);
   CHECK_NULL_RETURN(out_tensors_[kOutputIndex]);
-  auto slope_shapes = in_tensors_[C1NUM]->ElementsNum();
-  auto input_channel = in_tensors_[C0NUM]->Channel();
-  if ((slope_shapes != C1NUM) && (slope_shapes != input_channel)) {
-    MS_LOG(ERROR) << "slope_shapes: " << slope_shapes << " is not equal to 1 or input_channel: " << input_channel;
-    return lite::RET_ERROR;
-  }
   if (in_tensors_[1]->ElementsNum() == 1) {
     param_->channelShared = true;
   } else {
@@ -83,6 +77,12 @@ int PReluCPUKernel::DoExcute(int task_id) const {
 }
 
 int PReluCPUKernel::ReSize() {
+  auto slope_shapes = in_tensors_[C1NUM]->ElementsNum();
+  auto input_channel = in_tensors_[C0NUM]->Channel();
+  if ((slope_shapes != C1NUM) && (slope_shapes != input_channel)) {
+    MS_LOG(ERROR) << "slope_shapes: " << slope_shapes << " is not equal to 1 or input_channel: " << input_channel;
+    return lite::RET_ERROR;
+  }
   auto &input = in_tensors_[kInputIndex];
   param_->input_num_ = input->ElementsNum();
   CHECK_NOT_EQUAL_RETURN(out_tensors_.front()->ElementsNum(), param_->input_num_);
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc
index d7c987e3..a73fda7c 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc
@@ -20,6 +20,7 @@
 #include "src/litert/kernel_registry.h"
 #include "include/errorcode.h"
 #include "nnacl/common_func.h"
+#include "nnacl/base/broadcast_to.h"
 
 using mindspore::kernel::KERNEL_ARCH;
 using mindspore::lite::KernelRegistrar;
@@ -153,36 +154,58 @@ int WhereCPUKernel::RunWithSingleInput() {
 }
 
 int WhereCPUKernel::RunWithTripleInputs() {
-  auto condition = in_tensors_.at(0);
+  TensorC *condition = in_tensors_.at(0)->ConvertToTensorC();
   CHECK_NULL_RETURN(condition);
-  auto x = in_tensors_.at(1);
+  TensorC *x = in_tensors_.at(1)->ConvertToTensorC();
   CHECK_NULL_RETURN(x);
-  auto y = in_tensors_.at(C2NUM);
+  TensorC *y = in_tensors_.at(C2NUM)->ConvertToTensorC();
   CHECK_NULL_RETURN(y);
-  int condition_nums = condition->ElementsNum();
-  int x_num = x->ElementsNum();
-  int y_num = y->ElementsNum();
-  int out_num = out_tensors_.front()->ElementsNum();
+  TensorC *output = out_tensors_.at(0)->ConvertToTensorC();
+  CHECK_NULL_RETURN(output);
+  int condition_nums = GetElementNum(condition);
+  int x_num = GetElementNum(x);
+  int y_num = GetElementNum(y);
+  int out_num = GetElementNum(output);
 
-  condition_ = reinterpret_cast<bool *>(condition->data());
+  condition_ = reinterpret_cast<bool *>(condition->data_);
   CHECK_NULL_RETURN(condition_);
-  x_ = x->data();
+  x_ = x->data_;
   CHECK_NULL_RETURN(x_);
-  y_ = y->data();
+  y_ = y->data_;
   CHECK_NULL_RETURN(y_);
-  output_data_ = out_tensors_.at(0)->data();
+  output_data_ = output->data_;
   int num_max = condition_nums > x_num ? condition_nums : (x_num > y_num ? x_num : y_num);
   where_param_->condition_num_ = condition_nums;
   where_param_->x_num_ = x_num;
   where_param_->y_num_ = y_num;
   where_param_->max_num_ = num_max;
-
+  void *condition_broadcast_buf = nullptr;
+  void *x_broadcast_buf = nullptr;
+  void *y_broadcast_buf = nullptr;
   CHECK_LESS_RETURN(out_num, num_max);
 
   if (((condition_nums != 1) && (condition_nums != num_max)) || ((x_num != 1) && (x_num != num_max)) ||
       ((y_num != 1) && (y_num != num_max))) {
-    MS_LOG(ERROR) << "The length of three inputs are not equal to 1 or length of output, which is unacceptable";
-    return RET_ERROR;
+    if (condition_nums != GetElementNum(y)) {
+      int ret =
+        BroadcastForInput(condition, x, y, &condition_broadcast_buf, &x_broadcast_buf, &y_broadcast_buf, output);
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "BroadcastForInput failed.";
+        return RET_ERROR;
+      }
+      int max_num = GetElementNum(output);
+      condition_ = reinterpret_cast<bool *>(condition_broadcast_buf);
+      x_ = x_broadcast_buf;
+      y_ = y_broadcast_buf;
+      output_data_ = output->data_;
+      where_param_->condition_num_ = max_num;
+      where_param_->x_num_ = max_num;
+      where_param_->y_num_ = max_num;
+      where_param_->max_num_ = max_num;
+    } else {
+      MS_LOG(ERROR) << "The length of three inputs are not equal to 1 or length of output, which is unacceptable";
+      return RET_ERROR;
+    }
   }
   if (num_max <= 0) {
     MS_LOG(ERROR) << "Error, inputs' length are zero !!!";
@@ -193,6 +216,9 @@ int WhereCPUKernel::RunWithTripleInputs() {
     MS_LOG(ERROR) << "WhereDwRun error: error_code[" << ret << "]";
     return RET_ERROR;
   }
+  ms_context_->allocator->Free(condition_broadcast_buf);
+  ms_context_->allocator->Free(x_broadcast_buf);
+  ms_context_->allocator->Free(y_broadcast_buf);
   return RET_OK;
 }
 
@@ -214,6 +240,48 @@ int WhereCPUKernel::Run() {
   return ret;
 }
 
+int WhereCPUKernel::BroadcastForInput(TensorC *condition, TensorC *x, TensorC *y, void **condition_broadcast_buf,
+                                      void **x_broadcast_buf, void **y_broadcast_buf, TensorC *output) {
+  size_t broad_cast_buf_size = GetSize(output);
+  BroadcastShapeInfo condition_info;
+  condition_info.input_shape_size_ = condition->shape_size_;
+  condition_info.output_shape_size_ = output->shape_size_;
+  (void)memcpy(condition_info.input_shape_, condition->shape_, condition->shape_size_ * sizeof(int));
+  (void)memcpy(condition_info.output_shape_, output->shape_, output->shape_size_ * sizeof(int));
+  BroadcastShapeInfo x_info;
+  x_info.input_shape_size_ = x->shape_size_;
+  x_info.output_shape_size_ = output->shape_size_;
+  (void)memcpy(x_info.input_shape_, x->shape_, x->shape_size_ * sizeof(int));
+  (void)memcpy(x_info.output_shape_, output->shape_, output->shape_size_ * sizeof(int));
+  BroadcastShapeInfo y_info;
+  y_info.input_shape_size_ = y->shape_size_;
+  y_info.output_shape_size_ = output->shape_size_;
+  (void)memcpy(y_info.input_shape_, y->shape_, y->shape_size_ * sizeof(int));
+  (void)memcpy(y_info.output_shape_, output->shape_, output->shape_size_ * sizeof(int));
+
+  *condition_broadcast_buf = ms_context_->allocator->Malloc(broad_cast_buf_size);
+  CHECK_NULL_RETURN(*condition_broadcast_buf);
+  BroadcastToSize8(condition->data_, &condition_info, *condition_broadcast_buf);
+
+  *x_broadcast_buf = ms_context_->allocator->Malloc(broad_cast_buf_size);
+  if (*x_broadcast_buf == nullptr) {
+    ms_context_->allocator->Free(*condition_broadcast_buf);
+    MS_LOG(ERROR) << "malloc x_broadcast_buf error";
+    return RET_ERROR;
+  }
+  BroadcastToSize32(x->data_, &x_info, *x_broadcast_buf);
+
+  *y_broadcast_buf = ms_context_->allocator->Malloc(broad_cast_buf_size);
+  if (*y_broadcast_buf == nullptr) {
+    ms_context_->allocator->Free(*condition_broadcast_buf);
+    ms_context_->allocator->Free(*x_broadcast_buf);
+    MS_LOG(ERROR) << "malloc y_broadcast_buf error";
+    return RET_ERROR;
+  }
+  BroadcastToSize32(y->data_, &y_info, *y_broadcast_buf);
+  return RET_OK;
+}
+
 REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Where, LiteKernelCreator<WhereCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Where, LiteKernelCreator<WhereCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeBool, PrimitiveType_Where, LiteKernelCreator<WhereCPUKernel>)
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h
index 0d785732..ae6e3eba 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h
@@ -51,6 +51,8 @@ class WhereCPUKernel : public LiteKernel {
  private:
   int RunWithSingleInput();
   int RunWithTripleInputs();
+  int BroadcastForInput(TensorC *condition, TensorC *x, TensorC *y, void **condition_broadcast_buf,
+                        void **x_broadcast_buf, void **y_broadcast_buf, TensorC *output);
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_WHERE_FP32_H_
diff --git a/mindspore/lite/tools/optimizer/fusion/glu_fusion.h b/mindspore/lite/tools/optimizer/fusion/glu_fusion.h
index 5e6a7e79..513a49d9 100644
--- a/mindspore/lite/tools/optimizer/fusion/glu_fusion.h
+++ b/mindspore/lite/tools/optimizer/fusion/glu_fusion.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021 Huawei Technologies Co., Ltd
+ * Copyright 2021~2024 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ namespace mindspore {
 namespace opt {
 class GLUFusion : public LitePatternProcessPass {
  public:
-  explicit GLUFusion(const std::string &name = "glu_fusion", bool multigraph = true)
+  explicit GLUFusion(const std::string &name = "GLUFusion", bool multigraph = true)
       : LitePatternProcessPass(name, multigraph) {}
 
   ~GLUFusion() override = default;
-- 
2.25.1
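
Note (appended after the version trailer; not part of the patch that gets
applied): the kernel-side change in where_fp32.cc expands condition/x/y into
output-shaped temporary buffers and then runs the usual element-wise select.
The standalone sketch below shows that expand-then-select idea under that
assumption; ExpandToOutput and the sample data are hypothetical, not the
nnacl BroadcastToSize8/BroadcastToSize32 routines, and real code would also
handle allocation failures and free the buffers as the patch does.

/* Minimal sketch: broadcast each input to the output shape, then select. */
#include <stdio.h>

#define MAX_DIMS 8

/* Copy `src` (shape right-aligned against `out_shape`) into an output-sized
 * buffer, repeating values along dimensions where src has size 1. */
static void ExpandToOutput(const float *src, const int *src_shape, int src_dims,
                           const int *out_shape, int out_dims, float *dst) {
  int out_size = 1;
  for (int i = 0; i < out_dims; ++i) out_size *= out_shape[i];
  for (int idx = 0; idx < out_size; ++idx) {
    /* Decompose the flat output index into coordinates, then map each
     * coordinate back onto src (coordinate 0 wherever src dim == 1). */
    int coord[MAX_DIMS];
    int rem = idx, src_idx = 0, src_stride = 1;
    for (int d = out_dims - 1; d >= 0; --d) {
      coord[d] = rem % out_shape[d];
      rem /= out_shape[d];
    }
    for (int d = src_dims - 1, od = out_dims - 1; d >= 0; --d, --od) {
      int c = src_shape[d] == 1 ? 0 : coord[od];
      src_idx += c * src_stride;
      src_stride *= src_shape[d];
    }
    dst[idx] = src[src_idx];
  }
}

int main(void) {
  /* condition: [2,1], x: [1,3], y: [2,3]  ->  output: [2,3] */
  float cond[] = {1.0f, 0.0f};
  float x[] = {10.f, 20.f, 30.f};
  float y[] = {-1.f, -2.f, -3.f, -4.f, -5.f, -6.f};
  int cond_shape[] = {2, 1}, x_shape[] = {1, 3}, y_shape[] = {2, 3};
  int out_shape[] = {2, 3}, out_size = 6;
  float cond_b[6], x_b[6], y_b[6], out[6];
  ExpandToOutput(cond, cond_shape, 2, out_shape, 2, cond_b);
  ExpandToOutput(x, x_shape, 2, out_shape, 2, x_b);
  ExpandToOutput(y, y_shape, 2, out_shape, 2, y_b);
  for (int i = 0; i < out_size; ++i) {
    out[i] = cond_b[i] != 0.0f ? x_b[i] : y_b[i];  /* element-wise Where */
  }
  for (int i = 0; i < out_size; ++i) {
    printf("%g ", out[i]);  /* prints: 10 20 30 -4 -5 -6 */
  }
  printf("\n");
  return 0;
}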