From 7aa0a7bde120a88562e851b943d7682e5cf6ebf7 Mon Sep 17 00:00:00 2001
From: zhangyanhui <zhangyanhui17@huawei.com>
Date: Tue, 9 Apr 2024 19:45:33 +0800
Subject: [PATCH] auto-apply 0015-bugfix-for-cpu-kernel.patch

---
 .../cpu/kernel/nnacl/infer/where_infer.c      | 66 ++++++-------
 .../src/litert/kernel/cpu/fp32/where_fp32.cc  | 96 ++++++++++++++++---
 .../src/litert/kernel/cpu/fp32/where_fp32.h   |  2 +
 .../lite/tools/optimizer/fusion/glu_fusion.h  |  4 +-
 4 files changed, 116 insertions(+), 52 deletions(-)

diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c
index f6d4e1b2..c714627a 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/where_infer.c
@@ -17,18 +17,19 @@
 #include "nnacl/infer/where_infer.h"
 #include "nnacl/infer/infer_register.h"
 #include "nnacl/tensor_c_utils.h"
+#include "nnacl/infer/broadcast_to_infer.h"
 
-static size_t GetAxisout(const TensorC *input0, const TensorC *input1, const TensorC *input2, size_t index) {
-  if (input0->shape_[index] == input1->shape_[index] && input0->shape_[index] != input2->shape_[index]) {
-    return index;
+int WhereBroadCastInferShape(const int input_shape0_size, const int input_shape1_size, const int *input_shape0,
+                             const int *input_shape1, int *ndim, int *in_shape0, int *in_shape1, int *out_shape,
+                             bool *has_broad_cast) {
+  if (input_shape0_size > MAX_SHAPE_SIZE || input_shape1_size > MAX_SHAPE_SIZE) {
+    return NNACL_ERR;
   }
-  if (input0->shape_[index] == input2->shape_[index] && input0->shape_[index] != input1->shape_[index]) {
-    return index;
-  }
-  if (input1->shape_[index] == input2->shape_[index] && input0->shape_[index] != input1->shape_[index]) {
-    return index;
+  MakeUpInputShapes(input_shape0_size, input_shape1_size, input_shape0, input_shape1, ndim, in_shape0, in_shape1);
+  if (*ndim >= MAX_SHAPE_SIZE) {
+    return NNACL_INFER_INVALID;
   }
-  return MAX_SHAPE_SIZE + 1;
+  return BroadCastOutputShape(in_shape0, in_shape1, *ndim, out_shape, has_broad_cast);
 }
 
 int WhereInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
@@ -59,35 +60,28 @@ int WhereInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **
   if (!InferFlag(inputs, inputs_size)) {
     return NNACL_INFER_INVALID;
   }
-
-  int num = GetElementNum(input0);
-  int num1 = GetElementNum(input1);
-  int num2 = GetElementNum(input2);
-  int nummax = num > num1 ? num : (num1 > num2 ? num1 : num2);
-  size_t min_input_shape_size = input1->shape_size_ < input2->shape_size_ ? input1->shape_size_ : input2->shape_size_;
-  size_t axisout = MAX_SHAPE_SIZE + 1;
-  size_t temp = 0;
-  for (size_t j = 0; j < input0->shape_size_; j++) {
-    if (j >= MAX_SHAPE_SIZE) {
-      return NNACL_ERR;
-    }
-    if (j < min_input_shape_size) {
-      axisout = GetAxisout(input0, input1, input2, j);
-      if (axisout != MAX_SHAPE_SIZE + 1) {
-        break;
-      }
-    }
-    temp += 1;
-    if (temp == input0->shape_size_) {
-      SetShapeTensor(output, input);
-      return NNACL_OK;
-    }
+  int in_shape0[MAX_SHAPE_SIZE] = {0};
+  int in_shape1[MAX_SHAPE_SIZE] = {0};
+  int in_shape2[MAX_SHAPE_SIZE] = {0};
+  int output_shape[MAX_SHAPE_SIZE] = {0};
+  size_t input_shape0_size = input0->shape_size_;
+  size_t input_shape1_size = input1->shape_size_;
+  size_t input_shape2_size = input2->shape_size_;
+  const int *input_shape0 = input0->shape_;
+  const int *input_shape1 = input1->shape_;
+  const int *input_shape2 = input2->shape_;
+  int ndim = (int)input_shape0_size;
+  bool has_broad_cast_1 = false;
+  bool has_broad_cast_2 = false;
+  if (WhereBroadCastInferShape(input_shape0_size, input_shape1_size, input_shape0, input_shape1, &ndim, in_shape0,
+                               in_shape1, output_shape, &has_broad_cast_1) != NNACL_OK) {
+    return NNACL_ERR;
   }
-
-  ShapeSet(output->shape_, &output->shape_size_, input0->shape_, input0->shape_size_);
-  if (axisout != MAX_SHAPE_SIZE + 1) {
-    output->shape_[axisout] = nummax;
+  if (WhereBroadCastInferShape(ndim, input_shape2_size, output_shape, input_shape2, &ndim, in_shape0, in_shape2,
+                               output_shape, &has_broad_cast_2) != NNACL_OK) {
+    return NNACL_ERR;
   }
+  ShapeSet(output->shape_, &output->shape_size_, output_shape, ndim);
   return NNACL_OK;
 }
 
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc
index d7c987e3..a73fda7c 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.cc
@@ -20,6 +20,7 @@
 #include "src/litert/kernel_registry.h"
 #include "include/errorcode.h"
 #include "nnacl/common_func.h"
+#include "nnacl/base/broadcast_to.h"
 
 using mindspore::kernel::KERNEL_ARCH;
 using mindspore::lite::KernelRegistrar;
@@ -153,36 +154,58 @@ int WhereCPUKernel::RunWithSingleInput() {
 }
 
 int WhereCPUKernel::RunWithTripleInputs() {
-  auto condition = in_tensors_.at(0);
+  TensorC *condition = in_tensors_.at(0)->ConvertToTensorC();
   CHECK_NULL_RETURN(condition);
-  auto x = in_tensors_.at(1);
+  TensorC *x = in_tensors_.at(1)->ConvertToTensorC();
   CHECK_NULL_RETURN(x);
-  auto y = in_tensors_.at(C2NUM);
+  TensorC *y = in_tensors_.at(C2NUM)->ConvertToTensorC();
   CHECK_NULL_RETURN(y);
-  int condition_nums = condition->ElementsNum();
-  int x_num = x->ElementsNum();
-  int y_num = y->ElementsNum();
-  int out_num = out_tensors_.front()->ElementsNum();
+  TensorC *output = out_tensors_.at(0)->ConvertToTensorC();
+  CHECK_NULL_RETURN(output);
+  int condition_nums = GetElementNum(condition);
+  int x_num = GetElementNum(x);
+  int y_num = GetElementNum(y);
+  int out_num = GetElementNum(output);
 
-  condition_ = reinterpret_cast<bool *>(condition->data());
+  condition_ = reinterpret_cast<bool *>(condition->data_);
   CHECK_NULL_RETURN(condition_);
-  x_ = x->data();
+  x_ = x->data_;
   CHECK_NULL_RETURN(x_);
-  y_ = y->data();
+  y_ = y->data_;
   CHECK_NULL_RETURN(y_);
-  output_data_ = out_tensors_.at(0)->data();
+  output_data_ = output->data_;
   int num_max = condition_nums > x_num ? condition_nums : (x_num > y_num ? x_num : y_num);
   where_param_->condition_num_ = condition_nums;
   where_param_->x_num_ = x_num;
   where_param_->y_num_ = y_num;
   where_param_->max_num_ = num_max;
-
+  void *condition_broadcast_buf = nullptr;
+  void *x_broadcast_buf = nullptr;
+  void *y_broadcast_buf = nullptr;
   CHECK_LESS_RETURN(out_num, num_max);
 
   if (((condition_nums != 1) && (condition_nums != num_max)) || ((x_num != 1) && (x_num != num_max)) ||
       ((y_num != 1) && (y_num != num_max))) {
-    MS_LOG(ERROR) << "The length of three inputs are not equal to 1 or length of output, which is unacceptable";
-    return RET_ERROR;
+    if (condition_nums != GetElementNum(y)) {
+      int ret =
+        BroadcastForInput(condition, x, y, &condition_broadcast_buf, &x_broadcast_buf, &y_broadcast_buf, output);
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "BroadcastForInput failed.";
+        return RET_ERROR;
+      }
+      int max_num = GetElementNum(output);
+      condition_ = reinterpret_cast<bool *>(condition_broadcast_buf);
+      x_ = x_broadcast_buf;
+      y_ = y_broadcast_buf;
+      output_data_ = output->data_;
+      where_param_->condition_num_ = max_num;
+      where_param_->x_num_ = max_num;
+      where_param_->y_num_ = max_num;
+      where_param_->max_num_ = max_num;
+    } else {
+      MS_LOG(ERROR) << "The length of three inputs are not equal to 1 or length of output, which is unacceptable";
+      return RET_ERROR;
+    }
   }
   if (num_max <= 0) {
     MS_LOG(ERROR) << "Error, inputs' length are zero !!!";
@@ -193,6 +216,9 @@ int WhereCPUKernel::RunWithTripleInputs() {
     MS_LOG(ERROR) << "WhereDwRun error: error_code[" << ret << "]";
     return RET_ERROR;
   }
+  ms_context_->allocator->Free(condition_broadcast_buf);
+  ms_context_->allocator->Free(x_broadcast_buf);
+  ms_context_->allocator->Free(y_broadcast_buf);
   return RET_OK;
 }
 
@@ -214,6 +240,48 @@ int WhereCPUKernel::Run() {
   return ret;
 }
 
+int WhereCPUKernel::BroadcastForInput(TensorC *condition, TensorC *x, TensorC *y, void **condition_broadcast_buf,
+                                      void **x_broadcast_buf, void **y_broadcast_buf, TensorC *output) {
+  size_t broad_cast_buf_size = GetSize(output);
+  BroadcastShapeInfo condition_info;
+  condition_info.input_shape_size_ = condition->shape_size_;
+  condition_info.output_shape_size_ = output->shape_size_;
+  (void)memcpy(condition_info.input_shape_, condition->shape_, condition->shape_size_ * sizeof(int));
+  (void)memcpy(condition_info.output_shape_, output->shape_, output->shape_size_ * sizeof(int));
+  BroadcastShapeInfo x_info;
+  x_info.input_shape_size_ = x->shape_size_;
+  x_info.output_shape_size_ = output->shape_size_;
+  (void)memcpy(x_info.input_shape_, x->shape_, x->shape_size_ * sizeof(int));
+  (void)memcpy(x_info.output_shape_, output->shape_, output->shape_size_ * sizeof(int));
+  BroadcastShapeInfo y_info;
+  y_info.input_shape_size_ = y->shape_size_;
+  y_info.output_shape_size_ = output->shape_size_;
+  (void)memcpy(y_info.input_shape_, y->shape_, y->shape_size_ * sizeof(int));
+  (void)memcpy(y_info.output_shape_, output->shape_, output->shape_size_ * sizeof(int));
+
+  *condition_broadcast_buf = ms_context_->allocator->Malloc(broad_cast_buf_size);
+  CHECK_NULL_RETURN(*condition_broadcast_buf);
+  BroadcastToSize8(condition->data_, &condition_info, *condition_broadcast_buf);
+
+  *x_broadcast_buf = ms_context_->allocator->Malloc(broad_cast_buf_size);
+  if (*x_broadcast_buf == nullptr) {
+    ms_context_->allocator->Free(*condition_broadcast_buf);
+    MS_LOG(ERROR) << "malloc x_broadcast_buf error";
+    return RET_ERROR;
+  }
+  BroadcastToSize32(x->data_, &x_info, *x_broadcast_buf);
+
+  *y_broadcast_buf = ms_context_->allocator->Malloc(broad_cast_buf_size);
+  if (*y_broadcast_buf == nullptr) {
+    ms_context_->allocator->Free(*condition_broadcast_buf);
+    ms_context_->allocator->Free(*x_broadcast_buf);
+    MS_LOG(ERROR) << "malloc y_broadcast_buf error";
+    return RET_ERROR;
+  }
+  BroadcastToSize32(y->data_, &y_info, *y_broadcast_buf);
+  return RET_OK;
+}
+
 REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Where, LiteKernelCreator<WhereCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Where, LiteKernelCreator<WhereCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeBool, PrimitiveType_Where, LiteKernelCreator<WhereCPUKernel>)
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h
index 0d785732..ae6e3eba 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/where_fp32.h
@@ -51,6 +51,8 @@ class WhereCPUKernel : public LiteKernel {
  private:
   int RunWithSingleInput();
   int RunWithTripleInputs();
+  int BroadcastForInput(TensorC *condition, TensorC *x, TensorC *y, void **condition_broadcast_buf,
+                        void **x_broadcast_buf, void **y_broadcast_buf, TensorC *output);
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_WHERE_FP32_H_
diff --git a/mindspore/lite/tools/optimizer/fusion/glu_fusion.h b/mindspore/lite/tools/optimizer/fusion/glu_fusion.h
index 5e6a7e79..513a49d9 100644
--- a/mindspore/lite/tools/optimizer/fusion/glu_fusion.h
+++ b/mindspore/lite/tools/optimizer/fusion/glu_fusion.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021 Huawei Technologies Co., Ltd
+ * Copyright 2021~2024 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ namespace mindspore {
 namespace opt {
 class GLUFusion : public LitePatternProcessPass {
  public:
-  explicit GLUFusion(const std::string &name = "glu_fusion", bool multigraph = true)
+  explicit GLUFusion(const std::string &name = "GLUFusion", bool multigraph = true)
       : LitePatternProcessPass(name, multigraph) {}
 
   ~GLUFusion() override = default;
-- 
2.25.1