1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h"
18
19 namespace mindspore {
20 namespace kernel {
namespace {
// Expected operand counts for the L2NormalizeGrad kernel: the three inputs
// are (input_x, y, dout) — see Launch — and the single output is dx.
constexpr size_t kL2NormalizeGradInputsNum = 3;
constexpr size_t kL2NormalizeGradOutputsNum = 1;
}  // namespace
25
26 template <typename T>
InitKernel(const CNodePtr & kernel_node)27 void L2NormalizeGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
28 MS_EXCEPTION_IF_NULL(kernel_node);
29 kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
30 for (size_t i = 0; i < kL2NormalizeGradInputsNum; i++) {
31 (void)input_shape_list_.emplace_back(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i));
32 }
33 auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
34 CheckInputShape(output_shape);
35
36 int output_dim_length = output_shape.size();
37 dim_elem_num_list_.resize(output_dim_length, 1);
38 for (int i = output_dim_length - 2; i >= 0; i--) { // from -2 to 0 dim
39 dim_elem_num_list_[i] = output_shape[i + 1] * dim_elem_num_list_[i + 1];
40 }
41
42 int axis = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis"));
43 int input_dim_length = SizeToInt(input_shape_list_[0].size());
44 axis_ = axis < 0 ? (axis + input_dim_length) : axis;
45 epsilon_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, "epsilon"));
46 }
47
48 template <typename T>
Launch(const std::vector<AddressPtr> & inputs,const std::vector<AddressPtr> & workspace,const std::vector<AddressPtr> & outputs)49 bool L2NormalizeGradCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs,
50 const std::vector<AddressPtr> &workspace,
51 const std::vector<AddressPtr> &outputs) {
52 CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeGradInputsNum, kernel_name_);
53 CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeGradOutputsNum, kernel_name_);
54 auto input_x = reinterpret_cast<T *>(inputs[0]->addr);
55 auto y = reinterpret_cast<T *>(inputs[1]->addr);
56 auto dout = reinterpret_cast<T *>(inputs[2]->addr);
57 auto output = reinterpret_cast<T *>(outputs[0]->addr);
58 auto output_size = outputs[0]->size / sizeof(T);
59 auto task = [this, input_x, y, dout, output](size_t start, size_t end) {
60 for (size_t i = start; i < end; i++) {
61 std::vector<size_t> high_dim_index = OneDimIndexToHighDimIndex(i);
62 std::vector<T> input_x_vector = GetVector(high_dim_index, input_x);
63 std::vector<T> dout_vector = GetVector(high_dim_index, dout);
64 std::vector<T> y_vector = GetVector(high_dim_index, y);
65 GetOutput(input_x_vector, y_vector, dout_vector, high_dim_index, &output[i]);
66 }
67 };
68 CPUKernelUtils::ParallelFor(task, output_size);
69 return true;
70 }
71
72 template <typename T>
CheckInputShape(const std::vector<size_t> & output_shape)73 void L2NormalizeGradCPUKernel<T>::CheckInputShape(const std::vector<size_t> &output_shape) {
74 for (const auto &shape : input_shape_list_) {
75 if (output_shape != shape) {
76 MS_LOG(EXCEPTION) << "Input shape and output shape should be same.";
77 }
78 }
79 auto input_x_shape = input_shape_list_[0];
80 if (input_x_shape.size() != 0) {
81 if (std::any_of(input_x_shape.begin(), input_x_shape.end(), [](size_t i) { return i == 0; })) {
82 MS_LOG(EXCEPTION) << "L2NormalizeCPUKernel input is null.";
83 }
84 }
85 }
86
87 template <typename T>
OneDimIndexToHighDimIndex(size_t one_dim_index)88 std::vector<size_t> L2NormalizeGradCPUKernel<T>::OneDimIndexToHighDimIndex(size_t one_dim_index) {
89 std::vector<size_t> high_dim_index;
90 high_dim_index.reserve(dim_elem_num_list_.size());
91 for (const auto &item : dim_elem_num_list_) {
92 high_dim_index.push_back(one_dim_index / item);
93 one_dim_index %= item;
94 }
95 // referred to Copy elision https://en.cppreference.com/w/cpp/language/copy_elision
96 // returning a vector won't cause extra vector constructed or moved
97 return high_dim_index;
98 }
99
100 template <typename T>
HighDimIndexToOneDimIndex(size_t * one_dim_index,const std::vector<size_t> & high_dim_index)101 void L2NormalizeGradCPUKernel<T>::HighDimIndexToOneDimIndex(size_t *one_dim_index,
102 const std::vector<size_t> &high_dim_index) {
103 *one_dim_index = 0;
104 int len = high_dim_index.size();
105 for (int i = 0; i < len; i++) {
106 *one_dim_index += high_dim_index[i] * dim_elem_num_list_[i];
107 }
108 }
109
110 template <typename T>
GetVector(const std::vector<size_t> & high_dim_index,const T * x)111 std::vector<T> L2NormalizeGradCPUKernel<T>::GetVector(const std::vector<size_t> &high_dim_index, const T *x) {
112 auto x_shape = input_shape_list_[0];
113 std::vector<T> x_vector;
114 x_vector.reserve(x_shape[axis_]);
115 for (size_t i = 0; i < x_shape[axis_]; i++) {
116 size_t oneDimIndex = 0;
117 std::vector<size_t> tmp_high_dim_index = high_dim_index;
118 tmp_high_dim_index[axis_] = i;
119 HighDimIndexToOneDimIndex(&oneDimIndex, tmp_high_dim_index);
120 (void)x_vector.emplace_back(x[oneDimIndex]);
121 }
122 // referred to Copy elision https://en.cppreference.com/w/cpp/language/copy_elision
123 // returning a vector won't cause extra vector constructed or moved
124 return x_vector;
125 }
126
127 template <typename T>
GetSumOfProduct(const std::vector<T> & x_vector,const std::vector<T> & y_vector,T * ss)128 void L2NormalizeGradCPUKernel<T>::GetSumOfProduct(const std::vector<T> &x_vector, const std::vector<T> &y_vector,
129 T *ss) {
130 size_t len = x_vector.size();
131 std::vector<T> tmp_vector(len);
132 for (size_t i = 0; i < len; i++) {
133 tmp_vector[i] = x_vector[i] * y_vector[i];
134 }
135 const size_t half = 2;
136 if (len % half == 1) {
137 tmp_vector[0] += tmp_vector[len - 1];
138 }
139 for (size_t stride = len / half; stride > 0; stride >>= 1) {
140 for (size_t i = 0; i < stride; i++) {
141 tmp_vector[i] += tmp_vector[i + stride];
142 }
143 if (stride > half && stride % half == 1) {
144 tmp_vector[0] += tmp_vector[stride - 1];
145 }
146 }
147 *ss = tmp_vector[0];
148 }
149
150 template <typename T>
GetOutput(const std::vector<T> & input_x_vector,const std::vector<T> & y_vector,const std::vector<T> & dout_vector,const std::vector<size_t> & high_dim_index,T * output)151 void L2NormalizeGradCPUKernel<T>::GetOutput(const std::vector<T> &input_x_vector, const std::vector<T> &y_vector,
152 const std::vector<T> &dout_vector,
153 const std::vector<size_t> &high_dim_index, T *output) {
154 size_t axis_index = high_dim_index[axis_];
155 T dout = dout_vector[axis_index];
156 T y = y_vector[axis_index];
157 T tmp_sum1;
158 GetSumOfProduct(y_vector, dout_vector, &tmp_sum1);
159 T tmp_sum2;
160 GetSumOfProduct(input_x_vector, input_x_vector, &tmp_sum2);
161 tmp_sum2 = sqrt(tmp_sum2);
162 if (tmp_sum2 >= epsilon_) {
163 *output = (dout - y * tmp_sum1) / tmp_sum2;
164 } else {
165 *output = (dout - y * tmp_sum1) / epsilon_;
166 }
167 }
168 } // namespace kernel
169 } // namespace mindspore
170