/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "backend/kernel_compiler/cpu/sparse_apply_lazy_adam_cpu_kernel.h"
#include <cmath>  // std::sqrt
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSparseApplyLazyAdamInputsNum = 11;
constexpr size_t kSparseApplyLazyAdamWorkspaceSize = 4;

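// Applies the lazy Adam update to the rows of var/m/v selected by the reduced
// sparse gradient, for unique indices in [start, end). For each touched row:
//   m <- beta1 * m + (1 - beta1) * g
//   v <- beta2 * v + (1 - beta2) * g^2
//   var <- var - lr * m / (sqrt(v) + epsilon)                               (default)
//   var <- var - lr * (beta1 * m + (1 - beta1) * g) / (sqrt(v) + epsilon)   (Nesterov)
// Rows not referenced by any index are left untouched, which is what makes the
// update "lazy". lr is expected to already carry the Adam bias correction
// applied in LaunchKernel.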
template <typename T>
void ComputeLazyAdam(MultiThreadComputeParams<T> *input_params, size_t start, size_t end) {
  MS_EXCEPTION_IF_NULL(input_params);
  auto var = input_params->var_;
  auto m = input_params->m_;
  auto v = input_params->v_;
  const auto lr = input_params->lr_;
  const auto beta1 = input_params->beta1_;
  const auto beta2 = input_params->beta2_;
  const auto epsilon = input_params->epsilon_;
  const auto use_nesterov = input_params->use_nesterov_;
  const auto unique_sparse_grad = input_params->sparse_grad_;
  const auto var_first_dim_size = input_params->var_first_dim_size_;
  const auto var_outer_dim_size = input_params->var_outer_dim_size_;
  for (size_t i = start; i < end; ++i) {
    T index = unique_sparse_grad.indices_[i];
    if (index < 0 || LongToSize(index) >= var_first_dim_size) {
      MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range";
    }
    size_t start_index = var_outer_dim_size * static_cast<size_t>(index);
    size_t end_index = start_index + var_outer_dim_size;
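    // Walk the slice of var/m/v selected by this index, paired element-wise
    // with the i-th row of the reduced gradient values.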
    for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
      auto summed_grad = unique_sparse_grad.value_[k];
      m[j] = beta1 * m[j] + (1 - beta1) * summed_grad;
      v[j] = beta2 * v[j] + (1 - beta2) * summed_grad * summed_grad;
      if (use_nesterov) {
        var[j] -= lr * (m[j] * beta1 + (1 - beta1) * summed_grad) / (std::sqrt(v[j]) + epsilon);
      } else {
        var[j] -= lr * m[j] / (std::sqrt(v[j]) + epsilon);
      }
    }
  }
}
}  // namespace

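// Four workspace buffers back the sparse-gradient reduction in LaunchKernel:
// [0] reduced gradient values, [1] reduced (unique) indices, and [2]/[3] the
// scratch values/indices used by BucketReduceSparseGradient. Gradient values
// are float; the index width depends on the indices dtype T.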
template <typename T>
void SparseApplyLazyAdamCPUKernel::InitWorkspaceSize() {
  (void)workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  (void)workspace_size_list_.emplace_back(indices_size_ * sizeof(T));
  (void)workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  (void)workspace_size_list_.emplace_back(indices_size_ * sizeof(T));
}

void SparseApplyLazyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  if (indices_data_type_ == kNumberTypeInt32) {
    InitWorkspaceSize<int>();
  } else if (indices_data_type_ == kNumberTypeInt64) {
    InitWorkspaceSize<int64_t>();
  } else {
    MS_LOG(EXCEPTION) << "Unsupported indices data type: " << indices_data_type_;
  }
}

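// Input layout (see LaunchKernel): 0 var, 1 m, 2 v, 3 beta1_power,
// 4 beta2_power, 5 lr, 6 beta1, 7 beta2, 8 epsilon, 9 grad, 10 indices.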
void SparseApplyLazyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  std::vector<size_t> var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  std::vector<size_t> m_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  std::vector<size_t> v_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
  std::vector<size_t> grad_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 9);
  std::vector<size_t> indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 10);
  if (var_shape.empty()) {
    MS_LOG(EXCEPTION) << "var must be at least 1D";
  }
  if (!IsSameShape(var_shape, m_shape)) {
    MS_LOG(EXCEPTION) << "var and m must have the same shape";
  }
  if (!IsSameShape(var_shape, v_shape)) {
    MS_LOG(EXCEPTION) << "var and v must have the same shape";
  }
  if (var_shape.size() != grad_shape.size()) {
    MS_LOG(EXCEPTION) << "var and grad must have the same number of dimensions";
  }

  var_first_dim_size_ = var_shape[0];
  for (size_t i = 1; i < var_shape.size(); ++i) {
    if (var_shape[i] != grad_shape[i]) {
      MS_LOG(EXCEPTION) << "var and grad must have the same size in dimension " << i;
    }
    var_outer_dim_size_ *= var_shape[i];
  }
  if (indices_shape.size() != 1) {
    MS_LOG(EXCEPTION) << "indices must be 1D";
  }
  indices_size_ = indices_shape[0];
  if (grad_shape[0] != indices_size_) {
    MS_LOG(EXCEPTION) << "The first dimension of grad must match the size of indices";
  }
  if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) {
    use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, USE_NESTEROV);
  }
  indices_data_type_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 10);
}

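// The float32 update path: deduplicate the sparse gradient, fold the Adam bias
// correction into lr, then apply the per-row update across multiple threads.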
template <typename T>
void SparseApplyLazyAdamCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
                                                const std::vector<kernel::AddressPtr> &workspace) const {
  auto *var = reinterpret_cast<float *>(inputs[0]->addr);
  auto *m = reinterpret_cast<float *>(inputs[1]->addr);
  auto *v = reinterpret_cast<float *>(inputs[2]->addr);
  auto beta1_power = reinterpret_cast<float *>(inputs[3]->addr)[0];
  if (beta1_power == 1) {
    MS_LOG(EXCEPTION) << "The beta1_power should not be 1";
  }
  auto beta2_power = reinterpret_cast<float *>(inputs[4]->addr)[0];
  auto lr = reinterpret_cast<float *>(inputs[5]->addr)[0];
  auto beta1 = reinterpret_cast<float *>(inputs[6]->addr)[0];
  auto beta2 = reinterpret_cast<float *>(inputs[7]->addr)[0];
  auto epsilon = reinterpret_cast<float *>(inputs[8]->addr)[0];
  auto *grad = reinterpret_cast<float *>(inputs[9]->addr);
  auto *indices = reinterpret_cast<T *>(inputs[10]->addr);
  auto *new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto *new_indices = reinterpret_cast<T *>(workspace[1]->addr);
  auto *workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto *workspace_indices = reinterpret_cast<T *>(workspace[3]->addr);

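  // Merge duplicate indices before the update: BucketReduceSparseGradient
  // reduces gradient rows that share an index into a single row of
  // unique_sparse_grad, so each row of var is updated at most once per step.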
  SparseGradient<T> unique_sparse_grad({new_grad, new_indices, indices_size_});
  SparseGradient<T> workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient<T> input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam<T> param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);

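  // Fold the standard Adam bias correction into the learning rate:
  //   lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
  // where beta1_power and beta2_power carry beta1^t and beta2^t.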
  lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
  MultiThreadComputeParams<T> input_params;
  input_params.var_ = var;
  input_params.m_ = m;
  input_params.v_ = v;
  input_params.lr_ = lr;
  input_params.beta1_ = beta1;
  input_params.beta2_ = beta2;
  input_params.epsilon_ = epsilon;
  input_params.use_nesterov_ = use_nesterov_;
  input_params.sparse_grad_ = unique_sparse_grad;
  input_params.var_first_dim_size_ = var_first_dim_size_;
  input_params.var_outer_dim_size_ = var_outer_dim_size_;
  MultiThreadCompute<T>(ComputeLazyAdam<T>, &input_params, unique_sparse_grad.indices_size_);
}

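// Entry point: validates the input and workspace counts, then dispatches on
// the indices dtype (int32 or int64).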
bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                          const std::vector<kernel::AddressPtr> &workspace,
                                          const std::vector<kernel::AddressPtr> &) {
  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseApplyLazyAdamInputsNum, kernel_name_);
  CHECK_KERNEL_WORKSPACE_SIZE(workspace.size(), kSparseApplyLazyAdamWorkspaceSize, kernel_name_);
  if (indices_data_type_ == kNumberTypeInt32) {
    LaunchKernel<int>(inputs, workspace);
  } else if (indices_data_type_ == kNumberTypeInt64) {
    LaunchKernel<int64_t>(inputs, workspace);
  } else {
    MS_LOG(EXCEPTION) << "Unsupported indices data type: " << indices_data_type_;
  }
  return true;
}
}  // namespace kernel
}  // namespace mindspore