1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "ps/optimizer_info.h"
18 #include <map>
19 #include <memory>
20 #include <string>
21 #include <functional>
22 #include "ps/util.h"
23
24 namespace mindspore {
25 namespace ps {
AddWorkspace(const AddressPtr & workspace)26 void OptimizerInfo::AddWorkspace(const AddressPtr &workspace) {
27 MS_EXCEPTION_IF_NULL(workspace);
28 workspaces_.push_back(workspace);
29 }
30
// Read-only views over the kernel launch buffers owned by this optimizer info.
const std::vector<AddressPtr> &OptimizerInfo::inputs() const { return inputs_; }

const std::vector<AddressPtr> &OptimizerInfo::workspaces() const { return workspaces_; }

const std::vector<AddressPtr> &OptimizerInfo::outputs() const { return outputs_; }

// Base-class defaults: a dense optimizer with no sparse bookkeeping.
// Sparse subclasses override these with their real slot indices/sizes.
bool OptimizerInfo::IsSparse() const { return false; }

const size_t OptimizerInfo::indice_size() const { return 0; }

size_t OptimizerInfo::grad_index() { return 0; }

size_t OptimizerInfo::indices_index() { return 0; }
44
45 template <typename T>
UpdateOptimInputValue(const std::string & optim_type,const std::string & input_name,void * data,const Lengths & lens)46 void OptimizerInfo::UpdateOptimInputValue(const std::string &optim_type, const std::string &input_name, void *data,
47 const Lengths &lens) {
48 MS_EXCEPTION_IF_NULL(data);
49 if (kOptimToOriginIdx.count(optim_type) == 0 || kOptimToPSSendIdx.count(optim_type) == 0) {
50 MS_LOG(EXCEPTION) << "Optimizer type " << optim_type << " in not supported.";
51 }
52 const OptimOriginIdx &origin_input_map = kOptimToOriginIdx.at(optim_type);
53 const OptimPSSendIdx &ps_send_index_map = kOptimToPSSendIdx.at(optim_type);
54 if (ps_send_index_map.count(input_name) == 0 || origin_input_map.count(input_name) == 0) {
55 MS_LOG(EXCEPTION) << "Optimizer " << optim_type << " has no input for " << input_name;
56 }
57
58 size_t origin_index = origin_input_map.at(input_name);
59 size_t ps_send_index = ps_send_index_map.at(input_name);
60 if (ps_send_index >= lens.size() || origin_index >= inputs_.size()) {
61 MS_LOG(EXCEPTION) << "Index is out of bound for optimizer " << optim_type << ", origin_index:" << origin_index
62 << ", ps_send_index:" << ps_send_index;
63 }
64 EXC_IF_VEC_IDX_OOB(lens, ps_send_index);
65 size_t size = IntToSize(lens[ps_send_index]) * sizeof(T);
66 int offset = std::accumulate(lens.begin(), lens.begin() + SizeToInt(ps_send_index), 0, std::plus<int>());
67 AddressPtr optim_input = inputs_[origin_index];
68 MS_EXCEPTION_IF_NULL(optim_input);
69
70 void *dst_data = optim_input->addr;
71 T *src_data = reinterpret_cast<T *>(data) + offset;
72 MS_EXCEPTION_IF_NULL(dst_data);
73 MS_EXCEPTION_IF_NULL(src_data);
74 int64_t ret = memcpy_s(optim_input->addr, optim_input->size, src_data, size);
75 if (ret != 0) {
76 MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
77 return;
78 }
79 return;
80 }
81
Accumulate(const Values & values,const Lengths & lengths)82 void DenseOptimInfo::Accumulate(const Values &values, const Lengths &lengths) {
83 MS_EXCEPTION_IF_NULL(gradient()->addr);
84 float *accum_grad_data = reinterpret_cast<float *>(gradient()->addr);
85 size_t size = gradient()->size / sizeof(float);
86 size_t grad_index = this->grad_index();
87 size_t grad_offset = 0;
88 for (size_t i = 0; i < grad_index; i++) {
89 grad_offset += IntToSize(lengths[i]);
90 }
91 float *grad_data = const_cast<float *>(values.data()) + grad_offset;
92 MS_EXCEPTION_IF_NULL(grad_data);
93 #define google mindspore_private
94 CHECK_EQ(size, IntToSize(lengths[grad_index]));
95 #undef google
96 for (size_t i = 0; i < size; i++) {
97 accum_grad_data[i] += grad_data[i];
98 }
99 }
100
ComputeMean(const std::vector<std::vector<size_t>> &,size_t n,size_t,size_t)101 void DenseOptimInfo::ComputeMean(const std::vector<std::vector<size_t>> &, size_t n, size_t, size_t) {
102 if (n > 1) {
103 MS_EXCEPTION_IF_NULL(gradient()->addr);
104 float *accum_grad_data = reinterpret_cast<float *>(gradient()->addr);
105 size_t size = gradient()->size / sizeof(float);
106 for (size_t i = 0; i < size; i++) {
107 accum_grad_data[i] /= n;
108 }
109 }
110 }
111
Reset()112 void DenseOptimInfo::Reset() {
113 MS_EXCEPTION_IF_NULL(gradient()->addr);
114 int64_t ret = memset_s(gradient()->addr, gradient()->size, 0x00, gradient()->size);
115 if (ret != 0) {
116 MS_LOG(EXCEPTION) << "memset_s error, errorno(" << ret << ")";
117 return;
118 }
119 }
120
// Accumulates one worker's sparse gradient. Unlike the dense case, sparse
// accumulation concatenates: the incoming gradient slice and its row indices
// are appended after whatever is already buffered. Duplicate rows are merged
// later in ComputeMean.
void SparseOptimInfo::Accumulate(const Values &values, const Lengths &lengths) {
  // Append grad data to the end
  MS_EXCEPTION_IF_NULL(gradient()->addr);
  float *accum_grad_data = reinterpret_cast<float *>(gradient()->addr);

  // Offset (in floats) of the gradient payload inside the flat `values` blob.
  size_t grad_index = this->grad_index();
  size_t grad_offset = 0;
  for (size_t i = 0; i < grad_index; i++) {
    grad_offset += IntToSize(lengths[i]);
  }
  float *incr_grad_data = const_cast<float *>(values.data()) + grad_offset;
  MS_EXCEPTION_IF_NULL(incr_grad_data);

  size_t incr_grad_size = IntToSize(lengths[grad_index]) * sizeof(float);
  size_t dst_size = incr_grad_size;
  size_t src_size = incr_grad_size;
  // NOTE(review): the dst capacity passed to memcpy_s equals the incoming
  // slice size, not the remaining capacity of the accumulation buffer —
  // presumably the buffer is pre-sized for all workers' slices; confirm at
  // the allocation site.
  void *dst_data = accum_grad_data + grads_offset_;
  void *src_data = incr_grad_data;
  MS_EXCEPTION_IF_NULL(dst_data);
  MS_EXCEPTION_IF_NULL(src_data);
  int64_t ret = memcpy_s(dst_data, dst_size, src_data, src_size);
  if (ret != 0) {
    MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
    return;
  }
  grads_offset_ += IntToSize(lengths[grad_index]);  // append cursor, in elements
  gradient()->size += incr_grad_size;               // logical size, in bytes

  // Append indice data to the end
  MS_EXCEPTION_IF_NULL(indices()->addr);
  int *accum_indices_data = reinterpret_cast<int *>(indices()->addr);
  MS_EXCEPTION_IF_NULL(accum_indices_data);

  size_t indices_index = this->indices_index();
  size_t indice_offset = 0;
  for (size_t i = 0; i < indices_index; i++) {
    indice_offset += IntToSize(lengths[i]);
  }

  // Indices travel in the same float-typed blob; reinterpret them as ints.
  void *incr_indice_data_temp = const_cast<float *>(values.data()) + indice_offset;
  MS_EXCEPTION_IF_NULL(incr_indice_data_temp);
  int *incr_indice_data = reinterpret_cast<int *>(incr_indice_data_temp);
  MS_EXCEPTION_IF_NULL(incr_indice_data);

  size_t incr_indice_size = lengths[indices_index];
  size_t incr_indice_data_size = incr_indice_size * sizeof(int);
  dst_size = incr_indice_data_size;
  src_size = incr_indice_data_size;
  dst_data = accum_indices_data + indices_offset_;
  src_data = incr_indice_data;
  MS_EXCEPTION_IF_NULL(dst_data);
  MS_EXCEPTION_IF_NULL(src_data);
  auto ret2 = memcpy_s(dst_data, dst_size, src_data, src_size);
  if (ret2 != 0) {
    MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret2 << ")";
    return;
  }
  indices_offset_ += IntToSize(lengths[indices_index]);
  indices()->size += incr_indice_data_size;
}
181
// Merges the concatenated sparse gradient in place: duplicate row indices are
// reduced (summed) into unique rows, indices are rebased for this server's
// shard, and the result is divided by `n` to form the mean.
//
// @param shapes      Input shapes; shapes[1] is the weight shape whose first
//                    dim is the embedding row count.
// @param n           Number of accumulated contributions to average over.
// @param server_num  Total number of servers (used for shard offsets).
// @param rank_id     This server's rank (used for shard offsets).
void SparseOptimInfo::ComputeMean(const std::vector<std::vector<size_t>> &shapes, size_t n, size_t server_num,
                                  size_t rank_id) {
  if (n == 0 || indices()->size == 0) {
    MS_LOG(EXCEPTION) << "The size of shapes or indices are 0.";
  }
  size_t indices_size = static_cast<size_t>(indices()->size / sizeof(int));
  // Bytes of gradient per index row (row width * sizeof(float) / sizeof(int)
  // arithmetic collapses to grad_bytes / indices_bytes here).
  size_t segment_size = gradient()->size / indices()->size;

  // Scratch space that ReduceSparseGradient writes the deduplicated rows into.
  std::vector<float> new_grad(indices_size * segment_size);
  std::vector<int> new_indices(indices_size);
  mindspore::kernel::SparseGradient<int> unique_sparse_grad({new_grad.data(), new_indices.data(), indices_size});

  if (shapes.size() < 2 || shapes[1].empty()) {
    MS_LOG(EXCEPTION) << "No input shape found";
  }
  auto input_shapes = shapes[1];
  if (input_shapes.size() == 0) {
    MS_LOG(EXCEPTION) << "Invalid input shapes";
  }
  size_t first_dim_size = input_shapes.front();
  size_t outer_dim_size = segment_size;

  // Logged but not fatal: ReduceSparseGradient below may still be a no-op.
  if (first_dim_size == 0 || outer_dim_size == 0) {
    MS_LOG(ERROR) << "Invalid first dim size";
  }

  MS_EXCEPTION_IF_NULL(gradient()->addr);
  MS_EXCEPTION_IF_NULL(indices()->addr);
  float *grad_data = reinterpret_cast<float *>(gradient()->addr);
  int *indices_data = reinterpret_cast<int *>(indices()->addr);

  if (sharded_) {
    // When the embedding table is sharded across servers, incoming indices are
    // global row numbers; shift them down by the rows owned by lower ranks so
    // they index into this server's local shard.
    size_t original_row_count = input_shapes.front();
    if (original_row_count > 0) {
      size_t offset = 0;
      std::map<int64_t, int64_t> rank_dims =
        Util::AllRankLocalShard(SizeToLong(original_row_count), SizeToLong(rank_id), SizeToLong(server_num));
      for (size_t i = 0; i < rank_id; i++) {
        if (rank_dims.count(i) == 0) {
          MS_LOG(EXCEPTION) << "No local shard number for rank " << i;
        }
        offset += LongToSize(rank_dims[i]);
      }
      for (size_t j = 0; j < indices_size; j++) {
        indices_data[j] -= SizeToInt(offset);
      }
    }
  }

  // Sum rows that share an index; results land in the scratch buffers.
  Util::ReduceSparseGradient(grad_data, indices_data, indices_size, segment_size, first_dim_size, outer_dim_size,
                             &unique_sparse_grad);

  // Copy the (smaller) deduplicated data back over the original buffers.
  size_t reduced_grad_size = unique_sparse_grad.indices_size_ * segment_size * sizeof(float);
  MS_EXCEPTION_IF_NULL(unique_sparse_grad.value_);
  int ret = memcpy_s(gradient()->addr, gradient()->size, unique_sparse_grad.value_, reduced_grad_size);
  if (ret != 0) {
    MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
    return;
  }

  size_t reduced_indice_size = unique_sparse_grad.indices_size_ * sizeof(int);
  MS_EXCEPTION_IF_NULL(unique_sparse_grad.indices_);
  ret = memcpy_s(indices()->addr, indices()->size, unique_sparse_grad.indices_, reduced_indice_size);
  if (ret != 0) {
    MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
    return;
  }

  // Shrink the logical sizes to the deduplicated extents.
  gradient()->size = reduced_grad_size;
  indices()->size = reduced_indice_size;

  // Finally divide by the contribution count to obtain the mean gradient.
  for (size_t i = 0; i < unique_sparse_grad.indices_size_ * segment_size; i++) {
    grad_data[i] = grad_data[i] / n;
  }
}
257
Reset()258 void SparseOptimInfo::Reset() {
259 gradient()->size = 0;
260 indices()->size = 0;
261 grads_offset_ = 0;
262 indices_offset_ = 0;
263 }
264
MomentumOptimInfo(const AddressPtr & weight,const AddressPtr & accumulate,const AddressPtr & learning_rate,const AddressPtr & gradient,const AddressPtr & momentum)265 MomentumOptimInfo::MomentumOptimInfo(const AddressPtr &weight, const AddressPtr &accumulate,
266 const AddressPtr &learning_rate, const AddressPtr &gradient,
267 const AddressPtr &momentum) {
268 MS_EXCEPTION_IF_NULL(weight);
269 MS_EXCEPTION_IF_NULL(accumulate);
270 MS_EXCEPTION_IF_NULL(learning_rate);
271 MS_EXCEPTION_IF_NULL(gradient);
272 MS_EXCEPTION_IF_NULL(momentum);
273 inputs_.push_back(weight);
274 inputs_.push_back(accumulate);
275 inputs_.push_back(learning_rate);
276 inputs_.push_back(gradient);
277 inputs_.push_back(momentum);
278 }
279
Update(const Values & values,const Lengths & lens)280 void MomentumOptimInfo::Update(const Values &values, const Lengths &lens) {
281 UpdateOptimInputValue<float>(kApplyMomentum, "lr", const_cast<float *>(values.data()), lens);
282 }
283
indice_size() const284 const size_t SparseOptimInfo::indice_size() const { return indices_offset_; }
285
gradient()286 const AddressPtr &MomentumOptimInfo::gradient() {
287 size_t origin_grad_index = kMomentumOriginIdx.at("grad");
288 EXC_IF_VEC_IDX_OOB(inputs_, origin_grad_index);
289 MS_EXCEPTION_IF_NULL(inputs_[origin_grad_index]);
290 return inputs_[origin_grad_index];
291 }
292
indices()293 const AddressPtr &MomentumOptimInfo::indices() {
294 size_t origin_grad_index = kMomentumOriginIdx.at("grad");
295 EXC_IF_VEC_IDX_OOB(inputs_, origin_grad_index);
296 MS_EXCEPTION_IF_NULL(inputs_[origin_grad_index]);
297 return inputs_[origin_grad_index];
298 }
299
grad_index()300 size_t MomentumOptimInfo::grad_index() {
301 size_t ps_grad_index = kMomentumPSSendIdx.at("grad");
302 return ps_grad_index;
303 }
304
SparseAdamOptimInfo(const AddressPtr & weight,const AddressPtr & m,const AddressPtr & v,const AddressPtr & beta1_power,const AddressPtr & beta2_power,const AddressPtr & learning_rate,const AddressPtr & beta1,const AddressPtr & beta2,const AddressPtr & epsilon,const AddressPtr & grad,const AddressPtr & indices,bool sharded)305 SparseAdamOptimInfo::SparseAdamOptimInfo(const AddressPtr &weight, const AddressPtr &m, const AddressPtr &v,
306 const AddressPtr &beta1_power, const AddressPtr &beta2_power,
307 const AddressPtr &learning_rate, const AddressPtr &beta1,
308 const AddressPtr &beta2, const AddressPtr &epsilon, const AddressPtr &grad,
309 const AddressPtr &indices, bool sharded) {
310 MS_EXCEPTION_IF_NULL(weight);
311 MS_EXCEPTION_IF_NULL(m);
312 MS_EXCEPTION_IF_NULL(v);
313 MS_EXCEPTION_IF_NULL(beta1_power);
314 MS_EXCEPTION_IF_NULL(beta2_power);
315 MS_EXCEPTION_IF_NULL(learning_rate);
316 MS_EXCEPTION_IF_NULL(beta1);
317 MS_EXCEPTION_IF_NULL(beta2);
318 MS_EXCEPTION_IF_NULL(epsilon);
319 MS_EXCEPTION_IF_NULL(grad);
320 MS_EXCEPTION_IF_NULL(indices);
321 inputs_.push_back(weight);
322 inputs_.push_back(m);
323 inputs_.push_back(v);
324 inputs_.push_back(beta1_power);
325 inputs_.push_back(beta2_power);
326 inputs_.push_back(learning_rate);
327 inputs_.push_back(beta1);
328 inputs_.push_back(beta2);
329 inputs_.push_back(epsilon);
330 inputs_.push_back(grad);
331 inputs_.push_back(indices);
332 grads_offset_ = grad->size / sizeof(float);
333 indices_offset_ = indices->size / sizeof(int);
334 sharded_ = sharded;
335 }
336
Update(const Values & values,const Lengths & lens)337 void SparseAdamOptimInfo::Update(const Values &values, const Lengths &lens) {
338 UpdateOptimInputValue<float>(kSparseAdam, "beta1_power", const_cast<float *>(values.data()), lens);
339 UpdateOptimInputValue<float>(kSparseAdam, "beta2_power", const_cast<float *>(values.data()), lens);
340 UpdateOptimInputValue<float>(kSparseAdam, "lr", const_cast<float *>(values.data()), lens);
341 UpdateOptimInputValue<float>(kSparseAdam, "beta1", const_cast<float *>(values.data()), lens);
342 UpdateOptimInputValue<float>(kSparseAdam, "beta2", const_cast<float *>(values.data()), lens);
343 UpdateOptimInputValue<float>(kSparseAdam, "eps", const_cast<float *>(values.data()), lens);
344 }
345
gradient()346 const AddressPtr &SparseAdamOptimInfo::gradient() {
347 size_t origin_grad_index = kSparseAdamOriginIdx.at("grad");
348 EXC_IF_VEC_IDX_OOB(inputs_, origin_grad_index);
349 MS_EXCEPTION_IF_NULL(inputs_[origin_grad_index]);
350 return inputs_[origin_grad_index];
351 }
352
indices()353 const AddressPtr &SparseAdamOptimInfo::indices() {
354 size_t origin_indices_index = kSparseAdamOriginIdx.at("indices");
355 EXC_IF_VEC_IDX_OOB(inputs_, origin_indices_index);
356 MS_EXCEPTION_IF_NULL(inputs_[origin_indices_index]);
357 return inputs_[origin_indices_index];
358 }
359
IsSparse() const360 bool SparseAdamOptimInfo::IsSparse() const { return true; }
361
grad_index()362 size_t SparseAdamOptimInfo::grad_index() {
363 size_t ps_grad_index = kSparseAdamPSSendIdx.at("grad");
364 return ps_grad_index;
365 }
366
indices_index()367 size_t SparseAdamOptimInfo::indices_index() {
368 size_t ps_indices_index = kSparseAdamPSSendIdx.at("indices");
369 return ps_indices_index;
370 }
371
SparseFtrlOptimInfo(const AddressPtr & weight,const AddressPtr & accum,const AddressPtr & linear,const AddressPtr & grad,const AddressPtr & indices,bool sharded)372 SparseFtrlOptimInfo::SparseFtrlOptimInfo(const AddressPtr &weight, const AddressPtr &accum, const AddressPtr &linear,
373 const AddressPtr &grad, const AddressPtr &indices, bool sharded) {
374 MS_EXCEPTION_IF_NULL(weight);
375 MS_EXCEPTION_IF_NULL(accum);
376 MS_EXCEPTION_IF_NULL(linear);
377 MS_EXCEPTION_IF_NULL(grad);
378 MS_EXCEPTION_IF_NULL(indices);
379 inputs_.push_back(weight);
380 inputs_.push_back(accum);
381 inputs_.push_back(linear);
382 inputs_.push_back(grad);
383 inputs_.push_back(indices);
384 grads_offset_ = grad->size / sizeof(float);
385 indices_offset_ = indices->size / sizeof(int);
386 sharded_ = sharded;
387 }
388
gradient()389 const AddressPtr &SparseFtrlOptimInfo::gradient() {
390 size_t origin_grad_index = kSparseFtrlOriginIdx.at("grad");
391 EXC_IF_VEC_IDX_OOB(inputs_, origin_grad_index);
392 MS_EXCEPTION_IF_NULL(inputs_[origin_grad_index]);
393 return inputs_[origin_grad_index];
394 }
395
indices()396 const AddressPtr &SparseFtrlOptimInfo::indices() {
397 size_t origin_indices_index = kSparseFtrlOriginIdx.at("indices");
398 EXC_IF_VEC_IDX_OOB(inputs_, origin_indices_index);
399 MS_EXCEPTION_IF_NULL(inputs_[origin_indices_index]);
400 return inputs_[origin_indices_index];
401 }
402
IsSparse() const403 bool SparseFtrlOptimInfo::IsSparse() const { return true; }
404
grad_index()405 size_t SparseFtrlOptimInfo::grad_index() {
406 size_t ps_grad_index = kSparseFtrlPSSendIdx.at("grad");
407 return ps_grad_index;
408 }
409
indices_index()410 size_t SparseFtrlOptimInfo::indices_index() {
411 size_t ps_indices_index = kSparseFtrlPSSendIdx.at("indices");
412 return ps_indices_index;
413 }
414 } // namespace ps
415 } // namespace mindspore
416