# Copyright 2020-2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """adafactor""" from mindspore.common import dtype as mstype from mindspore.log import logging from mindspore.common.initializer import initializer from mindspore.ops import operations as P from mindspore.ops import composite as C from mindspore.ops import functional as F from mindspore.common.parameter import Parameter, ParameterTuple from mindspore.common.tensor import Tensor from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from mindspore.nn.optim.optimizer import opt_init_args_register from .optimizer import Optimizer def _get_lr(step, rms, learning_rate, relative_step, warmup_init, scale_parameter, eps): """update optimizer learning rete""" rel_step_sz = learning_rate if relative_step: if warmup_init: min_step = 1e-6 * step * 1.0 else: min_step = 1e-2 * 1.0 rel_step_sz = P.Minimum()(min_step, 1.0 / P.Sqrt()(step * 1.0)) param_scale = 1.0 if scale_parameter: param_scale = P.Maximum()(eps[1], rms) return rel_step_sz * param_scale * F.ones_like(rms) def _rms(update_tensor): """calculate rms""" return F.sqrt(P.ReduceMean(False)(F.square(update_tensor))) def _approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col): """Approximation of exponential moving average of square of gradient""" reduce_mean = P.ReduceMean(keep_dims=True)(exp_avg_sq_row, -1) div_val = 1.0 / P.Sqrt()(P.Div()(exp_avg_sq_row, reduce_mean)) r_factor = (P.ExpandDims()(div_val, -1)) exp_avg_sq_col = P.ExpandDims()(exp_avg_sq_col, -2) c_factor = 1.0 / P.Sqrt()(exp_avg_sq_col) return P.Mul()(r_factor, c_factor) _adam_opt = C.MultitypeFuncGraph("adam_opt") @_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool", "Bool", "Bool", "Bool", "Bool", "Bool", "Bool", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") def _run_opt_with_one_number(eps, clip_threshold, decay_rate, beta1, weight_decay, scale_lr, scale_parameter, relative_step, warmup_init, compression, use_first_moment, weight_decay_flag, learning_rate, step, grad, param, exp_avg, exp_avg_sq_row, exp_avg_sq_col, exp_avg_sq): """Apply ada factor optimizer to the weight parameter using Tensor.""" success = True grad_dtype = F.dtype(grad) grad_shape = F.shape(grad) if grad_dtype == mstype.float16: grad = F.cast(grad, mstype.float32) p_data_fp32 = param if F.dtype(p_data_fp32) == mstype.float16: p_data_fp32 = F.cast(p_data_fp32, mstype.float32) factored = len(grad_shape) >= 2 # State Initialization exp_avg_update = exp_avg exp_avg_sq_update = exp_avg_sq exp_avg_sq_row_update = exp_avg_sq_row exp_avg_sq_col_update = exp_avg_sq_col if use_first_moment: if compression: exp_avg_update = F.cast(exp_avg, mstype.float16) if factored: exp_avg_sq_row_update = F.cast(exp_avg_sq_row, grad_dtype) exp_avg_sq_col_update = F.cast(exp_avg_sq_col, grad_dtype) else: exp_avg_sq_update = F.cast(exp_avg_sq, grad_dtype) if scale_lr: rms = _rms(p_data_fp32) learning_rate_update = _get_lr(step, rms, learning_rate, relative_step, warmup_init, scale_parameter, eps) learning_rate_update = F.assign(learning_rate, F.cast(learning_rate_update, F.dtype(learning_rate))) else: learning_rate_update = learning_rate * 1.0 beta2t = 1.0 - P.Pow()(step, decay_rate) update = (grad ** 2) + eps[0] if factored: exp_avg_sq_row_update = P.Mul()(exp_avg_sq_row_update, beta2t) update_mean = P.ReduceMean()(update, -1) * (1.0 - beta2t) exp_avg_sq_row_update = P.Add()(exp_avg_sq_row_update, update_mean) exp_avg_sq_row_update = F.assign(exp_avg_sq_row, F.cast(exp_avg_sq_row_update, F.dtype(exp_avg_sq_row))) exp_avg_sq_col_update = P.Mul()(exp_avg_sq_col_update, beta2t) update_mean = P.ReduceMean()(update, -2) * (1.0 - beta2t) exp_avg_sq_col_update = P.Add()(exp_avg_sq_col_update, update_mean) exp_avg_sq_col_update = F.assign(exp_avg_sq_col, F.cast(exp_avg_sq_col_update, F.dtype(exp_avg_sq_col))) update = _approx_sq_grad(exp_avg_sq_row_update, exp_avg_sq_col_update) update = P.Mul()(update, grad) else: update = update * (1.0 - beta2t) exp_avg_sq_update = P.Add()(P.Mul()(exp_avg_sq_update, beta2t), update) exp_avg_sq_update = F.assign(exp_avg_sq, F.cast(exp_avg_sq_update, F.dtype(exp_avg_sq))) exp_avg_sq_update = 1.0 / P.Sqrt()(exp_avg_sq_update) update = P.Mul()(exp_avg_sq_update, grad) update_rms_thres = _rms(update) / clip_threshold update_coff = P.Maximum()(update_rms_thres, P.OnesLike()(update_rms_thres)) update = P.Mul()(P.Div()(update, update_coff), learning_rate_update) if use_first_moment: if compression: exp_avg_update = F.cast(exp_avg_update, grad_dtype) exp_avg_update = P.Add()(P.Mul()(exp_avg_update, beta1), update * (1 - beta1)) update = F.assign(exp_avg, F.cast(exp_avg_update, F.dtype(exp_avg))) if weight_decay_flag: p_data_fp32_coff = p_data_fp32 * -weight_decay * learning_rate_update p_data_fp32 = P.Add()(p_data_fp32, p_data_fp32_coff) p_data_fp32 = P.Sub()(p_data_fp32, update) P.Assign()(param, F.cast(p_data_fp32, F.dtype(param))) return success def trans_to_tensor(paras, is_tuple=False, fp32=True): if paras is None or isinstance(paras, bool): return paras data_type = mstype.float32 if fp32 else mstype.float16 if is_tuple: new_paras = [Tensor(ele, data_type) for ele in paras] return tuple(new_paras) return Tensor(paras, data_type) class AdaFactor(Optimizer): r""" Updates gradients by the Adaptive Learning Rates with Sublinear Memory Cost (Adafactor) algorithm. The Adafactor algorithm is proposed in `Adafactor: Adafactor: Adaptive Learning Rates with Sublinear Memory Cost `_. .. warning:: This is an experimental prototype that is subject to change and/or deletion. Adafactor for weight vector are as follows, .. math:: \begin{array}{l} \\ \alpha_{t}=\max \left(\epsilon_{2}, \operatorname{RMS}\left(X_{t-1}\right)\right) \rho_{t} \\ G_{t}=\nabla f_{t}\left(X_{t-1}\right) \\ \hat{V}_{t}=\hat{\beta}_{2} \hat{V}_{t-1}+\left(1-\hat{\beta}_{2_{t}}\right)\left(G_{t}^{2}+ \\ \epsilon_{1} 1_{n}\right) \\ U_{t}=G_{t} / \sqrt{\hat{V}_{t}} \\ \hat{U}_{t}=U_{t} / \max \left(1, \operatorname{RMS}\left(U_{t}\right) / d\right) \\ X_{t}=X_{t-1}-\alpha_{t} \hat{U}_{t} \end{array} Adafactor for weight matrices are as follows, .. math:: \begin{array}{l} \\ \alpha_{t}=\max \left(\epsilon_{2}, \operatorname{RMS}\left(X_{t-1}\right)\right) \rho_{t} \\ G_{t}=\nabla f_{t}\left(X_{t-1}\right) \\ R_{t}=\hat{\beta}_{2 t} R_{t-1}+\left(1-\hat{\beta}_{2 t}\right)\left(G_{t}^{2}+ \\ \epsilon_{1} 1_{n} 1_{m}^{\top}\right) 1_{m} \\ C_{t}=\hat{\beta}_{2 t} C_{t-1}+\left(1-\hat{\beta}_{2 t}\right) 1_{n}^{\top}\left(G_{t}^{2}+ \\ \epsilon_{1} 1_{n} 1_{m}^{\top}\right) \\ \hat{V}_{t}=R_{t} C_{t} / 1_{n}^{\top} R_{t} \\ U_{t}=G_{t} / \sqrt{\hat{V}_{t}} \\ \hat{U}_{t}=U_{t} / \max \left(1, \operatorname{RMS}\left(U_{t}\right) / d\right) \\ X_{t}=X_{t-1}-\alpha_{t} U_{t} \end{array} Where RMS is: .. math:: \operatorname{RMS}\left(U_{t}\right)=\operatorname{RMS}_{x \in X}\left(u_{x t}\right)= \\ \sqrt{\operatorname{Mean}_{x \in X}\left(\frac{\left(g_{x t}\right)^{2}}{\hat{v}_{x t}}\right)} :math:`x` is each individual parameter, :math:`t` is assumed to be the current number of steps, :math:`a_{t}` is the learning rate, :math:`f(X)` is the loss function, :math:`\epsilon1` and :math:`\epsilon2` is a small positive number to prevent errors, :math:`d` is the clipping threshold, :math:`\beta_{2}` is the moment decay, :math:`\rho` is the relative step size, :math:`R` is the running averages of the row sums of the squared gradient, :math:`C` is the running averages of the column sums of the squared gradient. Note: The learning rate depending of this optimizer will be control by the *scale_parameter*, *relative_step* and *warmup_init* options. To use a manual (external) learning rate schedule, it should be set `scale_parameter=False` and `relative_step=False`. If parameters is not used in the network, please do not add it to the optimizer, otherwise the calculation result will be abnormal. To improve parameter groups performance, the customized order of parameters is supported. Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` must be class `Parameter`. learning_rate (Union[float, Tensor]): A value or a graph for the learning rate. When the learning_rate is a Tensor in a 1D dimension. If the type of `learning_rate` is int, it will be converted to float. Default: None. eps (float): The regularization constans for square gradient and parameter scale respectively. default: (1e-30, 1e-3) clip_threshold (Union[float, Tensor]): The threshold of root mean square of final gradient update. default: 1.0 decay_rate (Union[float, Tensor]): The coefficient used to compute running averages of square gradient. default: 0.8 beta1 (float): The coefficient to computing running averages of gradient. Should be in range (0.0, 1.0). Default: None. weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0. scale_parameter (bool): If True, learning rate is scaled by root mean square of parameter. default: True relative_step (bool): If True, time-dependent learning rate is computed instead of external learning rate. default: True warmup_init (bool): The time-dependent learning rate computation depends on whether warm-up initialization is being used. default: False compression (bool): If True, the data type of the running averages exponent will be compression to float16. default: False loss_scale (float): A floating point value for the loss scale. Should be greater than 0. In general, use the default value. Only when `FixedLossScaleManager` is used for training and the `drop_overflow_update` in `FixedLossScaleManager` is set to False, then this value needs to be the same as the `loss_scale` in `FixedLossScaleManager`. Refer to class :class:`mindspore.FixedLossScaleManager` for more details. Default: 1.0. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. Outputs: Tensor[bool], the value is True. Raises: TypeError: If `learning_rate` is not one of int, float, Tensor, Iterable, LearningRateSchedule. TypeError: If element of `parameters` is neither Parameter nor dict. TypeError: If `beta1`, `beta2`, `eps` or `loss_scale` is not a float. TypeError: If `weight_decay` is neither float nor int. TypeError: If `use_locking` or `use_nesterov` is not a bool. ValueError: If `loss_scale` or `eps` is less than or equal to 0. ValueError: If `beta1`, `beta2` is not in range (0.0, 1.0). ValueError: If `weight_decay` is less than 0. Supported Platforms: ``Ascend`` Examples: >>> net = Net() >>> #1) Parameters use the default learning rate with None and weight decay with 0. >>> optim = nn.AdaFactor(params=net.trainable_params()) >>> >>> #2) Use parameter groups >>> all_params = net.trainable_params() >>> group_params = [{'params': [all_params[0]]}, {'params': [all_params[1]]}] >>> optim = nn.AdaFactor(group_params, learning_rate=0.1, weight_decay=0.0, relative_step=False) >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=optim) """ @opt_init_args_register def __init__(self, params, learning_rate=None, eps=(1e-30, 1e-3), clip_threshold=1.0, decay_rate=0.8, beta1=0.9, weight_decay=0.0, scale_parameter=True, relative_step=True, warmup_init=False, compression=False, loss_scale=1.0): if learning_rate is not None and relative_step: raise ValueError("Cannot combine manual lr and relative_step options", learning_rate) if warmup_init and not relative_step: raise ValueError("warmup_init requires relative_step=True") if learning_rate is None and not relative_step: raise ValueError("Cannot learning_rate is None and relative_step=False") if learning_rate is None: learning_rate = 0.0 if beta1 is None: beta1 = 0.0 self.scale_lr = True if not isinstance(learning_rate, (float, int)) and learning_rate is not None: self.scale_lr = False if relative_step or scale_parameter: logging.warning("When learning_rate is learning scheduler, it not support update learning rate!") super(AdaFactor, self).__init__(learning_rate, params, weight_decay, loss_scale) validator.check_value_type("eps", eps, [list, tuple], self.cls_name) if len(eps) != 2: raise ValueError("eps must have 2 value: (eps1, eps2).") for i, ele in enumerate(eps): validator.check_value_type("eps{}".format(i), ele, [float], self.cls_name) validator.check_non_negative_float(ele, "eps{}".format(i), self.cls_name) validator.check_value_type("clip_threshold", clip_threshold, [float], self.cls_name) validator.check_non_negative_float(clip_threshold, "clip_threshold", self.cls_name) validator.check_value_type("decay_rate", decay_rate, [float], self.cls_name) validator.check_float_range(decay_rate, 0, 1, Rel.INC_NEITHER, "decay_rate", self.cls_name) validator.check_float_range(weight_decay, 0, 1, Rel.INC_LEFT, "weight_decay", self.cls_name) validator.check_value_type("scale_parameter", scale_parameter, [bool], self.cls_name) validator.check_value_type("relative_step", relative_step, [bool], self.cls_name) validator.check_value_type("compression", compression, [bool], self.cls_name) validator.check_value_type("beta1", beta1, [int, float], self.cls_name) validator.check_non_negative_float(float(beta1), "beta1", self.cls_name) self.eps = trans_to_tensor(eps) self.clip_threshold = trans_to_tensor(clip_threshold) self.decay_rate = trans_to_tensor(-decay_rate) self.beta1 = trans_to_tensor(beta1) self.weight_decay = trans_to_tensor(weight_decay) self.weight_decay_flag = bool(weight_decay) self.step = Parameter(Tensor(0, dtype=mstype.float32), name="train_step") self.scale_parameter = scale_parameter self.relative_step = relative_step self.warmup_init = warmup_init self.compression = compression self.init_ada_factor_state(beta1) self.step = Parameter(initializer(0, [1], mstype.float32), name='afactor_step') print("AdaFactor init completed", self.learning_rate) def init_ada_factor_state(self, beta1): """init adafactor variables""" if beta1 > 0: self.use_first_moment = True self.exp_avg = self.parameters.clone(prefix="exp_avg", init='zeros') else: self.use_first_moment = False self.exp_avg = ParameterTuple([Parameter(Tensor(0.0))] * len(self.parameters)) self.exp_avg_sq = [] self.exp_avg_sq_col = [] self.exp_avg_sq_row = [] for paras in self.parameters: paras_dtype = paras.dtype paras_shape = paras.shape paras_name = paras.name if len(paras_shape) > 1: self.exp_avg_sq_row.append(Parameter(initializer(0, shape=paras_shape[:-1], dtype=paras_dtype), name="exp_avg_sq_row_{}".format(paras_name))) self.exp_avg_sq_col.append(Parameter(initializer(0, shape=paras_shape[:-2] + paras_shape[-1:], dtype=paras_dtype), name="exp_avg_sq_col_{}".format(paras_name))) if self.compression: self.exp_avg_sq.append(Parameter(initializer(0, shape=(1,), dtype=mstype.float16), name="exp_avg_sq_{}".format(paras_name))) else: self.exp_avg_sq.append(Parameter(initializer(0, shape=(1,), dtype=paras_dtype), name="exp_avg_sq_{}".format(paras_name))) else: self.exp_avg_sq_row.append(Parameter(initializer(0, shape=(1,), dtype=paras_dtype), name="exp_avg_sq_row_{}".format(paras_name))) self.exp_avg_sq_col.append(Parameter(initializer(0, shape=(1,), dtype=paras_dtype), name="exp_avg_sq_col_{}".format(paras_name))) if self.compression: self.exp_avg_sq.append(Parameter(initializer(0, shape=paras_shape, dtype=mstype.float16), name="exp_avg_sq_{}".format(paras_name))) else: self.exp_avg_sq.append(Parameter(initializer(0, shape=paras_shape, dtype=paras_dtype), name="exp_avg_sq_{}".format(paras_name))) self.exp_avg_sq_row = ParameterTuple(self.exp_avg_sq_row) self.exp_avg_sq_col = ParameterTuple(self.exp_avg_sq_col) self.exp_avg_sq = ParameterTuple(self.exp_avg_sq) @property def supports_memory_efficient_fp16(self): return True @property def supports_flat_params(self): return False def construct(self, gradients): lr = self.get_lr() step = F.assign_add(self.step, 1) success = self.hyper_map(F.partial(_adam_opt, self.eps, self.clip_threshold, self.decay_rate, self.beta1, self.weight_decay, self.scale_lr, self.scale_parameter, self.relative_step, self.warmup_init, self.compression, self.use_first_moment, self.weight_decay_flag, lr, step), gradients, self.parameters, self.exp_avg, self.exp_avg_sq_row, self.exp_avg_sq_col, self.exp_avg_sq) return success @Optimizer.target.setter def target(self, value): """ If the input value is set to "CPU", the parameters will be updated on the host using the Fused optimizer operation. """ self._set_base_target(value)