# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import numpy as np
import pytest

import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.api import ms_function
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.common import dtype as mstype
from mindspore.common.parameter import Parameter


class Net(nn.Cell):
    def __init__(self, decay_flag=True):
        super(Net, self).__init__()
        self.decay_flag = decay_flag
        self.op_mul = P.Mul()
        self.op_square = P.Square()
        self.op_sqrt = P.Sqrt()
        self.op_cast = P.Cast()
        self.op_reshape = P.Reshape()
        self.op_shape = P.Shape()
        self.param = Parameter(
            Tensor(np.array([1, 3, 5]).astype(np.float32)), name='param')
        self.m = Parameter(
            Tensor(np.array([0.11, 0.33, 0.55]).astype(np.float32)), name='m')
        self.v = Parameter(
            Tensor(np.array([1.2, 3.4, 5.6]).astype(np.float32)), name='v')

    @ms_function
    def construct(self, beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr):
        # Cast all operands to float32 so the whole update runs in one dtype.
        param_fp32 = self.op_cast(self.param, mstype.float32)
        m_fp32 = self.op_cast(self.m, mstype.float32)
        v_fp32 = self.op_cast(self.v, mstype.float32)
        gradient_fp32 = self.op_cast(gradient, mstype.float32)

        # Adam moment updates:
        #   m_t = beta1 * m + (1 - beta1) * g
        #   v_t = beta2 * v + (1 - beta2) * g^2
        next_m = self.op_mul(beta1, m_fp32) + \
            self.op_mul(self.op_cast(one_sub_beta_1,
                                     mstype.float32), gradient_fp32)
        next_v = self.op_mul(beta2, v_fp32) + self.op_mul(self.op_cast(one_sub_beta_2,
                                                                       mstype.float32), self.op_square(gradient_fp32))
        update = next_m / (eps + self.op_sqrt(next_v))
        if self.decay_flag:
            # Decoupled weight decay (AdamWeightDecay variant).
            update = self.op_mul(weight_decay_tensor, param_fp32) + update
        update_with_lr = self.op_mul(lr, update)
        next_param = param_fp32 - \
            self.op_reshape(update_with_lr, self.op_shape(param_fp32))

        # Chain the assigns through depend edges so they are kept and
        # correctly ordered in the compiled graph.
        depend_v = F.depend(next_param, F.assign(self.param, next_param))
        depend_v = F.depend(depend_v, F.assign(self.m, next_m))
        depend_v = F.depend(depend_v, F.assign(self.v, next_v))
        return depend_v


class SideEffectFusedAdamNet(nn.Cell):
    def __init__(self, decay_flag=True):
        super(SideEffectFusedAdamNet, self).__init__()
        self.decay_flag = decay_flag
        self.op_mul = P.Mul()
        self.op_square = P.Square()
        self.op_sqrt = P.Sqrt()
        self.op_cast = P.Cast()
        self.op_reshape = P.Reshape()
        self.op_shape = P.Shape()
        self.param = Parameter(
            Tensor(np.array([0, 0, 0]).astype(np.float32)), name='param')
        self.m = Parameter(
            Tensor(np.array([0.11, 0.33, 0.55]).astype(np.float32)), name='m')
        self.v = Parameter(
            Tensor(np.array([1.2, 3.4, 5.6]).astype(np.float32)), name='v')
        self.x = Parameter(
            Tensor(np.array([1, 3, 5]).astype(np.float32)), name='x')

    @ms_function
    def construct(self, beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr):
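        """One Adam step bracketed by extra parameter assignments.

        `self.param` is loaded from `self.x` before the update and `self.x`
        is loaded from `self.m` after it, so the caller can check that these
        side effects stay correctly ordered around the fused update.
        """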
        F.assign(self.param, self.x)

        param_fp32 = self.op_cast(self.param, mstype.float32)
        m_fp32 = self.op_cast(self.m, mstype.float32)
        v_fp32 = self.op_cast(self.v, mstype.float32)
        gradient_fp32 = self.op_cast(gradient, mstype.float32)

        next_m = self.op_mul(beta1, m_fp32) + \
            self.op_mul(self.op_cast(one_sub_beta_1,
                                     mstype.float32), gradient_fp32)
        next_v = self.op_mul(beta2, v_fp32) + self.op_mul(self.op_cast(one_sub_beta_2,
                                                                       mstype.float32), self.op_square(gradient_fp32))
        update = next_m / (eps + self.op_sqrt(next_v))
        if self.decay_flag:
            update = self.op_mul(weight_decay_tensor, param_fp32) + update
        update_with_lr = self.op_mul(lr, update)
        next_param = param_fp32 - \
            self.op_reshape(update_with_lr, self.op_shape(param_fp32))

        depend_v = F.depend(next_param, F.assign(self.param, next_param))
        depend_v = F.depend(depend_v, F.assign(self.m, next_m))
        depend_v = F.depend(depend_v, F.assign(self.v, next_v))

        F.assign(self.x, self.m)
        return depend_v


def CalFusedAdam(beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr, param, m, v,
                 is_weight_decay=False):
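    """NumPy reference for one Adam step; returns the expected
    (param, m, v) values:

        m_t = beta1 * m + (1 - beta1) * g
        v_t = beta2 * v + (1 - beta2) * g^2
        update = m_t / (sqrt(v_t) + eps)  [+ weight_decay * param]
        param_t = param - lr * update
    """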
    m_expect = beta1 * m + one_sub_beta_1 * gradient
    v_expect = beta2 * v + one_sub_beta_2 * gradient * gradient
    update = m_expect / (np.sqrt(v_expect) + eps)
    if is_weight_decay:
        update += weight_decay_tensor * param
    param_expect = param - lr * update
    return param_expect, m_expect, v_expect


def test_adam():
    np.random.seed(0)
    beta1 = np.array([0.9]).astype(np.float32)
    beta2 = np.array([0.999]).astype(np.float32)
    one_sub_beta_1 = (np.array([1.0]) - np.array([0.9])).astype(np.float32)
    one_sub_beta_2 = (np.array([1.0]) - np.array([0.999])).astype(np.float32)
    lr = np.array([0.012]).astype(np.float32)
    eps = np.array([1e-6]).astype(np.float32)
    weight_decay_tensor = np.array([0.021]).astype(np.float32)

    gradient = np.array([0.01, 0.03, 0.05]).astype(np.float32)
    m = np.array([0.11, 0.33, 0.55]).astype(np.float32)
    v = np.array([1.2, 3.4, 5.6]).astype(np.float32)
    param = np.array([1, 3, 5]).astype(np.float32)
    is_weight_decay = False
    opt = Net(is_weight_decay)
    _ = opt(Tensor(beta1), Tensor(beta2), Tensor(one_sub_beta_1), Tensor(one_sub_beta_2), Tensor(gradient), Tensor(eps),
            Tensor(weight_decay_tensor), Tensor(lr))
    param_expect, m_expect, v_expect = CalFusedAdam(
        beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr,
        param, m, v, is_weight_decay)
    assert np.allclose(opt.param.data.asnumpy(), param_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)
    assert np.allclose(opt.m.data.asnumpy(), m_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)
    assert np.allclose(opt.v.data.asnumpy(), v_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)


def test_adam_weight_decay():
    np.random.seed(0)
    beta1 = np.array([0.9]).astype(np.float32)
    beta2 = np.array([0.999]).astype(np.float32)
    one_sub_beta_1 = (np.array([1.0]) - np.array([0.9])).astype(np.float32)
    one_sub_beta_2 = (np.array([1.0]) - np.array([0.999])).astype(np.float32)
    lr = np.array([0.012]).astype(np.float32)
    eps = np.array([1e-6]).astype(np.float32)
    weight_decay_tensor = np.array([0.021]).astype(np.float32)

    gradient = np.array([0.01, 0.03, 0.05]).astype(np.float32)
    m = np.array([0.11, 0.33, 0.55]).astype(np.float32)
    v = np.array([1.2, 3.4, 5.6]).astype(np.float32)
    param = np.array([1, 3, 5]).astype(np.float32)
    is_weight_decay = True
    opt = Net(is_weight_decay)
    _ = opt(Tensor(beta1), Tensor(beta2), Tensor(one_sub_beta_1), Tensor(one_sub_beta_2), Tensor(gradient), Tensor(eps),
            Tensor(weight_decay_tensor), Tensor(lr))
    param_expect, m_expect, v_expect = CalFusedAdam(
        beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr,
        param, m, v, is_weight_decay)

    assert np.allclose(opt.param.data.asnumpy(), param_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)
    assert np.allclose(opt.m.data.asnumpy(), m_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)
    assert np.allclose(opt.v.data.asnumpy(), v_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)


def test_adam_side_effect():
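    """Run SideEffectFusedAdamNet and check the extra assigns: `param`
    must be updated starting from the value copied in from `x`, and `x`
    must end up equal to the updated first moment `m`.
    """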
    np.random.seed(0)
    beta1 = np.array([0.9]).astype(np.float32)
    beta2 = np.array([0.999]).astype(np.float32)
    one_sub_beta_1 = (np.array([1.0]) - np.array([0.9])).astype(np.float32)
    one_sub_beta_2 = (np.array([1.0]) - np.array([0.999])).astype(np.float32)
    lr = np.array([0.012]).astype(np.float32)
    eps = np.array([1e-6]).astype(np.float32)
    weight_decay_tensor = np.array([0.021]).astype(np.float32)

    gradient = np.array([0.01, 0.03, 0.05]).astype(np.float32)
    m = np.array([0.11, 0.33, 0.55]).astype(np.float32)
    v = np.array([1.2, 3.4, 5.6]).astype(np.float32)
    param = np.array([1, 3, 5]).astype(np.float32)
    is_weight_decay = False
    opt = SideEffectFusedAdamNet(is_weight_decay)
    _ = opt(Tensor(beta1), Tensor(beta2), Tensor(one_sub_beta_1), Tensor(one_sub_beta_2), Tensor(gradient), Tensor(eps),
            Tensor(weight_decay_tensor), Tensor(lr))
    param_expect, m_expect, v_expect = CalFusedAdam(
        beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr,
        param, m, v, is_weight_decay)
    assert np.allclose(opt.param.data.asnumpy(), param_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)
    assert np.allclose(opt.m.data.asnumpy(), m_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)
    assert np.allclose(opt.v.data.asnumpy(), v_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)
    assert np.allclose(opt.x.data.asnumpy(), m_expect,
                       rtol=1.e-4, atol=1.e-8, equal_nan=True)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_adam_gpu():
    context.set_context(mode=context.GRAPH_MODE,
                        enable_graph_kernel=True, device_target="GPU")
    test_adam()


def test_adam_ascend():
    context.set_context(mode=context.GRAPH_MODE,
                        enable_graph_kernel=True, device_target="Ascend")
    test_adam()


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_adam_weight_decay_gpu():
    context.set_context(mode=context.GRAPH_MODE,
                        enable_graph_kernel=True, device_target="GPU")
    test_adam_weight_decay()


def test_adam_weight_decay_ascend():
    context.set_context(mode=context.GRAPH_MODE,
                        enable_graph_kernel=True, device_target="Ascend")
    test_adam_weight_decay()


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_adam_side_effect_gpu():
    context.set_context(mode=context.GRAPH_MODE,
                        enable_graph_kernel=True, device_target="GPU")
    test_adam_side_effect()


@pytest.mark.level2
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_adam_side_effect_ascend():
    context.set_context(mode=context.GRAPH_MODE,
                        enable_graph_kernel=True, device_target="Ascend")
    test_adam_side_effect()