# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Shared fixture module for MindSpore optimizer tests (ASGD, Rprop, AdaMax, SGD).

Provides a small deterministic network, fake training data, a training driver
(`build_network`) and the golden parameter values that optimizer tests compare
against after 20 training steps.
"""

import numpy as np
import mindspore
from mindspore import nn, Tensor
from mindspore.ops import operations as P
from mindspore.nn.optim import ASGD
from mindspore.nn.optim import Rprop
from mindspore.nn.optim import AdaMax

np.random.seed(1024)

# Fixed initial parameters so every optimizer run starts from the same state.
fc1_weight = np.array([[0.72346634, 0.95608497, 0.4084163, 0.18627149,
                        0.6942514, 0.39767185, 0.24918061, 0.4548748],
                       [0.7203382, 0.19086994, 0.76286614, 0.87920564,
                        0.3169892, 0.9462494, 0.62827677, 0.27504718],
                       [0.3544535, 0.2524781, 0.5370583, 0.8313121,
                        0.6670143, 0.0488653, 0.62225235, 0.7546456],
                       [0.17985944, 0.05106374, 0.31064633, 0.4863033,
                        0.848814, 0.5523157, 0.20295663, 0.7213356]]).astype("float32")

fc1_bias = np.array([0.79708564, 0.13728078, 0.66322654, 0.88128525]).astype("float32")

fc2_weight = np.array([[0.8473515, 0.50923985, 0.42287776, 0.29769543]]).astype("float32")

fc2_bias = np.array([0.09996348]).astype("float32")


def make_fake_data():
    """Generate 20 deterministic (data, label) Tensor pairs.

    ``data[i]`` is a (2, 8) float32 tensor filled with ``i``; ``label[i]`` is a
    (2, 1) float32 tensor filled with ``i + 1``.

    Returns:
        tuple[list[Tensor], list[Tensor]]: the data tensors and label tensors.
    """
    data, label = [], []
    for step in range(20):
        data.append(mindspore.Tensor(np.array(np.ones((2, 8)) * step, dtype=np.float32)))
        label.append(mindspore.Tensor(np.array(np.ones((2, 1)) * (step + 1), dtype=np.float32)))
    return data, label


class NetWithLoss(nn.Cell):
    """Wrap a network and a loss function into a single loss-producing cell."""

    def __init__(self, network, loss_fn):
        super(NetWithLoss, self).__init__()
        self.network = network
        self.loss = loss_fn

    def construct(self, x, label):
        out = self.network(x)
        loss = self.loss(out, label)
        return loss


class FakeNet(nn.Cell):
    """Small two-layer MLP (8 -> 4 -> 1) with fixed, deterministic initial weights."""

    def __init__(self):
        super(FakeNet, self).__init__()
        self.fc1 = nn.Dense(in_channels=8, out_channels=4, weight_init=Tensor(fc1_weight), bias_init=Tensor(fc1_bias))
        self.fc2 = nn.Dense(in_channels=4, out_channels=1, weight_init=Tensor(fc2_weight), bias_init=Tensor(fc2_bias))
        self.relu = nn.ReLU()
        # NOTE(review): reducemean is never used in construct(); kept only so the
        # cell's attribute set stays identical for any external introspection.
        self.reducemean = P.ReduceMean()

    def construct(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def _initialize_weights(self):
        """Reset fc1/fc2 parameters back to the module-level initial values."""
        self.init_parameters_data()
        for name, m in self.cells_and_names():
            if name == 'fc1':
                m.weight.set_data(Tensor(fc1_weight))
                m.bias.set_data(Tensor(fc1_bias))
            elif name == 'fc2':
                m.weight.set_data(Tensor(fc2_weight))
                m.bias.set_data(Tensor(fc2_bias))


def build_network(opt_config, net, is_group=None, loss_fn=None):
    """Train ``net`` for 20 steps with the optimizer described by ``opt_config``.

    Args:
        opt_config (dict): must contain ``'name'`` — one of ``'ASGD'``,
            ``'Rprop'``, ``'adamax'`` or ``'SGD'`` — plus the hyper-parameters
            that optimizer reads (e.g. ``'lr'``, ``'weight_decay'``, ...).
        net (nn.Cell): the network to train.
        is_group (bool, optional): if True, split parameters into fc1/fc2 groups
            with per-group settings. Defaults to False.
        loss_fn (nn.Cell, optional): loss cell. Defaults to
            ``nn.L1Loss(reduction='mean')``.

    Returns:
        tuple: (np.ndarray of the 20 per-step loss values, the optimizer instance).

    Raises:
        ValueError: if ``opt_config['name']`` is not a supported optimizer.
    """
    if is_group is None:
        is_group = False
    if loss_fn is None:
        loss_fn = nn.L1Loss(reduction='mean')
    losses = []
    networkwithloss = NetWithLoss(net, loss_fn)
    networkwithloss.set_train()

    if is_group:
        fc1_params = list(filter(lambda x: 'fc1' in x.name, networkwithloss.trainable_params()))
        fc2_params = list(filter(lambda x: 'fc1' not in x.name, networkwithloss.trainable_params()))
        if opt_config['name'] == 'ASGD':
            params = [{'params': fc1_params, 'weight_decay': 0.01, 'lr': 0.01}, {'params': fc2_params, 'lr': 0.1}]
        elif opt_config['name'] == 'adamax':
            params = [{'params': fc1_params, 'lr': 0.0018}, {'params': fc2_params, 'lr': 0.0022}]
        elif opt_config['name'] == 'SGD':
            params = [{'params': fc1_params, 'weight_decay': 0.2}, {'params': fc2_params}]
        else:
            params = [{'params': fc1_params, 'lr': 0.01}, {'params': fc2_params, 'lr': 0.01}]
    else:
        params = networkwithloss.trainable_params()

    if opt_config['name'] == 'ASGD':
        net_opt = ASGD(params, learning_rate=opt_config['lr'], lambd=opt_config['lambd'], alpha=opt_config['alpha'],
                       t0=opt_config['t0'], weight_decay=opt_config['weight_decay'])
    elif opt_config['name'] == 'Rprop':
        net_opt = Rprop(params, learning_rate=opt_config['lr'], etas=opt_config['etas'],
                        step_sizes=opt_config['step_sizes'], weight_decay=0.0)
    elif opt_config['name'] == 'adamax':
        net_opt = AdaMax(params, learning_rate=opt_config['lr'], beta1=opt_config['beta1'],
                         beta2=opt_config['beta2'], eps=opt_config['eps'], weight_decay=0.0)
    elif opt_config['name'] == 'SGD':
        net_opt = nn.SGD(params, weight_decay=opt_config['weight_decay'], dampening=0.3, momentum=0.1)
    else:
        # Previously an unknown name fell through and raised UnboundLocalError
        # at the TrainOneStepCell call below; fail fast with a clear message.
        raise ValueError("Unsupported optimizer name: {}".format(opt_config['name']))

    trainonestepcell = mindspore.nn.TrainOneStepCell(networkwithloss, net_opt)
    data, label = make_fake_data()
    # Iterate the pairs directly instead of re-hard-coding the dataset length.
    for x, y in zip(data, label):
        loss = trainonestepcell(x, y)
        losses.append(loss.asnumpy())
    return np.array(losses), net_opt


# ---------------------------------------------------------------------------
# Golden parameter values expected after 20 training steps, per optimizer and
# configuration (default hyper-parameters / non-default / grouped parameters).
# ---------------------------------------------------------------------------

default_fc1_weight_asgd = np.array([[0.460443, 0.693057, 0.145399, -0.076741, 0.431228, 0.134655,
                                     -0.013833, 0.191857],
                                    [0.391073, -0.138385, 0.433600, 0.549937, -0.012268, 0.616980,
                                     0.299013, -0.054209],
                                    [0.064144, -0.037829, 0.246745, 0.540993, 0.376698, -0.241438,
                                     0.331937, 0.464328],
                                    [-0.066224, -0.195017, 0.064560, 0.240214, 0.602717, 0.306225,
                                     -0.043127, 0.475241]], dtype=np.float32)
default_fc1_bias_asgd = np.array([0.740427, 0.091827, 0.624849, 0.851911], dtype=np.float32)
default_fc2_weight_asgd = np.array([[0.585555, 0.512303, 0.424419, 0.323499]], dtype=np.float32)
default_fc2_bias_asgd = np.array([0.059962], dtype=np.float32)

no_default_fc1_weight_asgd = np.array([[0.645291, 0.877900, 0.330253, 0.108117, 0.616077, 0.319509, 0.171024,
                                        0.376710],
                                       [0.687056, 0.157610, 0.729583, 0.845918, 0.283724, 0.912958, 0.594999,
                                        0.241783],
                                       [0.328432, 0.226461, 0.511030, 0.805272, 0.640981, 0.022857, 0.596221,
                                        0.728608],
                                       [0.165102, 0.036311, 0.295884, 0.471533, 0.834030, 0.537543, 0.188198,
                                        0.706556]], dtype=np.float32)
no_default_fc1_bias_asgd = np.array([0.785650, 0.131580, 0.658614, 0.878328], dtype=np.float32)
no_default_fc2_weight_asgd = np.array([[0.374859, -0.049370, -0.068307, -0.115195]], dtype=np.float32)
no_default_fc2_bias_asgd = np.array([0.083960], dtype=np.float32)

no_default_group_fc1_weight_asgd = np.array([[0.197470, 0.429578, -0.116887, -0.338544, 0.168320, -0.127608,
                                              -0.275773, -0.070531],
                                             [0.119964, -0.408341, 0.162399, 0.278482, -0.282498, 0.345379,
                                              0.028105, -0.324348],
                                             [-0.168310, -0.270062, 0.013893, 0.307500, 0.143563, -0.473227,
                                              0.098900, 0.231002],
                                             [-0.254349, -0.382861, -0.123849, 0.051422, 0.413136, 0.117289,
                                              -0.231302, 0.285938]], dtype=np.float32)
no_default_group_fc1_bias_asgd = np.array([0.706595, 0.042866, 0.579553, 0.811499], dtype=np.float32)
no_default_group_fc2_weight_asgd = np.array([[-0.076689, -0.092399, -0.072100, -0.054189]], dtype=np.float32)
no_default_group_fc2_bias_asgd = np.array([0.698678], dtype=np.float32)

default_fc1_weight_sgd = np.array([[0.00533873, 0.03210080, -0.03090680, -0.05646387, 0.00197765,
                                    -0.03214293, -0.04922638, -0.02556189],
                                   [-0.00658702, -0.06750072, -0.00169432, 0.01169018, -0.05299109,
                                    0.01940336, -0.01717841, -0.05781638],
                                   [-0.03723934, -0.04897130, -0.01623122, 0.01762178, -0.00128018,
                                    -0.07239634, -0.00642990, 0.00880153],
                                   [-0.04421479, -0.05903235, -0.02916817, -0.00895938, 0.03274637,
                                    -0.00136485, -0.04155754, 0.01808037]], dtype=np.float32)
default_fc2_weight_sgd = np.array([[-0.01070179, -0.00702989, -0.00210839, 0.00160410]], dtype=np.float32)

default_fc1_weight_adamax = np.array([[0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                       0.00000000, 0.00000000, 0.00000000],
                                      [11.18415642, 11.18415642, 11.18415642, 11.18415642, 11.18415642,
                                       11.18415642, 11.18415642, 11.18415642],
                                      [-6.70855522, -6.70855522, -6.70855522, -6.70855522, -6.70855522,
                                       -6.70855522, -6.70855522, -6.70855522],
                                      [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                       0.00000000, 0.00000000, 0.00000000]], dtype=np.float32)
default_fc1_bias_adamax = np.array([0.00000000, 0.86349380, -0.51633584, 0.00000000], dtype=np.float32)

no_default_fc1_weight_adamax = np.array([[0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                          0.00000000, 0.00000000, 0.00000000],
                                         [-4.02891350, -4.02891350, -4.02891350, -4.02891350, -4.02891350,
                                          -4.02891350, -4.02891350, -4.02891350],
                                         [3.10859227, 3.10859227, 3.10859227, 3.10859227, 3.10859227,
                                          3.10859227, 3.10859227, 3.10859227],
                                         [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                          0.00000000, 0.00000000, 0.00000000]], dtype=np.float32)
no_default_fc1_bias_adamax = np.array([0.00000000, -0.04809491, 0.06205747, 0.00000000], dtype=np.float32)

default_group_fc1_weight_adamax = np.array([[0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                             0.00000000, 0.00000000, 0.00000000],
                                            [11.07278919, 11.07278919, 11.07278919, 11.07278919, 11.07278919,
                                             11.07278919, 11.07278919, 11.07278919],
                                            [-6.81674862, -6.81674862, -6.81674862, -6.81674862, -6.81674862,
                                             -6.81674862, -6.81674862, -6.81674862],
                                            [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                             0.00000000, 0.00000000, 0.00000000]], dtype=np.float32)
default_group_fc1_bias_adamax = np.array([0.00000000, 0.85614461, -0.52348828, 0.00000000], dtype=np.float32)

default_fc1_weight_rprop = np.array([[9.10877514, 9.10877514, 9.10877514, 9.10877514, 9.10877514,
                                      9.10877514, 9.10877514, 9.10877514],
                                     [2.68465400, 2.68465400, 2.68465400, 2.68465400, 2.68465400,
                                      2.68465400, 2.68465400, 2.68465400],
                                     [1.04377401, 1.04377401, 1.04377401, 1.04377401, 1.04377401,
                                      1.04377401, 1.04377401, 1.04377401],
                                     [-1.33468997, -1.33468997, -1.33468997, -1.33468997, -1.33468997,
                                      -1.33468997, -1.33468997, -1.33468997]], dtype=np.float32)
default_fc1_bias_rprop = np.array([0.47940922, 0.14129758, 0.05493547, -0.07024684], dtype=np.float32)

no_default_fc1_weight_rprop = np.array([[8.41605091, 8.41605091, 8.41605091, 8.41605091, 8.41605091, 8.41605091,
                                         8.41605091, 8.41605091],
                                        [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                         0.00000000, 0.00000000],
                                        [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                         0.00000000, 0.00000000],
                                        [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                         0.00000000, 0.00000000]], dtype=np.float32)
no_default_fc1_bias_rprop = np.array([0.44295004, 0.00000000, 0.00000000, 0.00000000], dtype=np.float32)

default_group_fc1_weight_rprop = np.array([[8.41605091, 8.41605091, 8.41605091, 8.41605091, 8.41605091, 8.41605091,
                                            8.41605091, 8.41605091],
                                           [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                            0.00000000, 0.00000000],
                                           [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                            0.00000000, 0.00000000],
                                           [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
                                            0.00000000, 0.00000000]], dtype=np.float32)
default_group_fc1_bias_rprop = np.array([0.44295004, 0.00000000, 0.00000000, 0.00000000], dtype=np.float32)