# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

16"""test bert thor performance with 8p on mlperf dataset"""

import os
from multiprocessing import Process, Queue
import pytest
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import dtype as mstype
from mindspore.ops import operations as P
import mindspore.communication.management as D
from mindspore import context
from mindspore.context import ParallelMode

MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_table_8p.json"

np.random.seed(1)
os.environ['GLOG_v'] = str(2)

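# Minimal network wrapping a single AllReduce op; with the default reduce
# operation (sum), every rank receives the element-wise sum of all ranks' inputs.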
class AllReduceNet(nn.Cell):
    def __init__(self):
        super(AllReduceNet, self).__init__()
        self.all_reduce = P.AllReduce()

    def construct(self, x):
        return self.all_reduce(x)

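# Worker executed in a separate process per Ascend device: it creates its own
# working directory, configures the HCCL environment for its rank, runs one
# AllReduce, and reports the correctness check back through the queue.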
def train_allreduce_8p(q, device_id, device_num):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
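    # Initialize collective communication (HCCL) for this rank using the rank table above.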
    D.init()
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                      device_num=device_num)

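    # Each of the 8 ranks contributes an all-ones tensor, so the summed
    # AllReduce output should be 8 everywhere.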
    net = AllReduceNet()
    input_x = np.ones([32, 255, 255, 3]).astype(np.float32)
    expect_output = input_x * 8
    output = net(Tensor(input_x, mstype.float32))
    q.put(np.allclose(output.asnumpy(), expect_output))

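# Launch 8 subprocesses (one per device), wait for all of them, and assert
# that every rank reported a correct AllReduce result.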
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
def test_pynative_hccl_8p():
    device_num = 8
    process = []
    q = Queue()
    for i in range(device_num):
        device_id = i
        process.append(Process(target=train_allreduce_8p, args=(q, device_id, device_num)))

    for i in range(device_num):
        process[i].start()

    print("Waiting for all subprocesses done...")

    for i in range(device_num):
        process[i].join()

    # check result
    for i in range(device_num):
        assert not q.empty()
        assert q.get()

    for i in range(device_num):
        os.system("rm -rf " + str(i))

    print("End training...")

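# Variant of the test above with GRAPH_OP_RUN=1, which (on Ascend) is expected
# to launch operators kernel by kernel instead of in task-sink mode.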
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
def test_pynative_hccl_8pv2():
    os.environ['GRAPH_OP_RUN'] = str(1)
    device_num = 8
    process = []
    q = Queue()
    for i in range(device_num):
        device_id = i
        process.append(Process(target=train_allreduce_8p, args=(q, device_id, device_num)))

    for i in range(device_num):
        process[i].start()

    print("Waiting for all subprocesses done...")

    for i in range(device_num):
        process[i].join()

    # check result
    for i in range(device_num):
        assert not q.empty()
        assert q.get()

    for i in range(device_num):
        os.system("rm -rf " + str(i))

    print("End training...")