• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_DEVICE_MANAGER_H_
18 #define MINDSPORE_CCSRC_FRONTEND_PARALLEL_DEVICE_MANAGER_H_
19 
20 #include <cstdint>
21 #include <cstring>
22 #include <map>
23 #include <memory>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "frontend/parallel/device.h"
29 #include "frontend/parallel/device_matrix.h"
30 #include "frontend/parallel/group_manager.h"
31 #include "frontend/parallel/status.h"
32 #include "frontend/parallel/strategy.h"
33 #include "utils/convert_utils.h"
34 #include "utils/ms_utils.h"
35 
36 namespace mindspore {
37 namespace parallel {
// NOTE(review): presumably the upper bound on the number of devices; confirm against its users.
// Kept as a preprocessor macro so that any existing '#if' / '#ifdef' checks keep working.
#define MAX_DEVICE_NUM 4096

// Backend name strings. NOTE(review): presumably these are the values accepted for the 'backend'
// argument of InitDevice / DeviceManager::Init; confirm in the implementation file.
constexpr char HCCL_BACKEND[] = "hccl";
constexpr char NCCL_BACKEND[] = "nccl";
constexpr char UNDEFINED_BACKEND[] = "undefined_backend";
43 
class DeviceManager;
using DeviceManagerPtr = std::shared_ptr<DeviceManager>;
// 'g_device_manager' is the globally unique manager to manage the devices.
extern DeviceManagerPtr g_device_manager;

// Initializes the global DeviceManager 'g_device_manager'.
// Arguments: 'device_num', 'global_rank', the 'backend' name and the 'stage' list.
// NOTE(review): the meaning of the bool result (presumably true on success) and the expected
// layout of 'stage' are defined in the implementation file; confirm there.
bool InitDevice(int64_t device_num, int64_t global_rank, const std::string &backend, const std::vector<int64_t> &stage);

// NOTE(review): presumably verifies that 'g_device_manager' has been initialized; confirm in the .cc file.
void CheckGlobalDeviceManager();

// Returns a name derived from 'rank_list_name'.
// NOTE(review): presumably the hash name stored as the value of DeviceManager::rank_to_group_; confirm.
std::string HashName(const std::string &rank_list_name);
56 
57 class DeviceManager {
58   // This class is used to manage the abstract devices, including group-related and stage-related management.
59  public:
DeviceManager()60   DeviceManager() { gm_ = GroupManager(); }
61   ~DeviceManager() = default;
62 
63   Status Init(const RankList &devices, int64_t local_device, const RankList &stage_map, const std::string &backend);
64 
65   static DeviceManager &GetInstance();
66   RankList GetDeviceListByStageId(int64_t stage_id) const;
67   RankList GetDeviceListInThisStage() const;
68 
69   Device CreateNewDeviceByRank(int64_t rank) const;
70   std::vector<Device> CreateDeviceListByRankList(RankList ranks);
71 
72   std::string GenerateGroupNameByRanks(RankList dev_ranks);
73   Group CreateGroup(const std::string &group_name, const std::vector<Device> &devices);
74   Group CreateGroup(const RankList &dev_ranks);
75 
DeviceNum()76   size_t DeviceNum() const { return devices_.size(); }
stage_num()77   int64_t stage_num() const { return stage_num_; }
stage_device_num()78   int64_t stage_device_num() const { return stage_device_num_; }
stage_id()79   int64_t stage_id() const { return stage_id_; }
rank_index_in_stage()80   int64_t rank_index_in_stage() const { return rank_index_in_stage_; }
global_rank()81   int64_t global_rank() const { return global_rank_; }
backend()82   std::string backend() const { return backend_; }
group_manager()83   GroupManager group_manager() const { return gm_; }
set_group_manager(const GroupManager & gm)84   void set_group_manager(const GroupManager &gm) { gm_ = gm; }
85 
86   void Clear();
world_group()87   std::string world_group() const { return gm_.world_group(); }
group_info()88   std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info() const { return gm_.group_info(); }
89   std::string FindRankListNameByHashName(const std::string &hash_name);
90 
91  private:
92   std::vector<std::shared_ptr<Device>> devices_;
93   // each stage has a list of devices
94   std::vector<std::vector<int64_t>> stage_devices_;
95   std::shared_ptr<Device> device_;
96   GroupManager gm_;
97   std::string backend_;
98 
99   // bimap:
100   std::map<std::string, std::string> rank_to_group_;  // the key is rank list, value is hash name
101   std::map<std::string, std::string> group_to_rank_;  // the key is hash name, value is rank list
102 
103   int64_t global_rank_ = 0;          // the real rank in all devices
104   int64_t stage_num_ = 1;            // the stage num
105   int64_t stage_id_ = 0;             // the stage id of the global_rank_
106   int64_t rank_index_in_stage_ = 0;  // the index of this rank in it's stage
107   int64_t stage_device_num_ = 0;     // the device num of one stage
108 };
109 }  // namespace parallel
110 }  // namespace mindspore
111 
112 #endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_DEVICE_MANAGER_H_
113