/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_DEVICE_MANAGER_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_DEVICE_MANAGER_H_

#include <cstdint>
#include <cstring>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "frontend/parallel/device.h"
#include "frontend/parallel/device_matrix.h"
#include "frontend/parallel/group_manager.h"
#include "frontend/parallel/status.h"
#include "frontend/parallel/strategy.h"
#include "utils/convert_utils.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace parallel {
// NOTE(review): presumably the upper bound on the number of devices; it is not referenced
// anywhere in this header, so confirm where the limit is enforced.
#define MAX_DEVICE_NUM 4096

// Backend name strings.
// NOTE(review): presumably the values compared against the 'backend' argument of InitDevice()
// and DeviceManager::Init(); confirm against the implementation.
constexpr char HCCL_BACKEND[] = "hccl";
constexpr char NCCL_BACKEND[] = "nccl";
constexpr char UNDEFINED_BACKEND[] = "undefined_backend";

class DeviceManager;
// Shared-ownership handle to a DeviceManager.
using DeviceManagerPtr = std::shared_ptr<DeviceManager>;
// 'g_device_manager' is the globally unique manager to manage the devices.
47 extern DeviceManagerPtr g_device_manager; 48 49 // This method is used for initializing the global DeviceManager 'g_device_manager', 50 // arguments including 'device_num' and 'global_rank' 51 bool InitDevice(int64_t device_num, int64_t global_rank, const std::string &backend, const std::vector<int64_t> &stage); 52 53 void CheckGlobalDeviceManager(); 54 55 std::string HashName(const std::string &rank_list_name); 56 57 class DeviceManager { 58 // This class is used to manage the abstract devices, including group-related and stage-related management. 59 public: DeviceManager()60 DeviceManager() { gm_ = GroupManager(); } 61 ~DeviceManager() = default; 62 63 Status Init(const RankList &devices, int64_t local_device, const RankList &stage_map, const std::string &backend); 64 65 static DeviceManager &GetInstance(); 66 RankList GetDeviceListByStageId(int64_t stage_id) const; 67 RankList GetDeviceListInThisStage() const; 68 69 Device CreateNewDeviceByRank(int64_t rank) const; 70 std::vector<Device> CreateDeviceListByRankList(RankList ranks); 71 72 std::string GenerateGroupNameByRanks(RankList dev_ranks); 73 Group CreateGroup(const std::string &group_name, const std::vector<Device> &devices); 74 Group CreateGroup(const RankList &dev_ranks); 75 DeviceNum()76 size_t DeviceNum() const { return devices_.size(); } stage_num()77 int64_t stage_num() const { return stage_num_; } stage_device_num()78 int64_t stage_device_num() const { return stage_device_num_; } stage_id()79 int64_t stage_id() const { return stage_id_; } rank_index_in_stage()80 int64_t rank_index_in_stage() const { return rank_index_in_stage_; } global_rank()81 int64_t global_rank() const { return global_rank_; } backend()82 std::string backend() const { return backend_; } group_manager()83 GroupManager group_manager() const { return gm_; } set_group_manager(const GroupManager & gm)84 void set_group_manager(const GroupManager &gm) { gm_ = gm; } 85 86 void Clear(); world_group()87 std::string world_group() const { 
return gm_.world_group(); } group_info()88 std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info() const { return gm_.group_info(); } 89 std::string FindRankListNameByHashName(const std::string &hash_name); 90 91 private: 92 std::vector<std::shared_ptr<Device>> devices_; 93 // each stage has a list of devices 94 std::vector<std::vector<int64_t>> stage_devices_; 95 std::shared_ptr<Device> device_; 96 GroupManager gm_; 97 std::string backend_; 98 99 // bimap: 100 std::map<std::string, std::string> rank_to_group_; // the key is rank list, value is hash name 101 std::map<std::string, std::string> group_to_rank_; // the key is hash name, value is rank list 102 103 int64_t global_rank_ = 0; // the real rank in all devices 104 int64_t stage_num_ = 1; // the stage num 105 int64_t stage_id_ = 0; // the stage id of the global_rank_ 106 int64_t rank_index_in_stage_ = 0; // the index of this rank in it's stage 107 int64_t stage_device_num_ = 0; // the device num of one stage 108 }; 109 } // namespace parallel 110 } // namespace mindspore 111 112 #endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_DEVICE_MANAGER_H_ 113