/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_

#include <cstdint>
#include <map>
#include <string>
#include <vector>
#include <utility>

#include "frontend/parallel/device.h"
#include "frontend/parallel/status.h"

namespace mindspore {
namespace parallel {
constexpr char HCCL_WORLD_GROUP[] = "hccl_world_group";
constexpr char NCCL_WORLD_GROUP[] = "nccl_world_group";
constexpr char UNDEFINED_WORLD_GROUP[] = "undefined_world_group";

// Devices that need to communicate should be in the same group. These classes are used to
// create and destroy groups among devices.
37 class Group { 38 public: 39 Group(); 40 ~Group() = default; 41 Status Init(const std::string &name, const std::vector<Device> &devices); 42 std::vector<Device> GetDevicesList() const; name()43 std::string name() const { return name_; } 44 bool IsInThisGroup(int64_t device_rank); 45 Status GetIndex(size_t *index); GetDevNum()46 size_t GetDevNum() const { return devices_.size(); } 47 48 private: 49 std::string name_; 50 std::vector<Device> devices_; 51 }; 52 53 class GroupManager { 54 public: 55 GroupManager(); 56 ~GroupManager() = default; 57 58 Status CreateGroup(const std::string &name, const std::vector<Device> &devices, Group *group); 59 Status DestroyGroup(Group *group); 60 Status DestroyAllGroups(); 61 Status GetRankID(const std::string &name, uint32_t *rank_id); 62 Status GetRankSize(const std::string &name, uint32_t *rank_size); 63 Status FindGroup(const std::string &name, Group **group); world_group()64 std::string world_group() const { return world_group_; } set_world_group(const std::string & name)65 void set_world_group(const std::string &name) { world_group_ = name; } group_info()66 std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info() const { return group_info_; } 67 void Clear(); 68 69 private: 70 bool CreateGroupByExecutor(const std::string &device_name, const std::string &group_name, 71 const std::vector<uint32_t> ranks, uint32_t device_id); 72 bool DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name, uint32_t device_id); 73 Status DestroyGroup(const std::string &group_name); 74 // the key is group name (name_) 75 std::map<std::string, Group> groups_; 76 std::string world_group_; 77 std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info_; 78 }; 79 80 Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info); 81 } // namespace parallel 82 } // namespace mindspore 83 84 #endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_ 85