1 /** 2 * Copyright 2019 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_ 18 #define MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_ 19 20 #include <cstdint> 21 #include <map> 22 #include <string> 23 #include <vector> 24 #include <utility> 25 26 #include "frontend/parallel/device.h" 27 #include "frontend/parallel/status.h" 28 29 namespace mindspore { 30 namespace parallel { 31 constexpr char HCCL_WORLD_GROUP[] = "hccl_world_group"; 32 constexpr char NCCL_WORLD_GROUP[] = "nccl_world_group"; 33 constexpr char UNDEFINED_WORLD_GROUP[] = "undefined_world_group"; 34 35 // Devices that need communication should in the same group. These classes are used to 36 // create and destroy group among devices. 37 class Group { 38 public: 39 Group(); 40 ~Group() = default; 41 Status Init(const std::string &name, const std::vector<Device> &devices); 42 std::vector<Device> GetDevicesList() const; name()43 std::string name() const { return name_; } 44 bool IsInThisGroup(int64_t device_rank); 45 Status GetIndex(size_t *index); 46 Status GetIndexByRank(int64_t rank, size_t *index); GetDevNum()47 size_t GetDevNum() const { return devices_.size(); } 48 49 private: 50 std::string name_; 51 std::vector<Device> devices_; 52 }; 53 54 class GroupManager { 55 public: 56 GroupManager(); 57 ~GroupManager() = default; 58 59 Status CreateGroup(const std::string &group_name, const std::vector<Device> &devices, Group *const group); 60 Status CreateGlobalGroup(const std::string &group_name, const std::vector<Device> &devices, Group *const group); 61 Status DestroyGroup(Group *const group); 62 Status DestroyAllGroups(); 63 Status GetRankID(const std::string &name, uint32_t *const rank_id); 64 Status GetRankSize(const std::string &name, uint32_t *const rank_size); 65 Status FindGroup(const std::string &name, Group **group); world_group()66 std::string world_group() const { return world_group_; } set_world_group(const std::string & name)67 void set_world_group(const std::string &name) { world_group_ = name; } group_info()68 std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info() const { return group_info_; } 69 void Clear(); 70 71 private: 72 bool CreateGroupByExecutor(const std::string &device_name, const std::string &group_name, 73 const std::vector<uint32_t> ranks, uint32_t device_id) const; 74 bool DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name, uint32_t device_id) const; 75 Status DestroyGroup(const std::string &group_name) const; 76 // the key is group name (name_) 77 std::map<std::string, Group> groups_; 78 std::string world_group_; 79 std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info_; 80 }; 81 82 Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info); 83 } // namespace parallel 84 } // namespace mindspore 85 86 #endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_ 87