/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_
18 #define MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_
19 
20 #include <cstdint>
21 #include <map>
22 #include <string>
23 #include <vector>
24 #include <utility>
25 
26 #include "frontend/parallel/device.h"
27 #include "frontend/parallel/status.h"
28 
29 namespace mindspore {
30 namespace parallel {
// Names used for the world communication group. Three spellings are provided: an "hccl"
// one, an "nccl" one, and a placeholder for the undefined case. Which one is selected at
// runtime is decided outside this header.
constexpr char HCCL_WORLD_GROUP[] = "hccl_world_group";
constexpr char NCCL_WORLD_GROUP[] = "nccl_world_group";
constexpr char UNDEFINED_WORLD_GROUP[] = "undefined_world_group";
34 
// Devices that need to communicate with each other must be in the same group. These classes
// are used to create and destroy groups among devices.
37 class Group {
38  public:
39   Group();
40   ~Group() = default;
41   Status Init(const std::string &name, const std::vector<Device> &devices);
42   std::vector<Device> GetDevicesList() const;
name()43   std::string name() const { return name_; }
44   bool IsInThisGroup(int64_t device_rank);
45   Status GetIndex(size_t *index);
46   Status GetIndexByRank(int64_t rank, size_t *index);
GetDevNum()47   size_t GetDevNum() const { return devices_.size(); }
48 
49  private:
50   std::string name_;
51   std::vector<Device> devices_;
52 };
53 
54 class GroupManager {
55  public:
56   GroupManager();
57   ~GroupManager() = default;
58 
59   Status CreateGroup(const std::string &group_name, const std::vector<Device> &devices, Group *const group);
60   Status CreateGlobalGroup(const std::string &group_name, const std::vector<Device> &devices, Group *const group);
61   Status DestroyGroup(Group *const group);
62   Status DestroyAllGroups();
63   Status GetRankID(const std::string &name, uint32_t *const rank_id);
64   Status GetRankSize(const std::string &name, uint32_t *const rank_size);
65   Status FindGroup(const std::string &name, Group **group);
world_group()66   std::string world_group() const { return world_group_; }
set_world_group(const std::string & name)67   void set_world_group(const std::string &name) { world_group_ = name; }
group_info()68   std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info() const { return group_info_; }
69   void Clear();
70 
71  private:
72   bool CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
73                              const std::vector<uint32_t> ranks, uint32_t device_id) const;
74   bool DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name, uint32_t device_id) const;
75   Status DestroyGroup(const std::string &group_name) const;
76   // the key is group name (name_)
77   std::map<std::string, Group> groups_;
78   std::string world_group_;
79   std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info_;
80 };
81 
82 Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info);
83 }  // namespace parallel
84 }  // namespace mindspore
85 
86 #endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_
87