• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_
18 #define MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_
19 
20 #include <cstdint>
21 #include <map>
22 #include <string>
23 #include <vector>
24 #include <utility>
25 
26 #include "frontend/parallel/device.h"
27 #include "frontend/parallel/status.h"
28 
29 namespace mindspore {
30 namespace parallel {
// Names of the world communication group. The HCCL/NCCL prefixes presumably correspond to the
// communication backend in use; UNDEFINED_WORLD_GROUP is the value for when neither applies (confirm at use sites).
constexpr char HCCL_WORLD_GROUP[]{"hccl_world_group"};
constexpr char NCCL_WORLD_GROUP[]{"nccl_world_group"};
constexpr char UNDEFINED_WORLD_GROUP[]{"undefined_world_group"};
34 
// Devices that need to communicate should be in the same group. These classes are used to
// create and destroy groups among devices.
37 class Group {
38  public:
39   Group();
40   ~Group() = default;
41   Status Init(const std::string &name, const std::vector<Device> &devices);
42   std::vector<Device> GetDevicesList() const;
name()43   std::string name() const { return name_; }
44   bool IsInThisGroup(int64_t device_rank);
45   Status GetIndex(size_t *index);
GetDevNum()46   size_t GetDevNum() const { return devices_.size(); }
47 
48  private:
49   std::string name_;
50   std::vector<Device> devices_;
51 };
52 
53 class GroupManager {
54  public:
55   GroupManager();
56   ~GroupManager() = default;
57 
58   Status CreateGroup(const std::string &name, const std::vector<Device> &devices, Group *group);
59   Status DestroyGroup(Group *group);
60   Status DestroyAllGroups();
61   Status GetRankID(const std::string &name, uint32_t *rank_id);
62   Status GetRankSize(const std::string &name, uint32_t *rank_size);
63   Status FindGroup(const std::string &name, Group **group);
world_group()64   std::string world_group() const { return world_group_; }
set_world_group(const std::string & name)65   void set_world_group(const std::string &name) { world_group_ = name; }
group_info()66   std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info() const { return group_info_; }
67   void Clear();
68 
69  private:
70   bool CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
71                              const std::vector<uint32_t> ranks, uint32_t device_id);
72   bool DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name, uint32_t device_id);
73   Status DestroyGroup(const std::string &group_name);
74   // the key is group name (name_)
75   std::map<std::string, Group> groups_;
76   std::string world_group_;
77   std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info_;
78 };
79 
80 Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info);
81 }  // namespace parallel
82 }  // namespace mindspore
83 
84 #endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_GROUP_MANAGER_H_
85