1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef MINDSPORE_CCSRC_FL_SERVER_ITERATION_METRICS_H_
18 #define MINDSPORE_CCSRC_FL_SERVER_ITERATION_METRICS_H_
19
20 #include <map>
21 #include <string>
22 #include <memory>
23 #include <fstream>
24 #include "ps/ps_context.h"
25 #include "ps/core/configuration.h"
26 #include "ps/core/file_configuration.h"
27 #include "fl/server/local_meta_store.h"
28 #include "fl/server/iteration.h"
29
30 namespace mindspore {
31 namespace fl {
32 namespace server {
// Names of the individual metric fields.
// NOTE(review): presumably used as the keys of the metrics JSON object written for each iteration —
// confirm against the implementation file.
constexpr const char *kFLName = "flName";
constexpr const char *kInstanceStatus = "instanceStatus";
constexpr const char *kFLIterationNum = "flIterationNum";
constexpr const char *kCurIteration = "currentIteration";
constexpr const char *kJoinedClientNum = "joinedClientNum";
constexpr const char *kRejectedClientNum = "rejectedClientNum";
constexpr const char *kMetricsAuc = "metricsAuc";
constexpr const char *kMetricsLoss = "metricsLoss";
constexpr const char *kIterExecutionTime = "iterationExecutionTime";
42
43 const std::map<InstanceState, std::string> kInstanceStateName = {
44 {InstanceState::kRunning, "running"}, {InstanceState::kDisable, "disable"}, {InstanceState::kFinish, "finish"}};
45
46 template <typename T>
JsonGetKeyWithException(const nlohmann::json & json,const std::string & key)47 inline T JsonGetKeyWithException(const nlohmann::json &json, const std::string &key) {
48 if (!json.contains(key)) {
49 MS_LOG(EXCEPTION) << "The key " << key << "does not exist in json " << json.dump();
50 }
51 return json[key].get<T>();
52 }
53
// The literal name "metrics".
// NOTE(review): presumably the configuration key under which the metrics settings are stored — confirm in the .cc.
constexpr const char *kMetrics = "metrics";
55
56 class IterationMetrics {
57 public:
IterationMetrics(const std::string & config_file)58 explicit IterationMetrics(const std::string &config_file)
59 : config_file_path_(config_file),
60 config_(nullptr),
61 fl_name_(""),
62 fl_iteration_num_(0),
63 cur_iteration_num_(0),
64 instance_state_(InstanceState::kFinish),
65 loss_(0.0),
66 accuracy_(0.0),
67 joined_client_num_(0),
68 rejected_client_num_(0),
69 iteration_time_cost_(0) {}
70 ~IterationMetrics() = default;
71
72 bool Initialize();
73
74 // Gather the details of this iteration and output to the persistent storage.
75 bool Summarize();
76
77 // Clear data in persistent storage.
78 bool Clear();
79
80 // Setters for the metrics data.
81 void set_fl_name(const std::string &fl_name);
82 void set_fl_iteration_num(size_t fl_iteration_num);
83 void set_cur_iteration_num(size_t cur_iteration_num);
84 void set_instance_state(InstanceState state);
85 void set_loss(float loss);
86 void set_accuracy(float acc);
87 void set_joined_client_num(size_t joined_client_num);
88 void set_rejected_client_num(size_t rejected_client_num);
89 void set_iteration_time_cost(uint64_t iteration_time_cost);
90
91 private:
92 // This is the main config file set by ps context.
93 std::string config_file_path_;
94 std::unique_ptr<ps::core::FileConfiguration> config_;
95
96 // The metrics file object.
97 std::fstream metrics_file_;
98
99 // The metrics file path.
100 std::string metrics_file_path_;
101
102 // Json object of metrics data.
103 nlohmann::basic_json<std::map, std::vector, std::string, bool, int64_t, uint64_t, float> js_;
104
105 // The federated learning job name. Set by ps_context.
106 std::string fl_name_;
107
108 // Federated learning iteration number. Set by ps_context.
109 // If this number of iterations are completed, one instance is finished.
110 size_t fl_iteration_num_;
111
112 // Current iteration number.
113 size_t cur_iteration_num_;
114
115 // Current instance state.
116 InstanceState instance_state_;
117
118 // The training loss after this federated learning iteration, passed by worker.
119 float loss_;
120
121 // The evaluation result after this federated learning iteration, passed by worker.
122 float accuracy_;
123
124 // The number of clients which join the federated aggregation.
125 size_t joined_client_num_;
126
127 // The number of clients which are not involved in federated aggregation.
128 size_t rejected_client_num_;
129
130 // The time cost in millisecond for this completed iteration.
131 uint64_t iteration_time_cost_;
132 };
133 } // namespace server
134 } // namespace fl
135 } // namespace mindspore
136 #endif // MINDSPORE_CCSRC_FL_SERVER_ITERATION_METRICS_H_
137