1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <cmath>
18 #include <algorithm>
19 #include <limits>
20 #include <memory>
21 #include <bitset>
22 #include <tuple>
23 #include <type_traits>
24 #include "debug/debugger/tensor_summary.h"
25
26 #ifdef OFFLINE_DBG_MODE
27 #include "base/float16.h"
28 #endif
29
30 #ifdef ONLINE_DBG_MODE
31 namespace mindspore {
32 #endif
33 using CONDITION_TYPE = DebugServices::CONDITION_TYPE;
34
RangeCountCalculator()35 RangeCountCalculator::RangeCountCalculator()
36 : range_start_inclusive(-std::numeric_limits<double>::infinity()),
37 range_end_inclusive(std::numeric_limits<double>::infinity()),
38 count(0),
39 total(0) {}
40
ProcessElement(double element)41 void RangeCountCalculator::ProcessElement(double element) {
42 count += (element >= range_start_inclusive && element <= range_end_inclusive);
43 total += 1;
44 }
45
GetPercentInRange() const46 double RangeCountCalculator::GetPercentInRange() const {
47 if (total == 0) {
48 return 0.0;
49 }
50 const double factor = 100.0;
51 return factor * count / total;
52 }
53
AllCloseCalculator()54 AllCloseCalculator::AllCloseCalculator() : atol(1.0e-8), rtol(1.0e-5), result(true) {}
55
ProcessElement(double current,double previous)56 void AllCloseCalculator::ProcessElement(double current, double previous) {
57 result = result && (std::abs(current - previous) <= (atol + rtol * std::abs(previous)));
58 }
59
IsAllClose() const60 bool AllCloseCalculator::IsAllClose() const { return result; }
61
MeanCalculator()62 MeanCalculator::MeanCalculator() : mean(0.0), count(0) {}
63
ProcessElement(double value)64 void MeanCalculator::ProcessElement(double value) {
65 count += 1;
66 double delta = value - mean;
67 mean += delta / count;
68 }
69
GetMean() const70 double MeanCalculator::GetMean() const { return mean; }
71
VarianceAndMeanCalculator()72 VarianceAndMeanCalculator::VarianceAndMeanCalculator() : mean(0.0), count(0), m2(0.0) {}
73
ProcessElement(double value)74 void VarianceAndMeanCalculator::ProcessElement(double value) {
75 count += 1;
76 double delta = value - mean;
77 mean += delta / count;
78 m2 += delta * (value - mean);
79 }
80
GetMean() const81 double VarianceAndMeanCalculator::GetMean() const { return mean; }
82
GetVariance() const83 double VarianceAndMeanCalculator::GetVariance() const {
84 if (count > 1) {
85 return m2 / (count - 1);
86 }
87 return 0.0;
88 }
89
GetStandardDeviation()90 double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVariance()); }
91
92 template <typename T>
TensorSummary(const void * current_tensor_ptr,const void * const previous_tensor_ptr,uint32_t num_elements,uint32_t prev_num_elements)93 TensorSummary<T>::TensorSummary(const void *current_tensor_ptr, const void *const previous_tensor_ptr,
94 uint32_t num_elements, uint32_t prev_num_elements)
95 : current_tensor_ptr_(reinterpret_cast<const T *>(current_tensor_ptr)),
96 prev_tensor_ptr_(reinterpret_cast<const T *>(previous_tensor_ptr)),
97 num_elements_(num_elements),
98 prev_num_elements_(prev_num_elements),
99 min_(std::numeric_limits<double>::max()),
100 max_(std::numeric_limits<double>::lowest()),
101 avg_(0.0),
102 is_bool_(false),
103 neg_zero_count_(0),
104 pos_zero_count_(0),
105 pos_inf_count_(0),
106 neg_inf_count_(0),
107 inf_count_(0),
108 nan_count_(0),
109 zero_count_(0),
110 epsilon_(1.0e-9),
111 mean_sd_cal_enabled_(false) {}
112
113 template <typename T>
SummarizeTensor(const std::vector<DebugServices::watchpoint_t> & wps)114 void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &wps) {
115 InitCalculators(wps);
116 for (size_t i = 0; i < num_elements_; ++i) {
117 auto current_value = static_cast<double>(current_tensor_ptr_[i]);
118 double previous_value = std::numeric_limits<double>::quiet_NaN();
119 if (prev_tensor_ptr_) {
120 if (num_elements_ == prev_num_elements_) {
121 previous_value = static_cast<double>(prev_tensor_ptr_[i]);
122 } else {
123 MS_LOG(DEBUG) << "Current and previous tensor are not the same size.";
124 }
125 }
126 if (std::isinf(current_value)) {
127 inf_count_ += 1;
128 }
129 if (std::isnan(current_value)) {
130 nan_count_ += 1;
131 }
132 if (current_value == 0) {
133 zero_count_ += 1;
134 }
135 max_ = std::max(max_, current_value);
136 min_ = std::min(min_, current_value);
137 if (mean_sd_cal_enabled_) {
138 current_mean_variance_.ProcessElement(current_value);
139 }
140 for (auto &it : all_close_) {
141 it.second->ProcessElement(current_value, previous_value);
142 }
143 for (auto &range_count : range_counts_) {
144 range_count.second->ProcessElement(current_value);
145 }
146 for (auto &mean : means_) {
147 if (mean.first.compare("curr_prev_diff_mean") == 0) {
148 mean.second->ProcessElement(std::abs(current_value - previous_value));
149 } else if (mean.first.compare("abs_prev_mean") == 0) {
150 mean.second->ProcessElement(std::abs(previous_value));
151 } else if (mean.first.compare("abs_current_mean") == 0) {
152 mean.second->ProcessElement(std::abs(current_value));
153 }
154 }
155 }
156 }
157
158 template <typename T>
TensorStatistics(DbgDataType dtype_value)159 void TensorSummary<T>::TensorStatistics(DbgDataType dtype_value) {
160 if (dtype_value == DT_BOOL) {
161 is_bool_ = true;
162 }
163 double sum_elements = 0.0;
164 for (size_t i = 0; i < num_elements_; ++i) {
165 auto current_value = static_cast<double>(current_tensor_ptr_[i]);
166 if (std::isinf(current_value)) {
167 if (current_value > 0) {
168 pos_inf_count_ += 1;
169 } else {
170 neg_inf_count_ += 1;
171 }
172 }
173 if (current_value == 0) {
174 zero_count_ += 1;
175 }
176 if (std::isnan(current_value)) {
177 nan_count_ += 1;
178 }
179 if (!(std::isnan(current_value) || std::isinf(current_value))) {
180 // only considering tensor elements with value
181 if (std::signbit(current_value) && !(current_value == 0)) {
182 neg_zero_count_ += 1;
183 } else if (!(current_value == 0)) {
184 pos_zero_count_ += 1;
185 }
186 max_ = std::max(max_, current_value);
187 min_ = std::min(min_, current_value);
188 sum_elements += current_value;
189 }
190 }
191 unsigned int value_count = zero_count_ + neg_zero_count_ + pos_zero_count_;
192 avg_ = sum_elements / value_count;
193 }
194
195 template <typename T>
IsWatchpointHit(DebugServices::watchpoint_t wp)196 std::tuple<bool, int, std::vector<DebugServices::parameter_t>> TensorSummary<T>::IsWatchpointHit(
197 DebugServices::watchpoint_t wp) {
198 auto parameter_list = wp.parameter_list;
199 bool hit = false;
200 const uint8_t bit_size = 32;
201 std::bitset<bit_size> error_code;
202 CONDITION_TYPE type = wp.condition.type;
203 // bit 0 denotes presence of nan
204 (void)error_code.set(0, nan_count_ > 0);
205 // bit 1 denotes presence of inf
206 (void)error_code.set(1, inf_count_ > 0);
207
208 if (type == CONDITION_TYPE::HAS_NAN) {
209 error_code.reset();
210 hit = nan_count_ > 0;
211 } else if (type == CONDITION_TYPE::HAS_INF) {
212 error_code.reset();
213 hit = inf_count_ > 0;
214 } else if (type == CONDITION_TYPE::GENERAL_OVERFLOW) {
215 error_code.reset();
216 hit = (nan_count_ + inf_count_) > 0;
217 } else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr_ && error_code.none()) {
218 hit = all_close_[wp.id]->IsAllClose();
219 } else if ((type == CONDITION_TYPE::NOT_CHANGED || type == CONDITION_TYPE::CHANGE_TOO_LARGE ||
220 type == CONDITION_TYPE::CHANGE_TOO_SMALL) &&
221 !prev_tensor_ptr_) {
222 // bit 2 denotes absence of previous tensor
223 error_code.set(2, true);
224 }
225
226 if (error_code.none()) {
227 for (auto ¶meter : parameter_list) {
228 if (parameter.disabled || error_code.any()) {
229 continue;
230 }
231 // extract inequality type from watchpoint for backward compatibility
232 std::string inequality_type;
233 if (wp.is_gt_wp()) {
234 inequality_type = "gt";
235 } else if (wp.is_lt_wp()) {
236 inequality_type = "lt";
237 }
238 parameter.Evaluate(StatLookup(parameter.name, wp), inequality_type);
239 hit = hit || parameter.hit;
240 }
241 }
242 return std::make_tuple(hit, static_cast<int32_t>(error_code.to_ulong()), parameter_list);
243 }
244
245 template <typename T>
StatLookup(const std::string & parameter_name,const DebugServices::watchpoint_t & wp)246 double_t TensorSummary<T>::StatLookup(const std::string ¶meter_name, const DebugServices::watchpoint_t &wp) {
247 if (parameter_name == "param") return StatLookup(wp);
248 std::string param_type;
249 auto pos = parameter_name.find_last_of('_');
250 if (pos != std::string::npos) {
251 param_type = parameter_name.substr(0, pos);
252 }
253
254 if (param_type == "max") {
255 return max_;
256 }
257 if (param_type == "min") {
258 return min_;
259 }
260 if (param_type == "max_min") {
261 return max_ - min_;
262 }
263 if (param_type == "mean") {
264 return current_mean_variance_.GetMean();
265 }
266 if (param_type == "sd") {
267 return current_mean_variance_.GetStandardDeviation();
268 }
269 if (param_type == "abs_mean") {
270 if (means_.find("abs_current_mean") != means_.end()) {
271 return means_["abs_current_mean"]->GetMean();
272 }
273 }
274 if (param_type == "abs_mean_update_ratio" && prev_tensor_ptr_) {
275 if (means_.find("curr_prev_diff_mean") != means_.end() && means_.find("abs_prev_mean") != means_.end()) {
276 return means_["curr_prev_diff_mean"]->GetMean() / (means_["abs_prev_mean"]->GetMean() + epsilon_);
277 }
278 }
279 if (param_type == "range_percentage") {
280 if (range_counts_.find(wp.id) != range_counts_.end()) {
281 return range_counts_[wp.id]->GetPercentInRange();
282 }
283 }
284 if (param_type == "zero_percentage") {
285 return GetZeroValPercent();
286 }
287 return std::numeric_limits<double_t>::quiet_NaN();
288 }
289
290 template <typename T>
StatLookup(const DebugServices::watchpoint_t & wp)291 double_t TensorSummary<T>::StatLookup(const DebugServices::watchpoint_t &wp) {
292 CONDITION_TYPE type = wp.condition.type;
293 if (type == CONDITION_TYPE::MAX_LT || type == CONDITION_TYPE::MAX_GT) {
294 return max_;
295 }
296 if (type == CONDITION_TYPE::MIN_LT || type == CONDITION_TYPE::MIN_GT) {
297 return min_;
298 }
299 if (type == CONDITION_TYPE::MEAN_LT || type == CONDITION_TYPE::MEAN_GT) {
300 return current_mean_variance_.GetMean();
301 }
302 if (type == CONDITION_TYPE::SD_LT || type == CONDITION_TYPE::SD_GT) {
303 return current_mean_variance_.GetStandardDeviation();
304 }
305 if (type == CONDITION_TYPE::MAX_MIN_GT || type == CONDITION_TYPE::MAX_MIN_LT) {
306 return max_ - min_;
307 }
308 return std::numeric_limits<double_t>::quiet_NaN();
309 }
310
311 template <typename T>
GetZeroValPercent()312 double_t TensorSummary<T>::GetZeroValPercent() {
313 if (num_elements_ == 0) {
314 return 0;
315 }
316
317 return (zero_count_ * 100.0) / num_elements_;
318 }
319
320 template <typename T>
InitCalculators(const std::vector<DebugServices::watchpoint_t> & wps)321 void TensorSummary<T>::InitCalculators(const std::vector<DebugServices::watchpoint_t> &wps) {
322 for (auto &wp : wps) {
323 auto wp_id = wp.id;
324 mean_sd_cal_enabled_ = mean_sd_cal_enabled_ || wp.mean_sd_enabled();
325 if (wp.allclose_enabled() && prev_tensor_ptr_) {
326 all_close_[wp_id] = std::make_unique<AllCloseCalculator>();
327 if (!wp.parameter_list[0].disabled) {
328 all_close_[wp_id]->set_atol(wp.parameter_list[0].value);
329 }
330 if (!wp.parameter_list[1].disabled) {
331 all_close_[wp_id]->set_rtol(wp.parameter_list[1].value);
332 }
333 } else if (wp.range_enabled()) {
334 range_counts_[wp_id] = std::make_unique<RangeCountCalculator>();
335 if (!wp.parameter_list[0].disabled) {
336 range_counts_[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value);
337 }
338 if (!wp.parameter_list[1].disabled) {
339 range_counts_[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
340 }
341 } else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr_) {
342 (void)means_.emplace("curr_prev_diff_mean", std::make_unique<MeanCalculator>());
343 (void)means_.emplace("abs_prev_mean", std::make_unique<MeanCalculator>());
344 } else if (wp.abs_mean_enabled()) {
345 (void)means_.emplace("abs_current_mean", std::make_unique<MeanCalculator>());
346 }
347 }
348 }
// Explicit instantiations: the member templates above are defined in this source
// file, so TensorSummary is instantiated here for each element type listed below.
template class TensorSummary<uint8_t>;
template class TensorSummary<int8_t>;
template class TensorSummary<uint16_t>;
template class TensorSummary<int16_t>;
template class TensorSummary<uint32_t>;
template class TensorSummary<int32_t>;
template class TensorSummary<uint64_t>;
template class TensorSummary<int64_t>;
template class TensorSummary<float16>;
template class TensorSummary<float>;
template class TensorSummary<double>;
template class TensorSummary<bool>;
361 #ifdef ONLINE_DBG_MODE
362 } // namespace mindspore
363 #endif
364