# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Generate summary events which conform to the proto format."""
import io
import platform
import time

import numpy as np
from PIL import Image

from mindspore import log as logger
from mindspore import context
from mindspore.communication.management import get_rank
from mindspore.communication.management import GlobalComm

from ..._checkparam import Validator
from ..anf_ir_pb2 import DataType, ModelProto
from ..summary_pb2 import Event

# define the MindSpore image format
MS_IMAGE_TENSOR_FORMAT = 'NCHW'
# Set the Event mark
EVENT_FILE_NAME_MARK = ".out.events.summary."
# Set the init event version and mark
EVENT_FILE_INIT_VERSION_MARK = "MindSpore.Event:"
EVENT_FILE_INIT_VERSION = 1

F32_MIN, F32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max


def get_event_file_name(prefix, suffix, time_second):
    """
    Create file name: file_prefix + EVENT_FILE_NAME_MARK + time(seconds) + "." + device_id + "." + hostname + file_suffix.

    Args:
        prefix (str): The prefix of the file name.
        suffix (str): The suffix of the file name.
        time_second (str): The timestamp of the file name.

    Returns:
        String, the name of the event log file.
    """
    Validator.check_str_by_regular(prefix)
    Validator.check_str_by_regular(suffix)
    file_name = ""
    hostname = platform.node()

    device_num = context.get_auto_parallel_context('device_num')
    device_id = context.get_context('device_id')
    if device_num > 1 or GlobalComm.WORLD_COMM_GROUP == 'nccl_world_group':
        # Notice:
        # In the GPU distributed training scenario, get_context('device_id') will not work,
        # so we use get_rank instead of get_context.
        device_id = get_rank()

    file_name = f'{file_name}{EVENT_FILE_NAME_MARK}{time_second}.{device_id}.{hostname}'

    if prefix is not None:
        file_name = prefix + file_name

    if suffix is not None:
        file_name = file_name + suffix

    return file_name
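

# Illustrative sketch (not part of the original module): how the naming scheme above
# composes an event file name. The prefix and suffix values and the sample result in
# the docstring are assumptions for demonstration only; the device id and hostname
# come from the runtime environment.
def _example_event_file_name():
    """Compose a hypothetical file name, e.g. 'events.out.events.summary.1700000000.0.my-host_MS'."""
    return get_event_file_name('events', '_MS', str(int(time.time())))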


def package_init_event():
    """Package the summary init event."""
    init_event = Event()
    init_event.wall_time = time.time()
    version = EVENT_FILE_INIT_VERSION_MARK + str(EVENT_FILE_INIT_VERSION)
    init_event.version = version
    return init_event


def package_graph_event(data):
    """
    Package the summary graph event.

    Args:
        data (Bytes): Graph bytes string.

    Returns:
        Event, event log object.
    """
    graph_event = Event()
    graph_event.wall_time = time.time()
    modelp = ModelProto()
    modelp.ParseFromString(data)
    graph_event.graph_def.CopyFrom(modelp.graph)
    return graph_event


def package_summary_event(data_list, step, wall_time):
    """
    Package the summary data into an event protobuf.

    Args:
        data_list (list): Summary data list.
        step (Number): The record step index.
        wall_time (float): The wall time.

    Returns:
        Event, the summary event.
    """
    # create the event of summary
    summary_event = Event()
    summary = summary_event.summary
    summary_event.wall_time = wall_time
    summary_event.step = int(step)

    for value in data_list:
        summary_type = value["_type"]
        data = value["data"]
        tag = value["name"]

        logger.debug(f"Now process {summary_type} summary, tag = {tag}")

        summary_value = summary.value.add()
        summary_value.tag = tag
        # get the summary type and parse the tag
        if summary_type == 'Scalar':
            if not _fill_scalar_summary(tag, data, summary_value):
                del summary.value[-1]
        elif summary_type == 'Tensor':
            _fill_tensor_summary(tag, data, summary_value.tensor)
        elif summary_type == 'Image':
            if not _fill_image_summary(tag, data, summary_value.image, MS_IMAGE_TENSOR_FORMAT):
                del summary.value[-1]
        elif summary_type == 'Histogram':
            _fill_histogram_summary(tag, data, summary_value.histogram)
        else:
            # The summary type is invalid, skip the data
            logger.error(f"Summary type({summary_type}) is invalid, tag = {tag}")
            del summary.value[-1]

    return summary_event


def _nptype_to_prototype(np_value):
    """
    Transform the numpy type to the proto type.

    Args:
        np_value (Type): Numpy data type.

    Returns:
        Type, proto data type.
    """
    np2pt_tbl = {
        np.bool_: 'DT_BOOL',
        np.int8: 'DT_INT8',
        np.int16: 'DT_INT16',
        np.int32: 'DT_INT32',
        np.int64: 'DT_INT64',
        np.uint8: 'DT_UINT8',
        np.uint16: 'DT_UINT16',
        np.uint32: 'DT_UINT32',
        np.uint64: 'DT_UINT64',
        np.float16: 'DT_FLOAT16',
        np.float: 'DT_FLOAT64',
        np.float32: 'DT_FLOAT32',
        np.float64: 'DT_FLOAT64',
        None: 'DT_UNDEFINED'
    }
    np_type = None
    if np_value is None:
        logger.error("The numpy value is None")
    else:
        np_type = np_value.dtype.type

    proto = np2pt_tbl.get(np_type, None)
    if proto is None:
        raise TypeError("No match for proto data type.")

    return proto


def _fill_scalar_summary(tag: str, np_value, summary):
    """
    Package the scalar summary.

    Args:
        tag (str): Summary tag description.
        np_value (Object): Scalar object.
        summary (Summary.Value): The summary value to fill.

    Returns:
        bool, True if the scalar value is filled successfully.
    """
    logger.debug(f"Set({tag}) the scalar summary value")
    if np_value.size == 1:
        # is scalar
        summary.scalar_value = np_value.item()
        return True
    if np_value.size > 1:
        logger.warning(
            f"The tensor is not a single scalar, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}")
        summary.scalar_value = next(np_value.flat).item()
        return True
    logger.error(f"There are no values inside the tensor, tag = {tag}, size = {np_value.size}")
    return False
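

# Illustrative sketch (not part of the original module): the dictionary layout that
# package_summary_event consumes, i.e. entries keyed by "_type", "data" and "name".
# The tag and value below are arbitrary.
def _example_package_scalar_event():
    """Package a single scalar value into a summary event."""
    data_list = [{'_type': 'Scalar', 'name': 'loss', 'data': np.array(0.25, dtype=np.float32)}]
    return package_summary_event(data_list, step=1, wall_time=time.time())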


def _fill_tensor_summary(tag: str, np_value, summary_tensor):
    """
    Package the tensor summary.

    Args:
        tag (str): Summary tag description.
        np_value (np.ndarray): Summary data.
        summary_tensor (Tensor): The tensor of the summary.

    Returns:
        Tensor, the filled tensor summary content.
    """
    logger.debug(f"Set({tag}) the tensor summary value")
    # get tensor dtype
    tensor_dtype = _nptype_to_prototype(np_value)
    summary_tensor.data_type = DataType.Value(tensor_dtype)

    # get the value list
    tensor_value_list = np_value.reshape(-1).tolist()
    summary_tensor.float_data.extend(tensor_value_list)

    # get the tensor dim
    for v in np_value.shape:
        summary_tensor.dims.append(v)

    return summary_tensor


def _calc_histogram_bins(count):
    """
    Calculates an experience-based optimal number of bins for a histogram.

    There should be enough numbers in each bin, so we calculate the number of bins according to count.
    For very small counts (1-10), we assign carefully chosen numbers. For larger counts, we try to make
    sure there are 9-10 numbers in each bucket on average. Too many bins will slow down performance,
    so we cap the number of bins at 90.

    Args:
        count (int): Valid number count for the tensor.

    Returns:
        int, number of histogram bins.
    """
    max_bins, max_per_bin = 90, 10

    if not count:
        return 1
    if count <= 5:
        return 2
    if count <= 10:
        return 3
    if count <= 880:
        # note that math.ceil(881/10) + 1 equals 90
        return count // max_per_bin + 1

    return max_bins


def _fill_histogram_summary(tag: str, np_value: np.ndarray, summary) -> None:
    """
    Package the histogram summary.

    Args:
        tag (str): Summary tag description.
        np_value (np.ndarray): Summary data.
        summary (summary_pb2.Summary.Histogram): Summary histogram data.
    """
    logger.debug(f"Set({tag}) the histogram summary value")
    # Default bucket for tensor with no valid data.
    ma_value = np.ma.masked_invalid(np_value)
    total, valid = np_value.size, ma_value.count()
    invalids = []
    for isfn in np.isnan, np.isposinf, np.isneginf:
        if total - valid > sum(invalids):
            invalids.append(np.count_nonzero(isfn(np_value)))
        else:
            invalids.append(0)

    summary.count = total
    summary.nan_count, summary.pos_inf_count, summary.neg_inf_count = invalids
    if not valid:
        logger.warning(f'There are no valid values in the ndarray(size={total}, shape={np_value.shape})')
        # summary.{min, max, sum} are 0s by default, no need to explicitly set
    else:
        # BUG: max of a masked array with dtype np.float16 returns inf
        # See numpy issue#15077
        if issubclass(np_value.dtype.type, np.floating):
            summary.min = ma_value.min(fill_value=np.PINF)
            summary.max = ma_value.max(fill_value=np.NINF)
            if summary.min < F32_MIN or summary.max > F32_MAX:
                logger.warning(f'Values({summary.min}, {summary.max}) are too large, '
                               f'you may encounter some undefined behaviours hereafter.')
        else:
            summary.min = ma_value.min()
            summary.max = ma_value.max()
        summary.sum = ma_value.sum(dtype=np.float64)
        _fill_bucket(valid, np_value, summary)
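

# Illustrative sketch (not part of the original module): how the bin heuristic above
# scales with the count of valid values. The sample counts are arbitrary.
def _example_histogram_bins():
    """Return (count, bins) pairs, e.g. 0 -> 1, 7 -> 3, 100 -> 11, 880 -> 89, 10000 -> 90."""
    return [(count, _calc_histogram_bins(count)) for count in (0, 7, 100, 880, 10000)]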


def _fill_bucket(valid, np_value, summary):
    """
    Fill the bucket.

    Args:
        valid (int): The count of valid data.
        np_value (np.ndarray): Summary data.
        summary (summary_pb2.Summary.Histogram): Summary histogram data.
    """
    bins = _calc_histogram_bins(valid)
    first_edge, last_edge = summary.min, summary.max

    if not first_edge < last_edge:
        first_edge -= 0.5
        last_edge += 0.5

    bins = np.linspace(first_edge, last_edge, bins + 1, dtype=np_value.dtype)
    hists, edges = np.histogram(np_value, bins=bins)

    for hist, edge1, edge2 in zip(hists, edges, edges[1:]):
        bucket = summary.buckets.add()
        bucket.width = edge2 - edge1
        bucket.count = hist
        bucket.left = edge1


def _fill_image_summary(tag: str, np_value, summary_image, input_format='NCHW'):
    """
    Package the image summary.

    Args:
        tag (str): Summary tag description.
        np_value (np.ndarray): Summary image data.
        summary_image (summary_pb2.Summary.Image): The image summary to fill.
        input_format (str): The data layout of np_value. Default: 'NCHW'.

    Returns:
        bool, True if the image summary is filled successfully.
    """
    logger.debug(f"Set({tag}) the image summary value")
    if np_value.ndim != 4 or np_value.shape[1] not in (1, 3):
        logger.error(f"The value is not Image, tag = {tag}, ndim = {np_value.ndim}, shape={np_value.shape}")
        return False

    if np_value.ndim != len(input_format):
        logger.error(
            f"The tensor with dim({np_value.ndim}) can't be converted to format({input_format}) because the dims differ")
        return False

    if 0 in np_value.shape:
        logger.error(
            f"The tensor with shape({np_value.shape}) is not a valid image because the shape contains zero.")
        return False

    # convert the tensor format
    tensor = _convert_image_format(np_value, input_format)

    # convert the tensor dtype
    # Do not assume that user passes in values in [0, 255], use data type to detect
    scale_factor = 1
    if tensor.dtype == np.uint8:
        scale_factor = 1
    elif np.max(tensor) <= 1 and np.min(tensor) >= 0:
        scale_factor = 255
    tensor = tensor.astype(np.float32)
    tensor = (tensor * scale_factor).astype(np.uint8)

    # create the image summary
    height, width, channel, image_string = _make_image(tensor)
    summary_image.height = height
    summary_image.width = width
    summary_image.colorspace = channel
    summary_image.encoded_image = image_string
    return True
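

# Illustrative sketch (not part of the original module): filling an image summary from
# an arbitrary NCHW float batch. Because the values lie in [0, 1], the scaling logic
# above multiplies them by 255 before encoding; the tag and shape are arbitrary.
def _example_image_summary():
    """Fill an image summary for a random batch of four 3x8x8 images."""
    event = Event()
    value = event.summary.value.add()
    value.tag = 'input_images'
    batch = np.random.rand(4, 3, 8, 8).astype(np.float32)  # NCHW, values in [0, 1]
    _fill_image_summary(value.tag, batch, value.image, MS_IMAGE_TENSOR_FORMAT)
    return event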


def _make_image(tensor, rescale=1):
    """
    Convert a numpy representation of an image to an Image protobuf.

    Args:
        tensor (np.ndarray): The image data.
        rescale (Number): The rescale value. Default: 1.

    Returns:
        (Number, Number, Number, Bytes), return the height, width, channel and encoded image string.
    """
    height, width, channel = tensor.shape
    scaled_height = int(height * rescale)
    scaled_width = int(width * rescale)
    image = Image.fromarray(tensor)
    image = image.resize((scaled_width, scaled_height), Image.ANTIALIAS)
    output = io.BytesIO()
    image.save(output, format='PNG')
    image_string = output.getvalue()
    output.close()
    return height, width, channel, image_string


def _convert_image_format(np_tensor, input_format, out_format='HWC'):
    """
    Convert the image format.

    Args:
        np_tensor (np.ndarray): The image data.
        input_format (str): The input data format.
        out_format (str): The output data format. Default: 'HWC'.

    Returns:
        np.ndarray, the image in the output format.
    """
    input_format = input_format.upper()

    # convert to NCHW
    if input_format != 'NCHW':
        index = [input_format.find(c) for c in 'NCHW']
        tensor_nchw = np_tensor.transpose(index)
    else:
        tensor_nchw = np_tensor

    # make grid to expand N
    tensor_chw = _make_canvas_for_imgs(tensor_nchw)

    # convert to out format
    out_index = ['CHW'.find(c) for c in out_format]
    out_tensor = tensor_chw.transpose(out_index)
    return out_tensor


def _make_canvas_for_imgs(tensor, col_imgs=8):
    """
    Expand the N dimension and lay the images out on one canvas.

    Args:
        tensor (np.ndarray): The canvas value.
        col_imgs (Number): The number of image columns. Default: 8.

    Returns:
        np.ndarray, the canvas of the images.
    """
    # expand the N1HW to N3HW
    if tensor.shape[1] == 1:
        tensor = np.concatenate([tensor, tensor, tensor], 1)

    # expand the N
    n = tensor.shape[0]
    h = tensor.shape[2]
    w = tensor.shape[3]
    cols = min(n, col_imgs)
    rows = int(np.ceil(float(n) / cols))

    # create the canvas: expand the n
    out_canvas = np.zeros((3, h * rows, w * cols))
    i = 0
    for y in range(rows):
        for x in range(cols):
            if i >= n:
                break
            out_canvas[:, y * h:(y + 1) * h, x * w:(x + 1) * w] = tensor[i]
            i = i + 1
    return out_canvas
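

# Illustrative sketch (not part of the original module): the grid layout produced by
# _make_canvas_for_imgs for an arbitrary batch of ten 1x4x4 grayscale images. With the
# default col_imgs=8 this gives a 2x8 grid, i.e. a canvas of shape (3, 2 * 4, 8 * 4).
def _example_canvas_layout():
    """Return the canvas shape for ten grayscale images, i.e. (3, 8, 32)."""
    batch = np.zeros((10, 1, 4, 4))
    return _make_canvas_for_imgs(batch).shape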