1# Copyright 2020-2021 Huawei Technologies Co., Ltd 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================ 15"""Generate the summary event which conform to proto format.""" 16from __future__ import absolute_import 17from __future__ import division 18 19import io 20import platform 21import time 22 23import numpy as np 24from PIL import Image 25 26from mindspore import log as logger 27from mindspore import context 28from mindspore.communication.management import get_rank 29from mindspore.communication.management import GlobalComm 30 31from mindspore import _checkparam as Validator 32from mindspore.train.anf_ir_pb2 import DataType, ModelProto 33from mindspore.train.summary_pb2 import Event 34 35# define the MindSpore image format 36MS_IMAGE_TENSOR_FORMAT = 'NCHW' 37# Set the Event mark 38EVENT_FILE_NAME_MARK = ".out.events.summary." 39# Set the init event of version and mark 40EVENT_FILE_INIT_VERSION_MARK = "MindSpore.Event:" 41EVENT_FILE_INIT_VERSION = 1 42 43F32_MIN, F32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max 44 45 46def get_event_file_name(prefix, suffix, time_second): 47 """ 48 Create file name: file_prefix + EVENT_FILE_NAME_MARK + time(seconds) + "." + Hostname + file_suffix. 49 50 Args: 51 prefix (str): The prefix of file name. 52 suffix (str): The suffix of file name. 53 time_second (str): The time stamp of file name. 54 55 Returns: 56 String, the name of event log file. 57 """ 58 Validator.check_str_by_regular(prefix) 59 Validator.check_str_by_regular(suffix) 60 file_name = "" 61 hostname = platform.node() 62 63 device_num = context.get_auto_parallel_context('device_num') 64 device_id = context.get_context('device_id') 65 if device_num > 1 or GlobalComm.WORLD_COMM_GROUP == 'nccl_world_group': 66 # Notice: 67 # In GPU distribute training scene, get_context('device_id') will not work, 68 # so we use get_rank instead of get_context. 69 device_id = get_rank() 70 71 file_name = f'{file_name}{EVENT_FILE_NAME_MARK}{time_second}.{device_id}.{hostname}' 72 73 if prefix is not None: 74 file_name = prefix + file_name 75 76 if suffix is not None: 77 file_name = file_name + suffix 78 79 return file_name 80 81 82def package_init_event(): 83 """Package the summary init event.""" 84 init_event = Event() 85 init_event.wall_time = time.time() 86 version = EVENT_FILE_INIT_VERSION_MARK + str(EVENT_FILE_INIT_VERSION) 87 init_event.version = version 88 return init_event 89 90 91def package_graph_event(data): 92 """ 93 Package the summary graph event. 94 95 Args: 96 data (Bytes): Graph bytes string. 97 98 Returns: 99 Event, event log object. 100 """ 101 graph_event = Event() 102 graph_event.wall_time = time.time() 103 modelp = ModelProto() 104 modelp.ParseFromString(data) 105 graph_event.graph_def.CopyFrom(modelp.graph) 106 return graph_event 107 108 109def package_summary_event(data_list, step, wall_time): 110 """ 111 Package the summary to event protobuffer. 112 113 Args: 114 data_list (list): Summary data list. 115 step (Number): The recode step index. 116 wall_time (float): The wall time. 117 118 Returns: 119 Summary, the summary event. 120 """ 121 # create the event of summary 122 summary_event = Event() 123 summary = summary_event.summary 124 summary_event.wall_time = wall_time 125 summary_event.step = int(step) 126 127 for value in data_list: 128 summary_type = value["_type"] 129 data = value["data"] 130 tag = value["name"] 131 132 logger.debug(f"Now process {summary_type} summary, tag = {tag}") 133 134 summary_value = summary.value.add() 135 summary_value.tag = tag 136 # get the summary type and parse the tag 137 if summary_type == 'Scalar': 138 if not _fill_scalar_summary(tag, data, summary_value): 139 del summary.value[-1] 140 elif summary_type == 'Tensor': 141 _fill_tensor_summary(tag, data, summary_value.tensor) 142 elif summary_type == 'Image': 143 if not _fill_image_summary(tag, data, summary_value.image, MS_IMAGE_TENSOR_FORMAT): 144 del summary.value[-1] 145 elif summary_type == 'Histogram': 146 _fill_histogram_summary(tag, data, summary_value.histogram) 147 elif summary_type == 'Landscape': 148 summary_value.loss_landscape.ParseFromString(data) 149 else: 150 # The data is invalid ,jump the data 151 logger.error(f"Summary type({summary_type}) is error, tag = {tag}") 152 del summary.value[-1] 153 154 return summary_event 155 156 157def _nptype_to_prototype(np_value): 158 """ 159 Transform the np type to proto type. 160 161 Args: 162 np_value (Type): Numpy data type. 163 164 Returns: 165 Type, proto data type. 166 """ 167 np2pt_tbl = { 168 np.bool_: 'DT_BOOL', 169 np.int8: 'DT_INT8', 170 np.int16: 'DT_INT16', 171 np.int32: 'DT_INT32', 172 np.int64: 'DT_INT64', 173 np.uint8: 'DT_UINT8', 174 np.uint16: 'DT_UINT16', 175 np.uint32: 'DT_UINT32', 176 np.uint64: 'DT_UINT64', 177 np.float16: 'DT_FLOAT16', 178 float: 'DT_FLOAT64', 179 np.float32: 'DT_FLOAT32', 180 np.float64: 'DT_FLOAT64', 181 None: 'DT_UNDEFINED' 182 } 183 np_type = None 184 if np_value is None: 185 logger.error("The numpy value in Summary is none") 186 else: 187 np_type = np_value.dtype.type 188 189 proto = np2pt_tbl.get(np_type, None) 190 if proto is None: 191 raise TypeError("Transform numpy type failed in Summary, expect numpy type is one of ['np.bool_', 'np.int8', " 192 "'np.int16', 'np.int32', 'np.int64', 'np.uint8', 'np.uint16', 'np.uint32', 'np.uint64', " 193 "'np.float16', 'np.float_', 'np.float64'].") 194 195 return proto 196 197 198def _fill_scalar_summary(tag: str, np_value, summary): 199 """ 200 Package the scalar summary. 201 202 Args: 203 tag (str): Summary tag describe. 204 np_value (Object): Scalary object. 205 206 Returns: 207 Summary, return scalar summary content. 208 """ 209 logger.debug(f"Set({tag}) the scalar summary value") 210 if np_value.size == 1: 211 # is scalar 212 summary.scalar_value = np_value.item() 213 return True 214 if np_value.size > 1: 215 logger.info( 216 f"The tensor is not a single scalar, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}") 217 summary.scalar_value = next(np_value.flat).item() 218 return True 219 logger.error(f"The size of Summary tensor should greater than 1, " 220 f"but got size = {np_value.size}, this means has no values inside tensor, ") 221 return False 222 223 224def _fill_tensor_summary(tag: str, np_value, summary_tensor): 225 """ 226 Package the tensor summary. 227 228 Args: 229 tag (str): Summary tag describe. 230 np_value (Type): Summary data type. 231 summary_tensor (Tensor): The tensor of summary. 232 233 Returns: 234 Summary, return tensor summary content. 235 """ 236 logger.debug(f"Set({tag}) the tensor summary value") 237 # get tensor dtype 238 tensor_dtype = _nptype_to_prototype(np_value) 239 summary_tensor.data_type = DataType.Value(tensor_dtype) 240 241 # get the value list 242 tensor_value_list = np_value.reshape(-1).tolist() 243 summary_tensor.float_data.extend(tensor_value_list) 244 245 # get the tensor dim 246 for v in np_value.shape: 247 summary_tensor.dims.append(v) 248 249 return summary_tensor 250 251 252def _calc_histogram_bins(count): 253 """ 254 Calculates experience-based optimal bins number for histogram. 255 256 There should be enough number in each bin. So we calc bin numbers according to count. For very small count(1 - 257 10), we assign carefully chosen number. For large count, we tried to make sure there are 9-10 numbers in each 258 bucket on average. Too many bins will slow down performance, so we set max number of bins to 90. 259 260 Args: 261 count (int): Valid number count for the tensor. 262 263 Returns: 264 int, number of histogram bins. 265 """ 266 max_bins, max_per_bin = 90, 10 267 268 if not count: 269 return 1 270 if count <= 5: 271 return 2 272 if count <= 10: 273 return 3 274 if count <= 880: 275 # note that math.ceil(881/10) + 1 equals 90 276 return count // max_per_bin + 1 277 278 return max_bins 279 280 281def _fill_histogram_summary(tag: str, np_value: np.ndarray, summary) -> None: 282 """ 283 Package the histogram summary. 284 285 Args: 286 tag (str): Summary tag describe. 287 np_value (np.ndarray): Summary data. 288 summary (summary_pb2.Summary.Histogram): Summary histogram data. 289 """ 290 logger.debug(f"Set({tag}) the histogram summary value") 291 # Default bucket for tensor with no valid data. 292 ma_value = np.ma.masked_invalid(np_value) 293 total, valid = np_value.size, ma_value.count() 294 invalids = [] 295 for isfn in np.isnan, np.isposinf, np.isneginf: 296 if total - valid > sum(invalids): 297 invalids.append(np.count_nonzero(isfn(np_value))) 298 else: 299 invalids.append(0) 300 301 summary.count = total 302 summary.nan_count, summary.pos_inf_count, summary.neg_inf_count = invalids 303 if not valid: 304 logger.warning(f'There are no valid values in the ndarray(size={total}, shape={np_value.shape})') 305 # summary.{min, max, sum} are 0s by default, no need to explicitly set 306 else: 307 # BUG: max of a masked array with dtype np.float16 returns inf 308 # See numpy issue#15077 309 if issubclass(np_value.dtype.type, np.floating): 310 summary.min = ma_value.min(fill_value=np.PINF) 311 summary.max = ma_value.max(fill_value=np.NINF) 312 if summary.min < F32_MIN or summary.max > F32_MAX: 313 logger.warning(f'Values({summary.min}, {summary.max}) are too large, ' 314 f'you may encounter some undefined behaviours hereafter.') 315 else: 316 summary.min = ma_value.min() 317 summary.max = ma_value.max() 318 summary.sum = ma_value.sum(dtype=np.float64) 319 _fill_bucket(valid, np_value, summary) 320 321 322def _fill_bucket(valid, np_value, summary): 323 """ 324 Fill the bucket. 325 326 Args: 327 valid (int): The count of valid data. 328 np_value (np.ndarray): Summary data. 329 summary (summary_pb2.Summary.Histogram): Summary histogram data. 330 """ 331 bins = _calc_histogram_bins(valid) 332 first_edge, last_edge = summary.min, summary.max 333 334 if not first_edge < last_edge: 335 first_edge -= 0.5 336 last_edge += 0.5 337 338 bins = np.linspace(first_edge, last_edge, bins + 1, dtype=np_value.dtype) 339 hists, edges = np.histogram(np_value, bins=bins) 340 341 for hist, edge1, edge2 in zip(hists, edges, edges[1:]): 342 bucket = summary.buckets.add() 343 bucket.width = edge2 - edge1 344 bucket.count = hist 345 bucket.left = edge1 346 347 348def _fill_image_summary(tag: str, np_value, summary_image, input_format='NCHW'): 349 """ 350 Package the image summary. 351 352 Args: 353 tag (str): Summary tag describe. 354 np_value (Type): Summary data type. 355 summary_image (Tensor): The tensor of summary. 356 input_format (str): Data sort order index. Default: 'NCHW'. 357 358 Returns: 359 Summary, return image summary content. 360 """ 361 logger.debug(f"Set({tag}) the image summary value") 362 if np_value.ndim != 4 or np_value.shape[1] not in (1, 3): 363 logger.error(f"The dimension of Summary tensor should be 4 or second dimension should be 1 or 3, " 364 f"but got tag = {tag}, ndim = {np_value.ndim}, shape={np_value.shape}, " 365 f"which means Summary tensor is not Image.") 366 return False 367 368 if np_value.ndim != len(input_format): 369 logger.error( 370 f"The tensor with dimension({np_value.ndim}) can't convert the format({input_format}) " 371 f"because dimension not same, the dimension should be {len(input_format)}.") 372 return False 373 374 if 0 in np_value.shape: 375 logger.error( 376 f"The tensor with shape({np_value.shape}) is not a valid image because the shape contains zero.") 377 return False 378 379 # convert the tensor format 380 tensor = _convert_image_format(np_value, input_format) 381 382 # convert the tensor dtype 383 # Do not assume that user passes in values in [0, 255], use data type to detect 384 scale_factor = 1 385 shift = 0 386 max_value = np.max(tensor) 387 min_value = np.min(tensor) 388 if tensor.dtype == np.uint8: 389 scale_factor = 1 390 elif max_value <= 1 and min_value >= 0: 391 scale_factor = 255 392 else: 393 if max_value != min_value: 394 # Mapping the value to range [0, 255] linearly. 395 scale_factor = 255 / (max_value - min_value + 1) 396 shift = min_value 397 tensor = tensor.astype(np.float32) 398 tensor = ((tensor - shift) * scale_factor).astype(np.uint8) 399 400 # create the image summary 401 height, width, channel, image_string = _make_image(tensor) 402 summary_image.height = height 403 summary_image.width = width 404 summary_image.colorspace = channel 405 summary_image.encoded_image = image_string 406 return True 407 408 409def _make_image(tensor, rescale=1): 410 """ 411 Convert a numpy representation of an image to Image protobuf. 412 413 Args: 414 tensor (Tensor): The image data. 415 rescale (Number): The rescale value. Default: 1. 416 417 Returns: 418 (Number, Number, Number, Bytes), return the height, width, channel, image string . 419 """ 420 height, width, channel = tensor.shape 421 scaled_height = int(height * rescale) 422 scaled_width = int(width * rescale) 423 image = Image.fromarray(tensor) 424 if hasattr(Image, 'ANTIALIAS'): 425 image = image.resize((scaled_width, scaled_height), Image.ANTIALIAS) 426 else: 427 image = image.resize((scaled_width, scaled_height), Image.LANCZOS) 428 output = io.BytesIO() 429 image.save(output, format='PNG') 430 image_string = output.getvalue() 431 output.close() 432 return height, width, channel, image_string 433 434 435def _convert_image_format(np_tensor, input_format, out_format='HWC'): 436 """ 437 Convert the image format. 438 439 Args: 440 np_tensor (Tensor): The image data. 441 input_format (str): Input data format. 442 out_format (str): The output data format. Default: 'HWC'. 443 444 Returns: 445 Tensor, return format image. 446 """ 447 input_format = input_format.upper() 448 449 # convert the NCHW 450 if input_format != 'NCHW': 451 index = [input_format.find(c) for c in 'NCHW'] 452 tensor_nchw = np_tensor.transpose(index) 453 else: 454 tensor_nchw = np_tensor 455 456 # make grid to expand N 457 tensor_chw = _make_canvas_for_imgs(tensor_nchw) 458 459 # convert to out format 460 out_index = ['CHW'.find(c) for c in out_format] 461 out_tensor = tensor_chw.transpose(out_index) 462 return out_tensor 463 464 465def _make_canvas_for_imgs(tensor, col_imgs=8): 466 """ 467 Expand the N, show imgs on a canvas. 468 469 Args: 470 tensor (Tensor): The canvas value. 471 col_imgs (Number): The image colume number. Default: 8. 472 473 Returns: 474 Tensor, return canvas of image. 475 """ 476 # expand the N1HW to N3HW 477 if tensor.shape[1] == 1: 478 tensor = np.concatenate([tensor, tensor, tensor], 1) 479 480 # expand the N 481 n = tensor.shape[0] 482 h = tensor.shape[2] 483 w = tensor.shape[3] 484 cols = min(n, col_imgs) 485 rows = int(np.ceil(float(n) / cols)) 486 487 # create the canvas: expand the n 488 out_canvas = np.zeros((3, h * rows, w * cols)) 489 i = 0 490 for y in range(rows): 491 for x in range(cols): 492 if i >= n: 493 break 494 out_canvas[:, y * h:(y + 1) * h, x * w:(x + 1) * w] = tensor[i] 495 i = i + 1 496 return out_canvas 497