• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2020-2021 Huawei Technologies Co., Ltd
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ============================================================================
15"""Generate the summary event which conform to proto format."""
16from __future__ import absolute_import
17from __future__ import division
18
19import io
20import platform
21import time
22
23import numpy as np
24from PIL import Image
25
26from mindspore import log as logger
27from mindspore import context
28from mindspore.communication.management import get_rank
29from mindspore.communication.management import GlobalComm
30
31from mindspore import _checkparam as Validator
32from mindspore.train.anf_ir_pb2 import DataType, ModelProto
33from mindspore.train.summary_pb2 import Event
34
# Axis layout of image tensors handed to the image summary
MS_IMAGE_TENSOR_FORMAT = 'NCHW'
# Marker embedded in every event file name
EVENT_FILE_NAME_MARK = ".out.events.summary."
# Version string written by the init event: mark followed by the version number
EVENT_FILE_INIT_VERSION_MARK = "MindSpore.Event:"
EVENT_FILE_INIT_VERSION = 1

# Representable float32 range, used to warn about histogram values that exceed it
F32_MIN = np.finfo(np.float32).min
F32_MAX = np.finfo(np.float32).max
44
45
def get_event_file_name(prefix, suffix, time_second):
    """
    Build the event file name.

    The result is: prefix + EVENT_FILE_NAME_MARK + time_second + "." + device id + "." + hostname + suffix.
    A prefix or suffix that is None contributes nothing to the name.

    Args:
        prefix (str): The prefix of file name.
        suffix (str): The suffix of file name.
        time_second (str): The time stamp of file name.

    Returns:
        String, the name of event log file.
    """
    Validator.check_str_by_regular(prefix)
    Validator.check_str_by_regular(suffix)

    host = platform.node()
    parallel_devices = context.get_auto_parallel_context('device_num')
    device_tag = context.get_context('device_id')

    # In GPU distributed training get_context('device_id') does not work,
    # so the rank is used as the device identifier instead.
    is_distributed = parallel_devices > 1 or GlobalComm.WORLD_COMM_GROUP == 'nccl_world_group'
    if is_distributed:
        device_tag = get_rank()

    core_name = f'{EVENT_FILE_NAME_MARK}{time_second}.{device_tag}.{host}'
    head = '' if prefix is None else prefix
    tail = '' if suffix is None else suffix
    return head + core_name + tail
80
81
def package_init_event():
    """
    Package the summary init event.

    Returns:
        Event, an event stamped with the current time whose version is
        EVENT_FILE_INIT_VERSION_MARK followed by EVENT_FILE_INIT_VERSION.
    """
    event = Event()
    event.wall_time = time.time()
    event.version = f'{EVENT_FILE_INIT_VERSION_MARK}{EVENT_FILE_INIT_VERSION}'
    return event
89
90
def package_graph_event(data):
    """
    Package the summary graph event.

    The bytes are parsed as a ModelProto and its graph is copied into the event's graph_def.

    Args:
        data (Bytes): Graph bytes string.

    Returns:
        Event, event log object.
    """
    event = Event()
    event.wall_time = time.time()

    model = ModelProto()
    model.ParseFromString(data)
    event.graph_def.CopyFrom(model.graph)
    return event
107
108
def package_summary_event(data_list, step, wall_time):
    """
    Package the summary to event protobuffer.

    Args:
        data_list (list): Summary data list. Each item is read through the keys "_type", "data" and "name".
        step (Number): The recode step index.
        wall_time (float): The wall time.

    Returns:
        Summary, the summary event.
    """
    event = Event()
    event.wall_time = wall_time
    event.step = int(step)
    values = event.summary.value

    for item in data_list:
        kind = item["_type"]
        payload = item["data"]
        tag = item["name"]

        logger.debug(f"Now process {kind} summary, tag = {tag}")

        # Add the entry up front; it is removed again below when it cannot be filled.
        entry = values.add()
        entry.tag = tag

        keep_entry = True
        if kind == 'Scalar':
            keep_entry = _fill_scalar_summary(tag, payload, entry)
        elif kind == 'Tensor':
            _fill_tensor_summary(tag, payload, entry.tensor)
        elif kind == 'Image':
            keep_entry = _fill_image_summary(tag, payload, entry.image, MS_IMAGE_TENSOR_FORMAT)
        elif kind == 'Histogram':
            _fill_histogram_summary(tag, payload, entry.histogram)
        elif kind == 'Landscape':
            entry.loss_landscape.ParseFromString(payload)
        else:
            # Unknown summary type: report it and skip the data.
            logger.error(f"Summary type({kind}) is error, tag = {tag}")
            keep_entry = False

        if not keep_entry:
            del values[-1]

    return event
155
156
def _nptype_to_prototype(np_value):
    """
    Transform the np type to proto type.

    Args:
        np_value (numpy.ndarray): Value whose ``dtype.type`` is looked up. When it is None an error is
            logged and 'DT_UNDEFINED' is returned.

    Returns:
        str, the name of the proto data type, e.g. 'DT_FLOAT32'.

    Raises:
        TypeError: If the numpy type has no entry in the conversion table.
    """
    np2pt_tbl = {
        np.bool_: 'DT_BOOL',
        np.int8: 'DT_INT8',
        np.int16: 'DT_INT16',
        np.int32: 'DT_INT32',
        np.int64: 'DT_INT64',
        np.uint8: 'DT_UINT8',
        np.uint16: 'DT_UINT16',
        np.uint32: 'DT_UINT32',
        np.uint64: 'DT_UINT64',
        np.float16: 'DT_FLOAT16',
        # NOTE(review): presumably never hit through ``dtype.type``; kept so existing lookups keep working.
        float: 'DT_FLOAT64',
        np.float32: 'DT_FLOAT32',
        np.float64: 'DT_FLOAT64',
        None: 'DT_UNDEFINED'
    }
    if np_value is None:
        logger.error("The numpy value in Summary is none")
        np_type = None
    else:
        np_type = np_value.dtype.type

    proto = np2pt_tbl.get(np_type)
    if proto is None:
        # The list names every numpy type in the table and the message reports the rejected type.
        got = getattr(np_type, '__name__', np_type)
        raise TypeError("Transform numpy type failed in Summary, expect numpy type is one of ['np.bool_', 'np.int8', "
                        "'np.int16', 'np.int32', 'np.int64', 'np.uint8', 'np.uint16', 'np.uint32', 'np.uint64', "
                        f"'np.float16', 'np.float32', 'np.float64'], but got '{got}'.")

    return proto
196
197
def _fill_scalar_summary(tag: str, np_value, summary):
    """
    Package the scalar summary.

    A value with exactly one element is stored directly. A value with several elements is accepted as well,
    but only its first element (in flat order) is stored. An empty value is rejected.

    Args:
        tag (str): Summary tag describe.
        np_value (Object): Scalary object.
        summary (Object): Summary value whose scalar_value is set.

    Returns:
        bool, True if scalar_value was set, False if the value holds no element.
    """
    logger.debug(f"Set({tag}) the scalar summary value")
    element_count = np_value.size

    if element_count > 1:
        logger.info(
            f"The tensor is not a single scalar, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}")
        first_element = next(np_value.flat)
        summary.scalar_value = first_element.item()
        return True

    if element_count == 1:
        # is scalar
        summary.scalar_value = np_value.item()
        return True

    logger.error(f"The size of Summary tensor should greater than 1, "
                 f"but got size = {np_value.size}, this means has no values inside tensor, ")
    return False
222
223
def _fill_tensor_summary(tag: str, np_value, summary_tensor):
    """
    Package the tensor summary.

    Stores the proto data type, the flattened values (in float_data) and the shape (in dims).

    Args:
        tag (str): Summary tag describe.
        np_value (Type): Summary data type.
        summary_tensor (Tensor): The tensor of summary.

    Returns:
        Summary, return tensor summary content.
    """
    logger.debug(f"Set({tag}) the tensor summary value")

    # data type
    proto_type_name = _nptype_to_prototype(np_value)
    summary_tensor.data_type = DataType.Value(proto_type_name)

    # flattened values
    summary_tensor.float_data.extend(np_value.ravel().tolist())

    # dimensions
    summary_tensor.dims.extend(np_value.shape)

    return summary_tensor
250
251
def _calc_histogram_bins(count):
    """
    Calculates experience-based optimal bins number for histogram.

    Small counts use fixed bin numbers (no values: 1 bin, up to 5 values: 2 bins, up to 10 values: 3 bins).
    Up to 880 values the number grows as ``count // 10 + 1``; anything larger is capped at 90 bins because
    too many bins slow down performance.

    Args:
        count (int): Valid number count for the tensor.

    Returns:
        int, number of histogram bins.
    """
    max_bins, max_per_bin = 90, 10

    if not count:
        return 1

    # Hand-picked bin numbers for very small counts: (inclusive upper bound, bins).
    for upper_bound, fixed_bins in ((5, 2), (10, 3)):
        if count <= upper_bound:
            return fixed_bins

    if count > 880:
        return max_bins

    return count // max_per_bin + 1
279
280
def _fill_histogram_summary(tag: str, np_value: np.ndarray, summary) -> None:
    """
    Package the histogram summary.

    Sets count and the NaN / +Inf / -Inf counts. When at least one finite value exists it also sets
    min, max and sum, and fills the buckets through _fill_bucket.

    Args:
        tag (str): Summary tag describe.
        np_value (np.ndarray): Summary data.
        summary (summary_pb2.Summary.Histogram): Summary histogram data.
    """
    logger.debug(f"Set({tag}) the histogram summary value")
    # Mask NaN and infinities so the statistics below only see valid values.
    ma_value = np.ma.masked_invalid(np_value)
    total, valid = np_value.size, ma_value.count()

    # Count NaN, +Inf and -Inf in that order. Once the counted entries account for every masked value,
    # the remaining categories are known to be 0 and the array is not scanned again.
    invalids = []
    for isfn in np.isnan, np.isposinf, np.isneginf:
        if total - valid > sum(invalids):
            invalids.append(np.count_nonzero(isfn(np_value)))
        else:
            invalids.append(0)

    summary.count = total
    summary.nan_count, summary.pos_inf_count, summary.neg_inf_count = invalids
    if not valid:
        logger.warning(f'There are no valid values in the ndarray(size={total}, shape={np_value.shape})')
        # summary.{min, max, sum} are 0s by default, no need to explicitly set
    else:
        # BUG: max of a masked array with dtype np.float16 returns inf
        # See numpy issue#15077
        if issubclass(np_value.dtype.type, np.floating):
            # np.inf / -np.inf replace np.PINF / np.NINF, which were removed in NumPy 2.0.
            summary.min = ma_value.min(fill_value=np.inf)
            summary.max = ma_value.max(fill_value=-np.inf)
            if summary.min < F32_MIN or summary.max > F32_MAX:
                logger.warning(f'Values({summary.min}, {summary.max}) are too large, '
                               f'you may encounter some undefined behaviours hereafter.')
        else:
            summary.min = ma_value.min()
            summary.max = ma_value.max()
        # Accumulate in float64 regardless of the input dtype.
        summary.sum = ma_value.sum(dtype=np.float64)
        _fill_bucket(valid, np_value, summary)
320
321
def _fill_bucket(valid, np_value, summary):
    """
    Fill the bucket.

    The bucket edges are spread evenly between summary.min and summary.max, so both must be set before
    this is called. One bucket (left edge, width, count) is appended to summary.buckets per bin.

    Args:
        valid (int): The count of valid data.
        np_value (np.ndarray): Summary data.
        summary (summary_pb2.Summary.Histogram): Summary histogram data.
    """
    bin_count = _calc_histogram_bins(valid)
    low, high = summary.min, summary.max

    # Widen a degenerate range so the edges are not all identical.
    if not low < high:
        low, high = low - 0.5, high + 0.5

    edge_values = np.linspace(low, high, bin_count + 1, dtype=np_value.dtype)
    counts, edges = np.histogram(np_value, bins=edge_values)

    for idx, hits in enumerate(counts):
        left_edge, right_edge = edges[idx], edges[idx + 1]
        bucket = summary.buckets.add()
        bucket.width = right_edge - left_edge
        bucket.count = hits
        bucket.left = left_edge
346
347
def _fill_image_summary(tag: str, np_value, summary_image, input_format='NCHW'):
    """
    Package the image summary.

    The batch is laid out on a single canvas, mapped to uint8 and encoded as PNG. The mapping depends on
    the input:

    - uint8 input is used as is;
    - input whose values all lie in [0, 1] is multiplied by 255;
    - any other input is shifted by its minimum and scaled linearly by 255 / (max - min + 1).

    Args:
        tag (str): Summary tag describe.
        np_value (numpy.ndarray): The image batch; must have 4 dimensions with a second dimension of 1 or 3.
        summary_image (Object): The image summary to fill (height, width, colorspace, encoded_image).
        input_format (str): Data sort order index. Default: 'NCHW'.

    Returns:
        bool, True if the image summary was filled, False if the input is not a valid image batch.
    """
    logger.debug(f"Set({tag}) the image summary value")
    if np_value.ndim != 4 or np_value.shape[1] not in (1, 3):
        logger.error(f"The dimension of Summary tensor should be 4 or second dimension should be 1 or 3, "
                     f"but got tag = {tag}, ndim = {np_value.ndim}, shape={np_value.shape}, "
                     f"which means Summary tensor is not Image.")
        return False

    if np_value.ndim != len(input_format):
        logger.error(
            f"The tensor with dimension({np_value.ndim}) can't convert the format({input_format}) "
            f"because dimension not same, the dimension should be {len(input_format)}.")
        return False

    if 0 in np_value.shape:
        logger.error(
            f"The tensor with shape({np_value.shape}) is not a valid image because the shape contains zero.")
        return False

    # convert the tensor format
    tensor = _convert_image_format(np_value, input_format)

    # convert the tensor dtype
    # Do not assume that user passes in values in [0, 255], use data type to detect.
    # The dtype is taken from the input: the canvas returned by _convert_image_format is allocated with
    # np.zeros' default float dtype, so inspecting tensor.dtype would never recognise uint8 images.
    scale_factor = 1
    shift = 0
    if np_value.dtype != np.uint8:
        max_value = np.max(tensor)
        min_value = np.min(tensor)
        if max_value <= 1 and min_value >= 0:
            scale_factor = 255
        else:
            if max_value != min_value:
                # Mapping the value to range [0, 255] linearly.
                scale_factor = 255 / (max_value - min_value + 1)
            shift = min_value
    tensor = tensor.astype(np.float32)
    tensor = ((tensor - shift) * scale_factor).astype(np.uint8)

    # create the image summary
    height, width, channel, image_string = _make_image(tensor)
    summary_image.height = height
    summary_image.width = width
    summary_image.colorspace = channel
    summary_image.encoded_image = image_string
    return True
407
408
def _make_image(tensor, rescale=1):
    """
    Convert a numpy representation of an image to Image protobuf.

    The reported height and width are those of the input tensor; rescale only affects the encoded PNG.

    Args:
        tensor (Tensor): The image data, unpacked as (height, width, channel).
        rescale (Number): The rescale value. Default: 1.

    Returns:
        (Number, Number, Number, Bytes), return the height, width, channel, image string .
    """
    height, width, channel = tensor.shape
    target_size = (int(width * rescale), int(height * rescale))

    # Use ANTIALIAS when this Pillow exposes it, LANCZOS otherwise.
    resample_filter = Image.ANTIALIAS if hasattr(Image, 'ANTIALIAS') else Image.LANCZOS
    picture = Image.fromarray(tensor).resize(target_size, resample_filter)

    with io.BytesIO() as buffer:
        picture.save(buffer, format='PNG')
        image_string = buffer.getvalue()
    return height, width, channel, image_string
433
434
def _convert_image_format(np_tensor, input_format, out_format='HWC'):
    """
    Convert the image format.

    The batch is first brought to NCHW order, then merged into one CHW canvas by _make_canvas_for_imgs,
    and finally transposed to the requested output order.

    Args:
        np_tensor (Tensor): The image data.
        input_format (str): Input data format, matched case-insensitively.
        out_format (str): The output data format. Default: 'HWC'.

    Returns:
        Tensor, return format image.
    """
    layout = input_format.upper()

    # bring the batch to NCHW
    if layout == 'NCHW':
        batch_nchw = np_tensor
    else:
        to_nchw = [layout.find(axis) for axis in 'NCHW']
        batch_nchw = np_tensor.transpose(to_nchw)

    # make grid to expand N
    canvas_chw = _make_canvas_for_imgs(batch_nchw)

    # reorder the canvas axes to the output format
    to_output = ['CHW'.find(axis) for axis in out_format]
    return canvas_chw.transpose(to_output)
463
464
def _make_canvas_for_imgs(tensor, col_imgs=8):
    """
    Expand the N, show imgs on a canvas.

    Images are placed row by row, at most col_imgs per row; unused cells of the last row stay zero.
    Single-channel batches are replicated to three channels first.

    Args:
        tensor (Tensor): The canvas value.
        col_imgs (Number): The image colume number. Default: 8.

    Returns:
        Tensor, return canvas of image.
    """
    # expand the N1HW to N3HW
    if tensor.shape[1] == 1:
        tensor = np.concatenate([tensor] * 3, axis=1)

    img_count = tensor.shape[0]
    img_h, img_w = tensor.shape[2], tensor.shape[3]
    cols = min(img_count, col_imgs)
    rows = int(np.ceil(float(img_count) / cols))

    # create the canvas and drop every image into its grid cell
    canvas = np.zeros((3, img_h * rows, img_w * cols))
    for idx in range(img_count):
        row, col = divmod(idx, cols)
        top, left = row * img_h, col * img_w
        canvas[:, top:top + img_h, left:left + img_w] = tensor[idx]
    return canvas
497