# Copyright 2021-2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Utils for testing dump feature.
"""

import json
import os
import time
import glob
import csv
import numpy as np

async_dump_dict = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 2,
        "kernels": ["Default/TensorAdd-op3"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

e2e_dump_dict = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 0,
        "kernels": ["Default/Conv-op12"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    },
    "e2e_dump_settings": {
        "enable": True,
        "trans_flag": False
    }
}

async_dump_dict_2 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "/tmp/async_dump/test_async_dump_net_multi_layer_mode1",
        "net_name": "test",
        "iteration": "0",
        "input_output": 2,
        "kernels": [
            "default/TensorAdd-op10",
            "Gradients/Default/network-WithLossCell/_backbone-ReLUReduceMeanDenseRelu/dense-Dense/gradBiasAdd/" \
            "BiasAddGrad-op8",
            "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/SoftmaxCrossEntropyWithLogits-op5",
            "Default/optimizer-Momentum/tuple_getitem-op29",
            "Default/optimizer-Momentum/ApplyMomentum-op12"
        ],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

e2e_dump_dict_2 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "all",
        "input_output": 0,
        "kernels": ["Default/Conv-op12"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    },
    "e2e_dump_settings": {
        "enable": True,
        "trans_flag": False
    }
}

e2e_dump_dict_3 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "all",
        "input_output": 0,
        "kernels": ["Default/Conv-op12"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    },
    "e2e_dump_settings": {
        "enable": True,
        "trans_flag": False,
        "slice_flag": 1,
        "slice_num": 20
    }
}

async_dump_dict_3 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "all",
        "input_output": 2,
        "kernels": ["Default/TensorAdd-op3"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

async_dump_dict_acl = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 0,
        "kernels": [],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

async_dump_dict_acl_assign_ops_by_regex = {
    "common_dump_settings": {
        "dump_mode": 1,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 0,
        "kernels": ["name-regex(.+Add.*)"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}
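
# The dictionaries above are module-level templates. The generate_* helpers below pick
# one, fill in fields such as "path" and "net_name" in place, and serialize it to the
# JSON config file consumed by MindSpore's dump feature. Because the templates are
# shared module state, a field set for one test key (e.g. "file_format") stays on the
# template for later calls in the same process.
# Field meanings, as understood from the MindSpore dump documentation: "dump_mode" 0
# dumps all operators, 1 dumps only the operators listed in "kernels", and 2 dumps the
# operators enabled via mindspore.set_dump; "input_output" selects whether inputs,
# outputs, or both are dumped.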

def generate_dump_json(dump_path, json_file_name, test_key, net_name='Net'):
    """
    Util function to generate dump configuration json file.
    """
    data = {}
    if test_key in ["test_async_dump", "test_async_dump_dataset_sink", "test_ge_dump"]:
        data = async_dump_dict
        data["common_dump_settings"]["path"] = dump_path
    elif test_key in ("test_e2e_dump", "test_e2e_dump_trans_false"):
        data = e2e_dump_dict
        data["common_dump_settings"]["path"] = dump_path
    elif test_key == "test_async_dump_net_multi_layer_mode1":
        data = async_dump_dict_2
        data["common_dump_settings"]["path"] = dump_path
    elif test_key in ("test_GPU_e2e_multi_root_graph_dump", "test_Ascend_e2e_multi_root_graph_dump"):
        data = e2e_dump_dict_2
        data["common_dump_settings"]["path"] = dump_path
    elif test_key in ("test_Ascend_async_multi_root_graph_dump", "test_ge_dump_net_multi_layer_mode1"):
        data = async_dump_dict_3
        data["common_dump_settings"]["path"] = dump_path
    elif test_key in ("test_async_dump_npy", "test_ge_dump_npy"):
        data = async_dump_dict
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "test_async_dump_bin":
        data = async_dump_dict
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["file_format"] = "bin"
    elif test_key in ["test_e2e_dump_trans_true", "test_e2e_dump_lenet", "test_e2e_dump_dynamic_shape"]:
        data = e2e_dump_dict
        data["common_dump_settings"]["path"] = dump_path
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_e2e_dump_trans_true_op_debug_mode":
        data = e2e_dump_dict
        data["common_dump_settings"]["path"] = dump_path
        data["e2e_dump_settings"]["trans_flag"] = True
        data["common_dump_settings"]["op_debug_mode"] = 3
    elif test_key == "test_e2e_dump_save_kernel_args_true":
        data = e2e_dump_dict
        data["common_dump_settings"]["path"] = dump_path
        data["e2e_dump_settings"]["save_kernel_args"] = True
    elif test_key == "test_async_dump_net_multi_layer_mode1_npy":
        data = async_dump_dict_2
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "test_e2e_dump_sample_debug_mode":
        data = e2e_dump_dict_3
        data["common_dump_settings"]["path"] = dump_path
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_acl_dump":
        data = async_dump_dict_acl
        data["common_dump_settings"]["path"] = dump_path
    elif test_key == "test_acl_dump_dynamic_shape":
        data = async_dump_dict_acl
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "test_kbk_e2e_set_dump":
        data = e2e_dump_dict
        data["common_dump_settings"]["dump_mode"] = 2
        data["common_dump_settings"]["path"] = dump_path
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_kbk_e2e_dump_reg":
        data = e2e_dump_dict
        data["common_dump_settings"]["dump_mode"] = 1
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["kernels"] = ["name-regex(.+/Add[^/]*)"]
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_exception_dump":
        data = e2e_dump_dict
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["op_debug_mode"] = 4
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_acl_dump_assign_ops_by_regex":
        data = async_dump_dict_acl_assign_ops_by_regex
        data["common_dump_settings"]["path"] = dump_path
    else:
        raise ValueError(
            "Failed to generate dump json file. The test name value " + test_key + " is invalid.")
    data["common_dump_settings"]["net_name"] = net_name
    with open(json_file_name, 'w') as f:
        json.dump(data, f)
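
# Illustrative usage sketch (the paths here are hypothetical; the dump tests typically
# point the MINDSPORE_DUMP_CONFIG environment variable at the generated file):
#   generate_dump_json('/tmp/async_dump', '/tmp/async_dump.json', 'test_async_dump')
#   os.environ['MINDSPORE_DUMP_CONFIG'] = '/tmp/async_dump.json'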


def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op):
    """
    Util function to generate a dump configuration json file with overflow detection enabled.
    """
    if test_key in ("test_async_dump", "test_ge_dump"):
        data = async_dump_dict
        common_dump_settings = data.get("common_dump_settings", "")
        if not isinstance(common_dump_settings, dict):
            raise ValueError("common_dump_settings should be a dict, but got %s." % type(common_dump_settings))
        common_dump_settings["path"] = dump_path
        common_dump_settings["op_debug_mode"] = op
    elif test_key == "test_async_dump_npy":
        data = async_dump_dict
        common_dump_settings = data.get("common_dump_settings", "")
        if not isinstance(common_dump_settings, dict):
            raise ValueError("common_dump_settings should be a dict, but got %s." % type(common_dump_settings))
        common_dump_settings["path"] = dump_path
        common_dump_settings["op_debug_mode"] = op
        common_dump_settings["file_format"] = "npy"
    else:
        raise ValueError(
            "Failed to generate dump json file. Overflow dump is only supported in async dump.")
    with open(json_file_name, 'w') as f:
        json.dump(data, f)
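
# The `op` argument above is written straight into "op_debug_mode". In the MindSpore
# dump config this field controls overflow detection for async dump (0 disables it;
# see the MindSpore dump documentation for the meaning of the non-zero values), which
# is why this helper rejects e2e test keys.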


def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data, net_name='Net',
                                 statistic_category=None):
    """
    Util function to generate dump configuration json file for statistic dump.
    """
    data = {}
    if test_key in ["test_gpu_e2e_dump", "test_e2e_dump_dynamic_shape_custom_statistic"]:
        data = e2e_dump_dict
    elif test_key in ("test_async_dump", "test_ge_dump"):
        data = async_dump_dict
        data["common_dump_settings"]["input_output"] = 0
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "stat_calc_mode":
        data = e2e_dump_dict
        data["e2e_dump_settings"]["stat_calc_mode"] = "device"
    else:
        raise ValueError(
            "Failed to generate statistic dump json file. The test name value " + test_key + " is invalid.")
    data["common_dump_settings"]["path"] = dump_path
    data["common_dump_settings"]["saved_data"] = saved_data
    data["common_dump_settings"]["net_name"] = net_name
    if statistic_category:
        data["common_dump_settings"]["statistic_category"] = statistic_category
    with open(json_file_name, 'w') as f:
        json.dump(data, f)
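
# `saved_data` is forwarded verbatim into the config; the check helpers further below
# (check_saved_data) expect it to be 'statistic', 'tensor' or 'full'. A hypothetical call:
#   generate_statistic_dump_json('/tmp/e2e_dump', '/tmp/e2e_dump.json', 'test_gpu_e2e_dump', 'statistic')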


def generate_cell_dump_json(dump_path, json_file_name, test_key, dump_mode):
    """
    Util function to generate dump configuration json file for cell dump.
    """
    if test_key == "test_async_dump":
        data = async_dump_dict
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["dump_mode"] = dump_mode
    else:
        raise ValueError(
            "Failed to generate dump json file. The test name value " + test_key + " is invalid.")
    with open(json_file_name, 'w') as f:
        json.dump(data, f)


def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration, root_graph_id=None,
                         test_iteration_id=None, execution_history=True):
    """
    Util to check if the dump structure is correct.
    """
    with open(json_file_path) as f:
        data = json.load(f)
    net_name = data["common_dump_settings"]["net_name"]
    assert os.path.isdir(dump_path)
    if root_graph_id is None:
        root_graph_id = list(range(num_graph))
    if test_iteration_id is None:
        test_iteration_id = list(range(num_iteration))
    for rank_id in range(num_card):
        rank_path = os.path.join(dump_path, "rank_" + str(rank_id))
        assert os.path.exists(rank_path)

        net_name_path = os.path.join(rank_path, net_name)
        assert os.path.exists(net_name_path)
        graph_path = os.path.join(rank_path, "graphs")
        assert os.path.exists(graph_path)
        execution_order_path = os.path.join(rank_path, "execution_order")
        assert os.path.exists(execution_order_path)

        for graph_id in range(num_graph):
            graph_pb_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".pb")
            graph_ir_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".ir")
            assert os.path.exists(graph_pb_file)
            assert os.path.exists(graph_ir_file)

            execution_order_file = os.path.join(execution_order_path, "ms_execution_order_graph_"
                                                + str(graph_id) + ".csv")
            assert os.path.exists(execution_order_file)
            if graph_id in root_graph_id:
                if execution_history:
                    execution_history_file = os.path.join(execution_order_path,
                                                          "ms_global_execution_order_graph_" + str(graph_id) + ".csv")
                    assert os.path.exists(execution_history_file)
                graph_id_path = os.path.join(net_name_path, str(graph_id))
                assert os.path.exists(graph_id_path)
                for iteration_id in test_iteration_id:
                    it_id_path = os.path.join(graph_id_path, str(iteration_id))
                    assert os.path.isdir(it_id_path)
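
# Per-rank layout asserted by check_dump_structure above:
#   <dump_path>/rank_<rank_id>/
#       graphs/ms_output_trace_code_graph_<graph_id>.pb and .ir
#       execution_order/ms_execution_order_graph_<graph_id>.csv
#       execution_order/ms_global_execution_order_graph_<graph_id>.csv   (root graphs, when execution_history)
#       <net_name>/<graph_id>/<iteration_id>/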


def find_nth_pos(string, substring, n):
    """Return the start index of the n-th (non-overlapping) occurrence of substring in string, or -1 if absent."""
    start = string.find(substring)
    while n > 1 and start >= 0:
        start = string.find(substring, start + len(substring))
        n -= 1
    return start


def check_statistic_dump(dump_file_path):
    output_name = "statistic.csv"
    output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
    real_path = os.path.realpath(output_path)
    with open(real_path) as f:
        reader = csv.DictReader(f)
        stats = list(reader)

        def get_add_node(statistic):
            return statistic['Op Type'] == 'Add'

        add_statistics = list(filter(get_add_node, stats))
        num_tensors = len(add_statistics)
        assert num_tensors == 3
        for tensor in add_statistics:
            if tensor['IO'] == 'input' and tensor['Slot'] == '0':
                assert tensor['Min Value'] == '1'
                assert tensor['Max Value'] == '6'
                assert tensor['L2Norm Value'] == '9.53939'
            elif tensor['IO'] == 'input' and tensor['Slot'] == '1':
                assert tensor['Min Value'] == '7'
                assert tensor['Max Value'] == '12'
                assert tensor['L2Norm Value'] == '23.6432'
            elif tensor['IO'] == 'output' and tensor['Slot'] == '0':
                assert tensor['Min Value'] == '8'
                assert tensor['Max Value'] == '18'
                assert tensor['L2Norm Value'] == '32.9242'


def check_data_dump(dump_file_path):
    output_name = "Add.*Add-op*.output.0.*.npy"
    output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
    real_path = os.path.realpath(output_path)
    output = np.load(real_path)
    expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
    assert np.array_equal(output, expect)


def check_saved_data(iteration_path, saved_data):
    if not saved_data:
        return
    if saved_data in ('statistic', 'full'):
        check_statistic_dump(iteration_path)
    if saved_data in ('tensor', 'full'):
        check_data_dump(iteration_path)
    if saved_data == 'statistic':
        # assert the only file is statistic.csv, i.e. tensor data is not saved
        assert len(os.listdir(iteration_path)) == 1
    elif saved_data == 'tensor':
        # assert only tensor data is saved, not statistics
        stat_path = os.path.join(iteration_path, 'statistic.csv')
        assert not os.path.isfile(stat_path)


def check_overflow_file(iteration_path, overflow_num, need_check):
    if not need_check:
        return overflow_num
    # overflow records are dumped as files named "Opdebug.Node_OpDebug.*"
    overflow_files = glob.glob(os.path.join(iteration_path, "Opdebug.Node_OpDebug.*.*.*"))
    overflow_num += len(overflow_files)
    return overflow_num


def check_iteration(iteration_id, num_iteration):
    if iteration_id.isdigit():
        assert int(iteration_id) < num_iteration


def check_ge_dump_structure(dump_path, num_iteration, device_num=1, check_overflow=False, saved_data=None,
                            check_data=True):
    overflow_num = 0
    # give the dump directory a few seconds to be populated before checking its contents
    for _ in range(3):
        if not os.listdir(dump_path):
            time.sleep(2)
    sub_paths = os.listdir(dump_path)
    assert sub_paths
    device_path_num = 0
    for sub_path in sub_paths:
        # on GE, the whole dump directory of one training is saved within a time path, like '20230822120819'
        if not (sub_path.isdigit() and len(sub_path) == 14):
            continue
        time_path = os.path.join(dump_path, sub_path)
        assert os.path.isdir(time_path)
        device_paths = os.listdir(time_path)
        device_path_num += len(device_paths)
        for device_path in device_paths:
            assert device_path.isdigit()
            abs_device_path = os.path.join(time_path, device_path)
            assert os.path.isdir(abs_device_path)
            model_names = os.listdir(abs_device_path)
            for model_name in model_names:
                model_path = os.path.join(abs_device_path, model_name)
                assert os.path.isdir(model_path)
                model_ids = os.listdir(model_path)
                for model_id in model_ids:
                    model_id_path = os.path.join(model_path, model_id)
                    assert os.path.isdir(model_id_path)
                    iteration_ids = os.listdir(model_id_path)
                    for iteration_id in iteration_ids:
                        check_iteration(iteration_id, num_iteration)
                        iteration_path = os.path.join(model_id_path, iteration_id)
                        assert os.path.isdir(iteration_path)
                        if check_data:
                            check_saved_data(iteration_path, saved_data)
                        overflow_num = check_overflow_file(iteration_path, overflow_num, check_overflow)
    assert device_path_num == device_num
    if check_overflow:
        assert overflow_num
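
# GE dump layout asserted above:
#   <dump_path>/<14-digit timestamp, e.g. 20230822120819>/<device_id>/<model_name>/<model_id>/<iteration_id>/
# with Opdebug.Node_OpDebug.* overflow records expected inside iteration directories
# when check_overflow is set.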