# Copyright 2021-2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Utils for testing dump feature.
"""

import copy
import csv
import glob
import json
import os
import time

import numpy as np

# The dicts below are configuration TEMPLATES. The generator functions
# deep-copy them before applying per-test tweaks so that settings made for
# one test case (e.g. "file_format", "op_debug_mode") never leak into the
# json generated for the next test case.

async_dump_dict = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 2,
        "kernels": ["Default/TensorAdd-op3"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

e2e_dump_dict = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 0,
        "kernels": ["Default/Conv-op12"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    },
    "e2e_dump_settings": {
        "enable": True,
        "trans_flag": False
    }
}

async_dump_dict_2 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "/tmp/async_dump/test_async_dump_net_multi_layer_mode1",
        "net_name": "test",
        "iteration": "0",
        "input_output": 2,
        "kernels": [
            "default/TensorAdd-op10",
            "Gradients/Default/network-WithLossCell/_backbone-ReLUReduceMeanDenseRelu/dense-Dense/gradBiasAdd/" \
            "BiasAddGrad-op8",
            "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/SoftmaxCrossEntropyWithLogits-op5",
            "Default/optimizer-Momentum/tuple_getitem-op29",
            "Default/optimizer-Momentum/ApplyMomentum-op12"
        ],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

e2e_dump_dict_2 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "all",
        "input_output": 0,
        "kernels": ["Default/Conv-op12"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    },
    "e2e_dump_settings": {
        "enable": True,
        "trans_flag": False
    }
}

e2e_dump_dict_3 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "all",
        "input_output": 0,
        "kernels": ["Default/Conv-op12"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    },
    "e2e_dump_settings": {
        "enable": True,
        "trans_flag": False,
        "slice_flag": 1,
        "slice_num": 20
    }
}

async_dump_dict_3 = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "all",
        "input_output": 2,
        "kernels": ["Default/TensorAdd-op3"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

async_dump_dict_acl = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 0,
        "kernels": [],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}

async_dump_dict_acl_assign_ops_by_regex = {
    "common_dump_settings": {
        "dump_mode": 1,
        "path": "",
        "net_name": "Net",
        "iteration": "0",
        "input_output": 0,
        "kernels": ["name-regex(.+Add.*)"],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
        "op_debug_mode": 0
    }
}


def generate_dump_json(dump_path, json_file_name, test_key, net_name='Net'):
    """
    Generate a dump configuration json file for the given test case.

    Args:
        dump_path (str): Directory where dump data should be written; stored
            as common_dump_settings.path.
        json_file_name (str): Path of the json config file to create.
        test_key (str): Test case name; selects the template dict and the
            per-test setting tweaks.
        net_name (str): Value for common_dump_settings.net_name. Default: 'Net'.

    Raises:
        ValueError: If ``test_key`` is not a recognized test case name.
    """
    # Each branch deep-copies its template so the module-level dicts are
    # never mutated; previously tweaks leaked between successive calls.
    if test_key in ("test_async_dump", "test_async_dump_dataset_sink", "test_ge_dump"):
        data = copy.deepcopy(async_dump_dict)
    elif test_key in ("test_e2e_dump", "test_e2e_dump_trans_false"):
        data = copy.deepcopy(e2e_dump_dict)
    elif test_key == "test_async_dump_net_multi_layer_mode1":
        data = copy.deepcopy(async_dump_dict_2)
    elif test_key in ("test_GPU_e2e_multi_root_graph_dump", "test_Ascend_e2e_multi_root_graph_dump"):
        data = copy.deepcopy(e2e_dump_dict_2)
    elif test_key in ("test_Ascend_async_multi_root_graph_dump", "test_ge_dump_net_multi_layer_mode1"):
        data = copy.deepcopy(async_dump_dict_3)
    elif test_key in ("test_async_dump_npy", "test_ge_dump_npy"):
        data = copy.deepcopy(async_dump_dict)
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "test_async_dump_bin":
        data = copy.deepcopy(async_dump_dict)
        data["common_dump_settings"]["file_format"] = "bin"
    elif test_key in ("test_e2e_dump_trans_true", "test_e2e_dump_lenet", "test_e2e_dump_dynamic_shape"):
        data = copy.deepcopy(e2e_dump_dict)
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_e2e_dump_trans_true_op_debug_mode":
        data = copy.deepcopy(e2e_dump_dict)
        data["e2e_dump_settings"]["trans_flag"] = True
        data["common_dump_settings"]["op_debug_mode"] = 3
    elif test_key == "test_e2e_dump_save_kernel_args_true":
        data = copy.deepcopy(e2e_dump_dict)
        data["e2e_dump_settings"]["save_kernel_args"] = True
    elif test_key == "test_async_dump_net_multi_layer_mode1_npy":
        data = copy.deepcopy(async_dump_dict_2)
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "test_e2e_dump_sample_debug_mode":
        data = copy.deepcopy(e2e_dump_dict_3)
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_acl_dump":
        data = copy.deepcopy(async_dump_dict_acl)
    elif test_key == "test_acl_dump_dynamic_shape":
        data = copy.deepcopy(async_dump_dict_acl)
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "test_kbk_e2e_set_dump":
        data = copy.deepcopy(e2e_dump_dict)
        data["common_dump_settings"]["dump_mode"] = 2
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_kbk_e2e_dump_reg":
        data = copy.deepcopy(e2e_dump_dict)
        data["common_dump_settings"]["dump_mode"] = 1
        data["common_dump_settings"]["kernels"] = ["name-regex(.+/Add[^/]*)"]
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_exception_dump":
        data = copy.deepcopy(e2e_dump_dict)
        data["common_dump_settings"]["op_debug_mode"] = 4
        data["e2e_dump_settings"]["trans_flag"] = True
    elif test_key == "test_acl_dump_assign_ops_by_regex":
        data = copy.deepcopy(async_dump_dict_acl_assign_ops_by_regex)
    else:
        raise ValueError(
            "Failed to generate dump json file. The test name value " + test_key + " is invalid.")
    # Every test case sets path and net_name, so apply them once here.
    data["common_dump_settings"]["path"] = dump_path
    data["common_dump_settings"]["net_name"] = net_name
    with open(json_file_name, 'w', encoding='utf-8') as f:
        json.dump(data, f)


def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op):
    """
    Generate a dump configuration json file with overflow detection enabled.

    Args:
        dump_path (str): Directory where dump data should be written.
        json_file_name (str): Path of the json config file to create.
        test_key (str): Test case name; only async-dump test cases are valid.
        op (int): Value for common_dump_settings.op_debug_mode (overflow mode).

    Raises:
        ValueError: If ``test_key`` is not an async-dump test case.
    """
    if test_key in ("test_async_dump", "test_ge_dump", "test_async_dump_npy"):
        data = copy.deepcopy(async_dump_dict)
        common_dump_settings = data["common_dump_settings"]
        common_dump_settings["path"] = dump_path
        common_dump_settings["op_debug_mode"] = op
        if test_key == "test_async_dump_npy":
            common_dump_settings["file_format"] = "npy"
    else:
        raise ValueError(
            "Failed to generate dump json file. Overflow only support in async dump")
    with open(json_file_name, 'w', encoding='utf-8') as f:
        json.dump(data, f)


def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data, net_name='Net',
                                 statistic_category=None):
    """
    Generate a dump configuration json file for statistic dump.

    Args:
        dump_path (str): Directory where dump data should be written.
        json_file_name (str): Path of the json config file to create.
        test_key (str): Test case name; selects the template dict.
        saved_data (str): What to save — 'statistic', 'tensor' or 'full'.
        net_name (str): Value for common_dump_settings.net_name. Default: 'Net'.
        statistic_category (list, optional): Statistic items to compute; only
            written into the config when provided.

    Raises:
        ValueError: If ``test_key`` is not a recognized test case name.
    """
    if test_key in ("test_gpu_e2e_dump", "test_e2e_dump_dynamic_shape_custom_statistic"):
        data = copy.deepcopy(e2e_dump_dict)
    elif test_key in ("test_async_dump", "test_ge_dump"):
        data = copy.deepcopy(async_dump_dict)
        data["common_dump_settings"]["input_output"] = 0
        data["common_dump_settings"]["file_format"] = "npy"
    elif test_key == "stat_calc_mode":
        data = copy.deepcopy(e2e_dump_dict)
        data["e2e_dump_settings"]["stat_calc_mode"] = "device"
    else:
        raise ValueError(
            "Failed to generate statistic dump json file. The test name value " + test_key + " is invalid.")
    data["common_dump_settings"]["path"] = dump_path
    data["common_dump_settings"]["saved_data"] = saved_data
    data["common_dump_settings"]["net_name"] = net_name
    if statistic_category:
        data["common_dump_settings"]["statistic_category"] = statistic_category
    with open(json_file_name, 'w', encoding='utf-8') as f:
        json.dump(data, f)


def generate_cell_dump_json(dump_path, json_file_name, test_key, dump_mode):
    """
    Generate a dump configuration json file with an explicit dump_mode.

    Args:
        dump_path (str): Directory where dump data should be written.
        json_file_name (str): Path of the json config file to create.
        test_key (str): Test case name; only 'test_async_dump' is valid.
        dump_mode (int): Value for common_dump_settings.dump_mode.

    Raises:
        ValueError: If ``test_key`` is not 'test_async_dump'.
    """
    if test_key == "test_async_dump":
        data = copy.deepcopy(async_dump_dict)
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["dump_mode"] = dump_mode
    else:
        raise ValueError(
            "Failed to generate dump json file. Overflow only support in async dump")
    with open(json_file_name, 'w', encoding='utf-8') as f:
        json.dump(data, f)


def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration, root_graph_id=None,
                         test_iteration_id=None, execution_history=True):
    """
    Check that the on-disk dump directory layout is correct.

    Expects the layout
    ``dump_path/rank_<id>/{<net_name>, graphs, execution_order}`` with one
    graph pb/ir file and one execution-order csv per graph, plus per-iteration
    directories under each root graph.

    Args:
        dump_path (str): Root directory of the dump output.
        json_file_path (str): Dump config json file; net_name is read from it.
        num_card (int): Number of ranks (devices) expected.
        num_graph (int): Number of graphs expected per rank.
        num_iteration (int): Number of iterations expected per root graph.
        root_graph_id (list, optional): Root graph ids; defaults to all graphs.
        test_iteration_id (list, optional): Iteration ids to check; defaults
            to range(num_iteration).
        execution_history (bool): Whether a global execution-order csv is
            expected for each root graph. Default: True.
    """
    with open(json_file_path, encoding='utf-8') as f:
        data = json.load(f)
    net_name = data["common_dump_settings"]["net_name"]
    assert os.path.isdir(dump_path)
    if root_graph_id is None:
        root_graph_id = list(range(num_graph))
    if test_iteration_id is None:
        test_iteration_id = list(range(num_iteration))
    for rank_id in range(num_card):
        rank_path = os.path.join(dump_path, "rank_" + str(rank_id))
        assert os.path.exists(rank_path)

        net_name_path = os.path.join(rank_path, net_name)
        assert os.path.exists(net_name_path)
        graph_path = os.path.join(rank_path, "graphs")
        assert os.path.exists(graph_path)
        execution_order_path = os.path.join(rank_path, "execution_order")
        assert os.path.exists(execution_order_path)

        for graph_id in range(num_graph):
            graph_pb_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".pb")
            graph_ir_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".ir")
            assert os.path.exists(graph_pb_file)
            assert os.path.exists(graph_ir_file)

            execution_order_file = os.path.join(execution_order_path, "ms_execution_order_graph_"
                                                + str(graph_id) + ".csv")
            assert os.path.exists(execution_order_file)
            if graph_id in root_graph_id:
                # Only root graphs carry a global execution history and
                # per-iteration dump directories.
                if execution_history:
                    execution_history_file = os.path.join(execution_order_path,
                                                          "ms_global_execution_order_graph_"
                                                          + str(graph_id) + ".csv")
                    assert os.path.exists(execution_history_file)
                graph_id_path = os.path.join(net_name_path, str(graph_id))
                assert os.path.exists(graph_id_path)
                for iteration_id in test_iteration_id:
                    it_id_path = os.path.join(graph_id_path, str(iteration_id))
                    assert os.path.isdir(it_id_path)


def find_nth_pos(string, substring, n):
    """
    Return the index of the n-th (1-based) occurrence of substring in string.

    Returns -1 when there are fewer than n occurrences.
    """
    pos = string.find(substring)
    while n > 1 and pos >= 0:
        pos = string.find(substring, pos + len(substring))
        n -= 1
    return pos


def check_statistic_dump(dump_file_path):
    """Check the contents of statistic.csv produced for the Add operator."""
    output_name = "statistic.csv"
    output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
    real_path = os.path.realpath(output_path)
    with open(real_path, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        stats = list(reader)

    def get_add_node(statistic):
        return statistic['Op Type'] == 'Add'

    add_statistics = list(filter(get_add_node, stats))
    num_tensors = len(add_statistics)
    # Two inputs and one output of the Add node.
    assert num_tensors == 3
    for tensor in add_statistics:
        if tensor['IO'] == 'input' and tensor['Slot'] == '0':
            assert tensor['Min Value'] == '1'
            assert tensor['Max Value'] == '6'
            assert tensor['L2Norm Value'] == '9.53939'
        elif tensor['IO'] == 'input' and tensor['Slot'] == '1':
            assert tensor['Min Value'] == '7'
            assert tensor['Max Value'] == '12'
            assert tensor['L2Norm Value'] == '23.6432'
        elif tensor['IO'] == 'output' and tensor['Slot'] == '0':
            assert tensor['Min Value'] == '8'
            assert tensor['Max Value'] == '18'
            assert tensor['L2Norm Value'] == '32.9242'


def check_data_dump(dump_file_path):
    """Check the dumped output tensor of the Add operator against the expected value."""
    output_name = "Add.*Add-op*.output.0.*.npy"
    output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
    real_path = os.path.realpath(output_path)
    output = np.load(real_path)
    expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
    assert np.array_equal(output, expect)


def check_saved_data(iteration_path, saved_data):
    """
    Check the iteration directory according to the saved_data mode
    ('statistic', 'tensor' or 'full'); a falsy value skips all checks.
    """
    if not saved_data:
        return
    if saved_data in ('statistic', 'full'):
        check_statistic_dump(iteration_path)
    if saved_data in ('tensor', 'full'):
        check_data_dump(iteration_path)
    if saved_data == 'statistic':
        # assert only file is statistic.csv, tensor data is not saved
        assert len(os.listdir(iteration_path)) == 1
    elif saved_data == 'tensor':
        # assert only tensor data is saved, not statistics
        stat_path = os.path.join(iteration_path, 'statistic.csv')
        assert not os.path.isfile(stat_path)


def check_overflow_file(iteration_path, overflow_num, need_check):
    """
    Accumulate the number of overflow debug files found in iteration_path.

    Returns ``overflow_num`` unchanged when ``need_check`` is falsy.
    """
    if not need_check:
        return overflow_num
    overflow_files = glob.glob(os.path.join(iteration_path, "Opdebug.Node_OpDebug.*.*.*"))
    overflow_num += len(overflow_files)
    return overflow_num


def check_iteration(iteration_id, num_iteration):
    """Assert a numeric iteration directory name is below num_iteration."""
    if iteration_id.isdigit():
        assert int(iteration_id) < num_iteration


def check_ge_dump_structure(dump_path, num_iteration, device_num=1, check_overflow=False, saved_data=None,
                            check_data=True):
    """
    Check the dump directory layout produced on the GE backend.

    Expected layout:
    ``dump_path/<timestamp>/<device_id>/<model_name>/<model_id>/<iteration_id>``
    where <timestamp> is a 14-digit time string like '20230822120819'.

    Args:
        dump_path (str): Root directory of the dump output.
        num_iteration (int): Upper bound for numeric iteration ids.
        device_num (int): Total number of device directories expected. Default: 1.
        check_overflow (bool): Whether to count and require overflow files. Default: False.
        saved_data (str, optional): Mode forwarded to check_saved_data.
        check_data (bool): Whether to verify saved data per iteration. Default: True.
    """
    overflow_num = 0
    # The dump files may be flushed asynchronously; poll a few times.
    for _ in range(3):
        if not os.listdir(dump_path):
            time.sleep(2)
    sub_paths = os.listdir(dump_path)
    assert sub_paths
    device_path_num = 0
    for sub_path in sub_paths:
        # on GE, the whole dump directory of one training is saved within a time path, like '20230822120819'
        if not (sub_path.isdigit() and len(sub_path) == 14):
            continue
        time_path = os.path.join(dump_path, sub_path)
        assert os.path.isdir(time_path)
        device_paths = os.listdir(time_path)
        device_path_num += len(device_paths)
        for device_path in device_paths:
            assert device_path.isdigit()
            abs_device_path = os.path.join(time_path, device_path)
            assert os.path.isdir(abs_device_path)
            model_names = os.listdir(abs_device_path)
            for model_name in model_names:
                model_path = os.path.join(abs_device_path, model_name)
                assert os.path.isdir(model_path)
                model_ids = os.listdir(model_path)
                for model_id in model_ids:
                    model_id_path = os.path.join(model_path, model_id)
                    assert os.path.isdir(model_id_path)
                    iteration_ids = os.listdir(model_id_path)
                    for iteration_id in iteration_ids:
                        check_iteration(iteration_id, num_iteration)
                        iteration_path = os.path.join(model_id_path, iteration_id)
                        assert os.path.isdir(iteration_path)
                        if check_data:
                            check_saved_data(iteration_path, saved_data)
                        overflow_num = check_overflow_file(iteration_path, overflow_num, check_overflow)
    assert device_path_num == device_num
    if check_overflow:
        assert overflow_num