# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the RunMetadata proto."""

from collections import defaultdict

import six

from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.profiler import option_builder

# pylint: disable=g-bad-import-order
# XXX: this depends on pywrap_tensorflow and must come later
from tensorflow.python.profiler import model_analyzer
from tensorflow.python.profiler.internal import model_analyzer_testlib as lib

SIZE = 1300
builder = option_builder.ProfileOptionBuilder


def _extract_node(run_meta, node_name):
  """Returns the NodeExecStats matching `node_name`, keyed by device name."""
  ret = defaultdict(list)
  for dev_stat in run_meta.step_stats.dev_stats:
    dev = dev_stat.device.lower()
    if dev.find('cpu:') > 0:
      dev = dev[dev.find('cpu:'):]
    elif dev.find('gpu:') > 0:
      dev = dev[dev.find('gpu:'):]
    elif '/host:cpu' not in dev:
      assert False, 'Unrecognized device name: %s' % dev

    for node_stat in dev_stat.node_stats:
      nname = node_stat.node_name
      if nname.find(':') > 0:
        nname = nname[:nname.find(':')]
      if nname == node_name:
        ret[dev].append(node_stat)
  return ret


def _run_model():
  """Runs a single MatMul and profiles it from the collected RunMetadata."""
  x = random_ops.random_normal(shape=[1, SIZE])
  w = random_ops.random_normal(shape=[SIZE, 2 * SIZE])
  y = math_ops.matmul(x, w)

  config = config_pb2.ConfigProto()
  config.graph_options.rewrite_options.arithmetic_optimization = (
      rewriter_config_pb2.RewriterConfig.OFF)
  with session.Session(config=config) as sess:
    run_metadata = config_pb2.RunMetadata()
    opts = builder.time_and_memory()
    opts['min_micros'] = 0
    opts['min_bytes'] = 0
    opts['order_by'] = 'name'
    opts['output'] = 'none'
    _ = sess.run(
        y,
        options=config_pb2.RunOptions(
            trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
        run_metadata=run_metadata)
    tfprof_node = model_analyzer.profile(
        sess.graph, run_meta=run_metadata, options=opts)

    return tfprof_node, run_metadata


def _run_loop_model():
  """Runs a while-loop model (built by lib.BuildFullModel) and profiles it."""
  config = config_pb2.ConfigProto()
  # Grappler might fuse MatMul with BiasAdd in remapper optimizer.
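  # Note: if remapping stayed on and the fused kernel replaced the individual
  # MatMul node, the by-name lookups in the tests below (e.g.
  # 'rnn/while/basic_rnn_cell/MatMul') could come up empty. Disabling it keeps
  # node names stable for the profiler.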
  config.graph_options.rewrite_options.remapping = (
      rewriter_config_pb2.RewriterConfig.OFF)
  with session.Session(config=config) as sess:
    x = lib.BuildFullModel()

    sess.run(variables.global_variables_initializer())
    run_meta = config_pb2.RunMetadata()
    _ = sess.run(
        x,
        options=config_pb2.RunOptions(
            trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
        run_metadata=run_meta)

    opts = builder.time_and_memory()
    opts['order_by'] = 'name'
    opts['output'] = 'none'

    tfprof_node = model_analyzer.profile(sess.graph, run_meta, options=opts)
    return tfprof_node, run_meta


class RunMetadataTest(test.TestCase):

  # This test requires HARDWARE_TRACE or FULL_TRACE to be specified in order
  # to work as expected. Since we now run it with SOFTWARE_TRACE (see the
  # _run_model routine above), it will, and should, fail because GPU device
  # tracers are not enabled.
  @test.disable_with_predicate(
      pred=test.is_built_with_rocm,
      skip_message='Test fails on ROCm when run without FULL_TRACE')
  @test_util.run_deprecated_v1
  def testGPU(self):
    if not test.is_gpu_available(cuda_only=True):
      return

    gpu_dev = test.gpu_device_name()
    ops.reset_default_graph()
    with ops.device(gpu_dev):
      tfprof_node, run_meta = _run_model()
      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
      self.assertGreater(tfprof_node.children[0].exec_micros, 10)

      ret = _extract_node(run_meta, 'MatMul')
      self.assertEqual(len(ret['gpu:0']), 1)

  @test_util.run_deprecated_v1
  def testAllocationHistory(self):
    if not test.is_gpu_available(cuda_only=True):
      return

    gpu_dev = test.gpu_device_name()
    ops.reset_default_graph()
    with ops.device(gpu_dev):
      _, run_meta = _run_model()

      mm = _extract_node(run_meta, 'MatMul')['gpu:0'][0]
      mm_allocs = mm.memory[0].allocation_records
      # There should be one allocation record and one deallocation record.
      self.assertEqual(len(mm_allocs), 2)
      # First allocated.
      self.assertGreater(mm_allocs[1].alloc_micros, mm_allocs[0].alloc_micros)
      self.assertGreater(mm_allocs[0].alloc_bytes, 0)
      # Then deallocated.
      self.assertLess(mm_allocs[1].alloc_bytes, 0)
      # All memory deallocated.
      self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0)

      rand = _extract_node(run_meta,
                           'random_normal/RandomStandardNormal')['gpu:0'][0]
      random_allocs = rand.memory[0].allocation_records
      # random_normal must be allocated first, since matmul depends on it.
      self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros)
      # Its memory is deallocated after matmul starts.
      self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)

  @test_util.run_deprecated_v1
  def testCPU(self):
    ops.reset_default_graph()
    with ops.device('/cpu:0'):
      tfprof_node, run_meta = _run_model()
      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
      self.assertGreater(tfprof_node.children[0].exec_micros, 0)

      ret = _extract_node(run_meta, 'MatMul')
      self.assertEqual(len(ret['cpu:0']), 1)

      ret = _extract_node(run_meta, 'MatMul:MatMul')
      self.assertEqual(len(ret), 0)

  @test_util.run_v1_only('b/120545219')
  def testLoopCPU(self):
    ops.reset_default_graph()
    with ops.device('/cpu:0'):
      tfprof_node, run_meta = _run_loop_model()
      # The while-loop caused a node to appear 4 times in scheduling.
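      # (The rnn/while loop re-executes the cell's MatMul on each iteration;
      # this model runs the loop 4 times, hence 4 NodeExecStats entries for
      # the same node name.)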
      ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['cpu:0']), 4)

      total_cpu_execs = 0
      for node in ret['cpu:0']:
        total_cpu_execs += node.op_end_rel_micros

      mm_node = lib.SearchTFProfNode(tfprof_node,
                                     'rnn/while/basic_rnn_cell/MatMul')

      self.assertEqual(mm_node.run_count, 4)
      self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
      self.assertEqual(mm_node.exec_micros, total_cpu_execs)

  def testGradientGraph(self):
    # Note: Please don't just adjust the test to make it pass.
    # The code view logic depends on it.
    ops.reset_default_graph()
    _, _ = _run_loop_model()
    graph = ops.get_default_graph()
    forward_op = set()
    backward_op = set()
    back_to_forward = {}
    for op in graph.get_operations():
      # str.find returns 0 when 'gradients/' is a prefix of the name, so the
      # check must be >= 0 to match top-level gradient nodes.
      if op.name.find('gradients/') >= 0 and op.name.find('_grad/') > 0:
        backward_op.add(op.name)
        idx1 = op.name.find('gradients/') + 10
        idx2 = op.name.find('_grad/')
        back_to_forward[op.name] = op.name[idx1:idx2]
      else:
        forward_op.add(op.name)

    # Every backward op should map back to an existing forward op.
    for _, f in six.iteritems(back_to_forward):
      self.assertIn(f, forward_op)

  # This test requires HARDWARE_TRACE or FULL_TRACE to be specified in order
  # to work as expected. Since we now run it with SOFTWARE_TRACE (see the
  # _run_model routine above), it will, and should, fail because GPU device
  # tracers are not enabled.
  @test.disable_with_predicate(
      pred=test.is_built_with_rocm,
      skip_message='Test fails on ROCm when run without FULL_TRACE')
  def testLoopGPU(self):
    if not test.is_gpu_available():
      return

    ops.reset_default_graph()
    with ops.device('/device:GPU:0'):
      _, run_meta = _run_loop_model()
      # The while-loop caused a node to appear 4 times in scheduling.
      ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)

      # Summed for reference; only the stream:all count is asserted below.
      total_gpu_execs = 0
      for node in ret['gpu:0']:
        total_gpu_execs += node.op_end_rel_micros

      self.assertGreaterEqual(
          len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)


if __name__ == '__main__':
  test.main()