# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the RunMetadata proto."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import defaultdict

import six

from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.profiler import option_builder

# pylint: disable=g-bad-import-order
# XXX: this depends on pywrap_tensorflow and must come later
from tensorflow.python.profiler import model_analyzer
from tensorflow.python.profiler.internal import model_analyzer_testlib as lib

SIZE = 1300
builder = option_builder.ProfileOptionBuilder


def _extract_node(run_meta, node_name):
  """Collects the NodeExecStats for `node_name`, keyed by short device name."""
  ret = defaultdict(list)
  for dev_stat in run_meta.step_stats.dev_stats:
    # Normalize device names, e.g. '/job:x/.../device:GPU:0' -> 'gpu:0' and
    # '/device:gpu:0/stream:all' -> 'gpu:0/stream:all'.
    dev = dev_stat.device.lower()
    if dev.find('cpu:') > 0:
      dev = dev[dev.find('cpu:'):]
    elif dev.find('gpu:') > 0:
      dev = dev[dev.find('gpu:'):]
    elif '/host:cpu' not in dev:
      assert False, 'Unrecognized device name: %s' % dev

    for node_stat in dev_stat.node_stats:
      # Strip any ':...' suffix so the name matches the graph node name.
      nname = node_stat.node_name
      if nname.find(':') > 0:
        nname = nname[:nname.find(':')]
      if nname == node_name:
        ret[dev].append(node_stat)
  return ret


def _run_model():
  """Runs a single MatMul with full tracing enabled and profiles the run."""
  x = random_ops.random_normal(shape=[1, SIZE])
  w = random_ops.random_normal(shape=[SIZE, 2 * SIZE])
  y = math_ops.matmul(x, w)

  config = config_pb2.ConfigProto()
  config.graph_options.rewrite_options.arithmetic_optimization = (
      rewriter_config_pb2.RewriterConfig.OFF)
  with session.Session(config=config) as sess:
    run_metadata = config_pb2.RunMetadata()
    opts = builder.time_and_memory()
    opts['min_micros'] = 0
    opts['min_bytes'] = 0
    opts['order_by'] = 'name'
    opts['output'] = 'none'
    _ = sess.run(y,
                 options=config_pb2.RunOptions(
                     trace_level=config_pb2.RunOptions.FULL_TRACE),
                 run_metadata=run_metadata)

    tfprof_node = model_analyzer.profile(
        sess.graph,
        run_meta=run_metadata,
        options=opts)

    return tfprof_node, run_metadata


def _run_loop_model():
  """Runs the testlib model containing a while-loop RNN with full tracing."""
  with session.Session() as sess:
    x = lib.BuildFullModel()

    sess.run(variables.global_variables_initializer())
    run_meta = config_pb2.RunMetadata()
    _ = sess.run(x,
                 options=config_pb2.RunOptions(
                     trace_level=config_pb2.RunOptions.FULL_TRACE),
                 run_metadata=run_meta)

    opts = builder.time_and_memory()
    opts['order_by'] = 'name'
    opts['output'] = 'none'
    tfprof_node = model_analyzer.profile(
        sess.graph, run_meta, options=opts)
    return tfprof_node, run_meta


class RunMetadataTest(test.TestCase):

  @test_util.run_deprecated_v1
  def testGPU(self):
    if not test.is_gpu_available(cuda_only=True):
      return

    gpu_dev = test.gpu_device_name()
    ops.reset_default_graph()
    with ops.device(gpu_dev):
      tfprof_node, run_meta = _run_model()
      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
      self.assertGreater(tfprof_node.children[0].exec_micros, 10)

    ret = _extract_node(run_meta, 'MatMul')
    self.assertEqual(len(ret['gpu:0']), 1)
    self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)

  @test_util.run_deprecated_v1
  def testAllocationHistory(self):
    if not test.is_gpu_available(cuda_only=True):
      return

    gpu_dev = test.gpu_device_name()
    ops.reset_default_graph()
    with ops.device(gpu_dev):
      _, run_meta = _run_model()

    mm = _extract_node(run_meta, 'MatMul')['gpu:0'][0]
    mm_allocs = mm.memory[0].allocation_records
    # There should be one allocation and one deallocation record.
    self.assertEqual(len(mm_allocs), 2)
    # First, memory is allocated.
    self.assertGreater(mm_allocs[1].alloc_micros, mm_allocs[0].alloc_micros)
    self.assertGreater(mm_allocs[0].alloc_bytes, 0)
    # Then it is deallocated.
    self.assertLess(mm_allocs[1].alloc_bytes, 0)
    # All memory is deallocated.
    self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0)

    rand = _extract_node(
        run_meta, 'random_normal/RandomStandardNormal')['gpu:0'][0]
    random_allocs = rand.memory[0].allocation_records
    # The random normal tensor must be allocated first, since MatMul
    # depends on it.
    self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros)
    # Its memory is deallocated only after MatMul has started.
    self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)

  @test_util.run_deprecated_v1
  def testCPU(self):
    ops.reset_default_graph()
    with ops.device('/cpu:0'):
      tfprof_node, run_meta = _run_model()
      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
      self.assertGreater(tfprof_node.children[0].exec_micros, 0)

    ret = _extract_node(run_meta, 'MatMul')
    self.assertEqual(len(ret['cpu:0']), 1)

    ret = _extract_node(run_meta, 'MatMul:MatMul')
    self.assertEqual(len(ret), 0)

  @test_util.run_v1_only('b/120545219')
  def testLoopCPU(self):
    ops.reset_default_graph()
    with ops.device('/cpu:0'):
      tfprof_node, run_meta = _run_loop_model()
      # The while-loop causes the node to appear 4 times in the schedule.
      ret = _extract_node(run_meta,
                          'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['cpu:0']), 4)

      total_cpu_execs = 0
      for node in ret['cpu:0']:
        total_cpu_execs += node.op_end_rel_micros

      mm_node = lib.SearchTFProfNode(
          tfprof_node,
          'rnn/while/basic_rnn_cell/MatMul')

      self.assertEqual(mm_node.run_count, 4)
      self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
      self.assertEqual(mm_node.exec_micros, total_cpu_execs)

  def testGradientGraph(self):
    # Note: Please don't just adjust the test to make it pass.
    # The code view logic depends on it.
    ops.reset_default_graph()
    _, _ = _run_loop_model()
    graph = ops.get_default_graph()
    forward_op = set()
    backward_op = set()
    back_to_forward = dict()
    for op in graph.get_operations():
      if op.name.find('gradients/') > 0 and op.name.find('_grad/') > 0:
        backward_op.add(op.name)
        # Recover the forward op name between the 'gradients/' prefix
        # (10 characters) and the '_grad/' suffix.
        idx1 = op.name.find('gradients/') + 10
        idx2 = op.name.find('_grad/')
        back_to_forward[op.name] = op.name[idx1:idx2]
      else:
        forward_op.add(op.name)

    # Every backward op must map back to an existing forward op.
    for _, f in six.iteritems(back_to_forward):
      self.assertTrue(f in forward_op)

  def testLoopGPU(self):
    if not test.is_gpu_available():
      return

    ops.reset_default_graph()
    with ops.device('/device:GPU:0'):
      _, run_meta = _run_loop_model()
      # The while-loop causes the node to appear 4 times in the schedule.
      ret = _extract_node(run_meta,
                          'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)

      total_cpu_execs = 0
      for node in ret['gpu:0']:
        total_cpu_execs += node.op_end_rel_micros

      self.assertGreaterEqual(len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)


if __name__ == '__main__':
  test.main()