# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""test the RunMetadata proto."""

from collections import defaultdict

import six

from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.profiler import option_builder

# pylint: disable=g-bad-import-order
# XXX: these imports depend on pywrap_tensorflow and must come later.
from tensorflow.python.profiler import model_analyzer
from tensorflow.python.profiler.internal import model_analyzer_testlib as lib

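# Matrix dimension used by _run_model below; presumably chosen large enough
# that the MatMul registers measurable time and memory in the trace.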
SIZE = 1300
builder = option_builder.ProfileOptionBuilder


def _extract_node(run_meta, node_name):
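  """Collects the per-device node stats for `node_name` from a RunMetadata.

  Returns a dict mapping a normalized device name ('cpu:0', 'gpu:0', or a
  stream device such as 'gpu:0/stream:all') to the list of NodeExecStats
  recorded for `node_name` on that device.
  """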
  ret = defaultdict(list)
  for dev_stat in run_meta.step_stats.dev_stats:
    dev = dev_stat.device.lower()
    if dev.find('cpu:') > 0:
      dev = dev[dev.find('cpu:'):]
    elif dev.find('gpu:') > 0:
      dev = dev[dev.find('gpu:'):]
    elif '/host:cpu' not in dev:
      assert False, 'Unrecognized device name: %s' % dev

    for node_stat in dev_stat.node_stats:
      nname = node_stat.node_name
      if nname.find(':') > 0:
        nname = nname[:nname.find(':')]
      if nname == node_name:
        ret[dev].append(node_stat)
  return ret


def _run_model():
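  """Builds and runs a single matmul step with tracing enabled.

  Returns a (tfprof_node, run_metadata) pair: the profiler's view of the
  traced step and the raw RunMetadata proto filled in by session.run.
  """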
  x = random_ops.random_normal(shape=[1, SIZE])
  w = random_ops.random_normal(shape=[SIZE, 2 * SIZE])
  y = math_ops.matmul(x, w)

  config = config_pb2.ConfigProto()
  config.graph_options.rewrite_options.arithmetic_optimization = (
      rewriter_config_pb2.RewriterConfig.OFF)
  with session.Session(config=config) as sess:
    run_metadata = config_pb2.RunMetadata()
    opts = builder.time_and_memory()
    opts['min_micros'] = 0
    opts['min_bytes'] = 0
    opts['order_by'] = 'name'
    opts['output'] = 'none'
    _ = sess.run(
        y,
        options=config_pb2.RunOptions(
            trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
        run_metadata=run_metadata)
    tfprof_node = model_analyzer.profile(
        sess.graph, run_meta=run_metadata, options=opts)

    return tfprof_node, run_metadata


def _run_loop_model():
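  """Builds and runs a model containing a tf.while_loop (an RNN), traced.

  Returns a (tfprof_node, run_meta) pair, as in _run_model above.
  """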
  config = config_pb2.ConfigProto()
  # Grappler's remapper optimizer might fuse MatMul with BiasAdd.
  config.graph_options.rewrite_options.remapping = (
      rewriter_config_pb2.RewriterConfig.OFF)
  with session.Session(config=config) as sess:
    x = lib.BuildFullModel()

    sess.run(variables.global_variables_initializer())
    run_meta = config_pb2.RunMetadata()
    _ = sess.run(
        x,
        options=config_pb2.RunOptions(
            trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
        run_metadata=run_meta)

    opts = builder.time_and_memory()
    opts['order_by'] = 'name'
    opts['output'] = 'none'

    tfprof_node = model_analyzer.profile(sess.graph, run_meta, options=opts)
    return tfprof_node, run_meta


class RunMetadataTest(test.TestCase):

  # This test requires HARDWARE_TRACE or FULL_TRACE to be specified in order
  # to work as expected. Since we now run it with SOFTWARE_TRACE (see the
  # _run_model routine above), it is expected to fail because GPU device
  # tracers are not enabled.
  @test.disable_with_predicate(
      pred=test.is_built_with_rocm,
      skip_message='Test fails on ROCm when run without FULL_TRACE')
  @test_util.run_deprecated_v1
  def testGPU(self):
    if not test.is_gpu_available(cuda_only=True):
      return

    gpu_dev = test.gpu_device_name()
    ops.reset_default_graph()
    with ops.device(gpu_dev):
      tfprof_node, run_meta = _run_model()
      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
      self.assertGreater(tfprof_node.children[0].exec_micros, 10)

    ret = _extract_node(run_meta, 'MatMul')
    self.assertEqual(len(ret['gpu:0']), 1)

  @test_util.run_deprecated_v1
  def testAllocationHistory(self):
    if not test.is_gpu_available(cuda_only=True):
      return

    gpu_dev = test.gpu_device_name()
    ops.reset_default_graph()
    with ops.device(gpu_dev):
      _, run_meta = _run_model()

    mm = _extract_node(run_meta, 'MatMul')['gpu:0'][0]
    mm_allocs = mm.memory[0].allocation_records
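    # Each allocation record pairs a timestamp (alloc_micros) with a signed
    # byte delta (alloc_bytes); a negative delta records a deallocation.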
    # Has one allocation record and one deallocation record.
    self.assertEqual(len(mm_allocs), 2)
    # The memory is allocated first...
    self.assertGreater(mm_allocs[1].alloc_micros, mm_allocs[0].alloc_micros)
    self.assertGreater(mm_allocs[0].alloc_bytes, 0)
    # ...then deallocated.
    self.assertLess(mm_allocs[1].alloc_bytes, 0)
    # All memory is deallocated.
    self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0)

    rand = _extract_node(run_meta,
                         'random_normal/RandomStandardNormal')['gpu:0'][0]
    random_allocs = rand.memory[0].allocation_records
    # The random normal's output must be allocated before the matmul starts,
    # since the matmul depends on it.
    self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros)
    # It is deallocated after the matmul has started.
    self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)

  @test_util.run_deprecated_v1
  def testCPU(self):
    ops.reset_default_graph()
    with ops.device('/cpu:0'):
      tfprof_node, run_meta = _run_model()
      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
      self.assertGreater(tfprof_node.children[0].exec_micros, 0)

    ret = _extract_node(run_meta, 'MatMul')
    self.assertEqual(len(ret['cpu:0']), 1)

    ret = _extract_node(run_meta, 'MatMul:MatMul')
    self.assertEqual(len(ret), 0)

  @test_util.run_v1_only('b/120545219')
  def testLoopCPU(self):
    ops.reset_default_graph()
    with ops.device('/cpu:0'):
      tfprof_node, run_meta = _run_loop_model()
      # The while-loop causes the node to appear 4 times in scheduling.
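      # Each iteration of the tf.while_loop schedules the MatMul kernel once,
      # so its node stats are recorded once per iteration.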
      ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['cpu:0']), 4)

      total_cpu_execs = 0
      for node in ret['cpu:0']:
        total_cpu_execs += node.op_end_rel_micros

      mm_node = lib.SearchTFProfNode(tfprof_node,
                                     'rnn/while/basic_rnn_cell/MatMul')

      self.assertEqual(mm_node.run_count, 4)
      self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
      self.assertEqual(mm_node.exec_micros, total_cpu_execs)

  def testGradientGraph(self):
    # Note: Please don't just adjust the test to make it pass.
    # The code view logic depends on it.
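    # tf.gradients names backward ops 'gradients/<forward_scope>_grad/...',
    # so the substring between 'gradients/' and '_grad/' names the forward
    # scope each gradient op differentiates.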
    ops.reset_default_graph()
    _, _ = _run_loop_model()
    graph = ops.get_default_graph()
    forward_op = set()
    backward_op = set()
    back_to_forward = {}
    for op in graph.get_operations():
      if 'gradients/' in op.name and '_grad/' in op.name:
        backward_op.add(op.name)
        idx1 = op.name.find('gradients/') + len('gradients/')
        idx2 = op.name.find('_grad/')
        back_to_forward[op.name] = op.name[idx1:idx2]
      else:
        forward_op.add(op.name)

    for _, f in six.iteritems(back_to_forward):
      self.assertIn(f, forward_op)

  # This test requires HARDWARE_TRACE or FULL_TRACE to be specified in order
  # to work as expected. Since we now run it with SOFTWARE_TRACE (see the
  # _run_loop_model routine above), it is expected to fail because GPU device
  # tracers are not enabled.
  @test.disable_with_predicate(
      pred=test.is_built_with_rocm,
      skip_message='Test fails on ROCm when run without FULL_TRACE')
  def testLoopGPU(self):
    if not test.is_gpu_available():
      return

    ops.reset_default_graph()
    with ops.device('/device:GPU:0'):
      _, run_meta = _run_loop_model()
      # The while-loop causes the node to appear 4 times in scheduling.
      ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)

      total_cpu_execs = 0
      for node in ret['gpu:0']:
        total_cpu_execs += node.op_end_rel_micros

      # Kernel-level stats land on the GPU stream devices; the MatMul should
      # show up there at least 4 times as well.
      self.assertGreaterEqual(
          len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)


if __name__ == '__main__':
  test.main()