• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Profiler client APIs."""
16
17from tensorflow.python.framework import errors
18from tensorflow.python.profiler.internal import _pywrap_profiler
19
20from tensorflow.python.util.tf_export import tf_export
21
22_GRPC_PREFIX = 'grpc://'
23
24
@tf_export('profiler.experimental.client.trace', v1=[])
def trace(service_addr,
          logdir,
          duration_ms,
          worker_list='',
          num_tracing_attempts=3,
          options=None):
  """Sends gRPC requests to one or more profiler servers to perform on-demand profiling.

  This method will block the calling thread until it receives responses from all
  servers or until deadline expiration. Both single host and multiple host
  profiling are supported on CPU, GPU, and TPU.
  The profiled results will be saved by each server to the specified TensorBoard
  log directory (i.e. the directory where you save your model checkpoints). Use
  the TensorBoard profile plugin to view the visualization and analysis results.

  Args:
    service_addr: A comma delimited string of gRPC addresses of the workers to
      profile. A leading 'grpc://' on each address is stripped before the
      request is sent.
      e.g. service_addr='grpc://localhost:6009'
           service_addr='grpc://10.0.0.2:8466,grpc://10.0.0.3:8466'
           service_addr='grpc://localhost:12345,grpc://localhost:23456'
    logdir: Path to save profile data to, typically a TensorBoard log directory.
      This path must be accessible to both the client and server.
      e.g. logdir='gs://your_tb_dir'
    duration_ms: Duration of tracing or monitoring in milliseconds. Must be
      greater than zero.
    worker_list: An optional TPU only configuration. The list of workers to
      profile in the current session.
    num_tracing_attempts: Optional. Automatically retry N times when no trace
      event is collected (default 3).
    options: profiler.experimental.ProfilerOptions namedtuple for miscellaneous
      profiler options. It is converted to a dict via `_asdict()`, so a plain
      dict is not accepted here. `None` means no extra options.

  Returns:
    None. The profile data is written to `logdir` by the servers.

  Raises:
    InvalidArgumentError: For when arguments fail validation checks.
    UnavailableError: If no trace event was collected.

  Example usage (CPU/GPU):

  ```python
    # Start a profiler server before your model runs.
    tf.profiler.experimental.server.start(6009)
    # (Model code goes here).
    # Send gRPC request to the profiler server to collect a trace of your model.
    tf.profiler.experimental.client.trace('grpc://localhost:6009',
                                          '/nfs/tb_log', 2000)
  ```

  Example usage (Multiple GPUs):

  ```python
    # E.g. your worker IP addresses are 10.0.0.2, 10.0.0.3, 10.0.0.4, and you
    # would like to schedule start of profiling 1 second from now, for a
    # duration of 2 seconds.
    options = tf.profiler.experimental.ProfilerOptions(delay_ms=1000)
    tf.profiler.experimental.client.trace(
        'grpc://10.0.0.2:8466,grpc://10.0.0.3:8466,grpc://10.0.0.4:8466',
        'gs://your_tb_dir',
        2000,
        options=options)
  ```

  Example usage (TPU):

  ```python
    # Send gRPC request to a TPU worker to collect a trace of your model. A
    # profiler service has been started in the TPU worker at port 8466.
    # E.g. your TPU IP address is 10.0.0.2 and you want to profile for
    # 2 seconds.
    tf.profiler.experimental.client.trace('grpc://10.0.0.2:8466',
                                          'gs://your_tb_dir', 2000)
  ```

  Example usage (Multiple TPUs):

  ```python
    # Send gRPC request to a TPU pod to collect a trace of your model on
    # multiple TPUs. A profiler service has been started in all the TPU workers
    # at the port 8466.
    # E.g. your TPU IP addresses are 10.0.0.2, 10.0.0.3, 10.0.0.4, and you want
    # to profile for 2 seconds.
    tf.profiler.experimental.client.trace(
        'grpc://10.0.0.2:8466',
        'gs://your_tb_dir',
        2000,
        worker_list='10.0.0.2:8466,10.0.0.3:8466,10.0.0.4:8466')
  ```

  Launch TensorBoard and point it to the same logdir you provided to this API.

  ```shell
    # logdir can be gs://your_tb_dir as in the above examples.
    $ tensorboard --logdir=/nfs/tb_log
  ```

  Open your browser and go to localhost:6006/#profile to view profiling results.

  """
  # Reject non-positive durations up front, before any request is issued.
  if duration_ms <= 0:
    raise errors.InvalidArgumentError(None, None,
                                      'duration_ms must be greater than zero.')

  # `options` is a namedtuple; the native call is handed a plain dict instead
  # (empty when no options were supplied).
  opts = dict(options._asdict()) if options is not None else {}
  # NOTE(review): the fourth positional argument is hard-coded to True; its
  # meaning is defined by the _pywrap_profiler binding -- confirm there.
  _pywrap_profiler.trace(
      _strip_addresses(service_addr, _GRPC_PREFIX), logdir, worker_list, True,
      duration_ms, num_tracing_attempts, opts)
132
133
@tf_export('profiler.experimental.client.monitor', v1=[])
def monitor(service_addr, duration_ms, level=1):
  """Requests an on-demand monitoring summary from a profiler server over gRPC.

  The result is a lightweight performance summary of the running model. The
  call blocks the calling thread until the monitoring result has been
  received. Only Cloud TPU is currently supported by this method.

  Args:
    service_addr: gRPC address of the profiler service, e.g.
      grpc://10.0.0.2:8466. A leading 'grpc://' is removed before the request
      is made.
    duration_ms: How long to monitor, in milliseconds.
    level: Monitoring level, either 1 or 2. Level 2 is the more verbose of the
      two and reports more metrics.

  Returns:
    A string of monitoring output.

  Example usage:

  ```python
    # Poll the Cloud TPU repeatedly, printing one monitoring summary per
    # request.

    for query in range(0, 100):
      print(
        tf.profiler.experimental.client.monitor('grpc://10.0.0.2:8466', 1000))
  ```

  """
  bare_addr = _strip_prefix(service_addr, _GRPC_PREFIX)
  # NOTE(review): the trailing True is a fixed flag whose meaning is defined by
  # the _pywrap_profiler binding -- confirm there.
  summary = _pywrap_profiler.monitor(bare_addr, duration_ms, level, True)
  return summary
165
166
167def _strip_prefix(s, prefix):
168  return s[len(prefix):] if s.startswith(prefix) else s
169
170
def _strip_addresses(addresses, prefix):
  """Strips a leading `prefix` from each entry of a comma-delimited string.

  Args:
    addresses: Comma-delimited string of addresses.
    prefix: Prefix to remove from the start of each address, when present.

  Returns:
    The addresses re-joined with commas, in their original order.
  """
  entries = addresses.split(',')
  return ','.join(_strip_prefix(entry, prefix) for entry in entries)
173