1syntax = "proto3";
2
3package tensorflow;
4
5import "tensorflow/core/framework/cost_graph.proto";
6import "tensorflow/core/framework/graph.proto";
7import "tensorflow/core/framework/step_stats.proto";
8import "tensorflow/core/protobuf/cluster.proto";
9import "tensorflow/core/protobuf/debug.proto";
10import "tensorflow/core/protobuf/rewriter_config.proto";
11
12option cc_enable_arenas = true;
13option java_outer_classname = "ConfigProtos";
14option java_multiple_files = true;
15option java_package = "org.tensorflow.framework";
16option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";
17
18message GPUOptions {
19  // Fraction of the available GPU memory to allocate for each process.
20  // 1 means to allocate all of the GPU memory, 0.5 means the process
21  // allocates up to ~50% of the available GPU memory.
22  //
23  // GPU memory is pre-allocated unless the allow_growth option is enabled.
24  //
25  // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
26  // the amount of memory available on the GPU device by using host memory as a
27  // swap space. Accessing memory not available on the device will be
28  // significantly slower as that would require memory transfer between the host
29  // and the device. Options to reduce the memory requirement should be
30  // considered before enabling this option as this may come with a negative
31  // performance impact. Oversubscription using the unified memory requires
32  // Pascal class or newer GPUs and it is currently only supported on the Linux
33  // operating system. See
34  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
35  // for the detailed requirements.
36  double per_process_gpu_memory_fraction = 1;
37
38  // If true, the allocator does not pre-allocate the entire specified
39  // GPU memory region, instead starting small and growing as needed.
40  bool allow_growth = 4;
41
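  // A hedged usage sketch (assuming the TF1-style Python API, where
  // tf.compat.v1.ConfigProto wraps this message):
  //
  //   import tensorflow as tf
  //
  //   config = tf.compat.v1.ConfigProto()
  //   # Cap this process at ~50% of GPU memory...
  //   config.gpu_options.per_process_gpu_memory_fraction = 0.5
  //   # ...and/or start small and grow allocations on demand.
  //   config.gpu_options.allow_growth = True
  //   sess = tf.compat.v1.Session(config=config)
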
  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code.  If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0" and "/device:GPU:1",
  // then one would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine.  This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts.  Users are required to use vendor
  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "platform GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;

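  // A hedged sketch of the "5,3" remapping described above (Python API
  // assumed; device ids are illustrative):
  //
  //   import tensorflow as tf
  //
  //   config = tf.compat.v1.ConfigProto()
  //   # Visible GPU 5 becomes /device:GPU:0, visible GPU 3 becomes
  //   # /device:GPU:1 for this process.
  //   config.gpu_options.visible_device_list = "5,3"
  //   sess = tf.compat.v1.Session(config=config)
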
  // In the event polling loop, sleep this many microseconds between
  // PollEvents calls when the queue is not empty.  If this value is not
  // set or is set to 0, it gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // This field is deprecated and ignored.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with CUDA
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as pinned memory. But in cases where the inference is
  // incomplete, this option can significantly speed up cross-device memory
  // copy performance as long as the data fits in memory.
  // Note that this option is not something that should be
  // enabled by default for unknown or very large models, since all CUDA pinned
  // memory is unpageable; having too much pinned memory might negatively impact
  // the overall host system performance.
  bool force_gpu_compatible = 8;

  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create a single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;

      // Priority values to use with the virtual devices. Use the CUDA function
      // cudaDeviceGetStreamPriorityRange to query the valid range of priority
      // values.
      //
      // On a P4000 GPU with CUDA 10.1, the priority range reported was 0 for
      // least priority and -1 for greatest priority.
      //
      // If this field is not specified, then the virtual devices will be
      // created with the default priority. If this field has values set, then
      // its size must match the size of memory_limit_mb above.
      repeated int32 priority = 2;
    }

    // The multi virtual device settings. If empty (not set), it will create
    // a single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string-represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the "memory_limit"
    // list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

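    // A minimal sketch of splitting one visible GPU into two virtual devices
    // (assumes exactly one visible GPU; sizes are illustrative):
    //
    //   import tensorflow as tf
    //
    //   config = tf.compat.v1.ConfigProto()
    //   config.gpu_options.experimental.virtual_devices.add(
    //       memory_limit_mb=[1024, 2048])
    //   # /device:GPU:0 gets 1024 MB, /device:GPU:1 gets 2048 MB.
    //   sess = tf.compat.v1.Session(config=config)
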
    // If true, uses CUDA unified memory for memory allocations. If
    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
    // memory is used regardless of the value for this field. See comments for
    // per_process_gpu_memory_fraction field for more details and requirements
    // of the unified memory. This option is useful to oversubscribe memory if
    // multiple processes are sharing a single GPU while individually using less
    // than 1.0 per process memory fraction.
    bool use_unified_memory = 2;

    // If > 1, the number of device-to-device copy streams to create
    // for each GPUDevice.  Default value is 0, which is automatically
    // converted to 1.
    int32 num_dev_to_dev_copy_streams = 3;

    // If non-empty, defines a good GPU ring order on a single worker based on
    // device interconnect.  This assumes that all workers have the same GPU
    // topology.  Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
    // This ring order is used by the RingReducer implementation of
    // CollectiveReduce, and serves as an override to automatic ring order
    // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
    string collective_ring_order = 4;

    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
    // keep track of when GPU memory is freed and when kernels actually
    // complete so that we can know when a nominally free memory chunk
    // is really not subject to pending use.
    bool timestamped_allocator = 5;

    // reserved id: 6

    // Parameters for GPUKernelTracker.  By default no kernel tracking is done.
    // Note that timestamped_allocator is only effective if some tracking is
    // specified.
    //
    // If kernel_tracker_max_interval = n > 0, then a tracking event
    // is inserted after every n kernels without an event.
    int32 kernel_tracker_max_interval = 7;
    // If kernel_tracker_max_bytes = n > 0, then a tracking event is
    // inserted after every series of kernels allocating a sum of
    // memory >= n.  If one kernel allocates b * n bytes, then one
    // event will be inserted after it, but it will count as b against
    // the pending limit.
    int32 kernel_tracker_max_bytes = 8;
    // If kernel_tracker_max_pending > 0 then no more than this many
    // tracking events can be outstanding at a time.  An attempt to
    // launch an additional kernel will stall until an event
    // completes.
    int32 kernel_tracker_max_pending = 9;

    // The BFC allocator can return an allocated chunk of memory up to 2x the
    // requested size. For virtual devices with tight memory constraints, and
    // proportionately large allocation requests, this can lead to a significant
    // reduction in available memory. The threshold below controls when a chunk
    // should be split if the chunk size exceeds the requested memory size. It is
    // expressed as a fraction of total available memory for the TF device. For
    // example, setting it to 0.05 would imply a chunk needs to be split if its
    // size exceeds the requested memory by 5% of the total virtual device/GPU
    // memory size.
    double internal_fragmentation_fraction = 10;

    // When true, use the CUDA cudaMallocAsync API instead of the TF GPU
    // allocator.
    bool use_cuda_malloc_async = 11;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  Experimental experimental = 9;
}

// Options passed to the graph optimizer.
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  // Note: optimization level L1 will override this setting to true. So in
  // order to disable common subexpression elimination, the opt_level has to be
  // set to L0.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  // Note: optimization level L1 will override this setting to true. So in
  // order to disable constant folding, the opt_level has to be set to L0.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined with constant nodes. To avoid inserting too large constants,
  // the size of each constant created can be limited. If this value is zero, a
  // default limit of 10 MiB will be applied. If constant folding optimization
  // is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level.
  enum Level {
    // L1 is the default level.
    // Optimizations performed at L1:
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations.
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

  // Control the use of the compiler/jit.  Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive.  Higher values may reduce opportunities for parallelism
    // and may use more memory.  (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
}
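
// A hedged sketch of disabling these optimizations from Python (the
// tf.compat.v1.OptimizerOptions constants mirror the Level enum above):
//
//   import tensorflow as tf
//
//   config = tf.compat.v1.ConfigProto()
//   # L1 re-enables CSE and constant folding, so drop to L0 to turn them off.
//   config.graph_options.optimizer_options.opt_level = (
//       tf.compat.v1.OptimizerOptions.L0)
//   sess = tf.compat.v1.Session(config=config)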

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process.  In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
}
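
// A sketch of adjusting graph rewriting via rewrite_options (assuming the
// generated rewriter_config_pb2 Python module; field and enum names as in
// RewriterConfig):
//
//   import tensorflow as tf
//   from tensorflow.core.protobuf import rewriter_config_pb2
//
//   config = tf.compat.v1.ConfigProto()
//   config.graph_options.rewrite_options.constant_folding = (
//       rewriter_config_pb2.RewriterConfig.OFF)
//   sess = tf.compat.v1.Session(config=config)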

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  // - a global threadpool associated with this name is looked
  //   up or created. This allows, for example, sharing one threadpool across
  //   many sessions (e.g., like the default behavior, if
  //   inter_op_parallelism_threads is not configured), but still partitioning
  //   into a large and small pool.
  // - if the threadpool for this global_name already exists, then it is an
  //   error if the existing pool was created using a different num_threads
  //   value than is specified on this call.
  // - threadpools created this way are never garbage collected.
  string global_name = 2;
}

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;

  // The compression algorithm to be used. One of "deflate", "gzip".
  string compression_algorithm = 2;

  // If compression_algorithm is set, the compression level to be used.
  // From 0 (no compression), up to 3.
  int32 compression_level = 3;

  // Setting cache_rpc_response to true will enable sender-side caching of
  // responses for RecvTensorAsync and RecvBufAsync to allow the receiver to
  // retry requests. This is only necessary when the network fabric is
  // experiencing a significant error rate.  Without it we'll fail a step on a
  // network error, while with it we'll be able to complete long steps (like
  // complex initializations) in the face of some network errors during
  // RecvTensor.
  bool cache_rpc_response = 4;

  // Disables TCP connection sharing when opening a new RPC channel.
  bool disable_session_connection_sharing = 5;

  // Setting num_channels_per_target > 0 allows use of multiple channels to
  // communicate to the same target. This can be used to improve the aggregate
  // throughput on high speed links (e.g., 100G) where a single connection is
  // not sufficient to maximize link utilization. Note that a single RPC only
  // goes over a single channel, so this only helps in situations where there
  // are multiple transfers to the same target overlapping in time.
  int32 num_channels_per_target = 6;
}
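
// A hedged sketch of enabling RPC compression for a distributed session
// (the grpc target address is illustrative):
//
//   import tensorflow as tf
//
//   config = tf.compat.v1.ConfigProto()
//   config.rpc_options.compression_algorithm = "gzip"
//   config.rpc_options.compression_level = 2
//   sess = tf.compat.v1.Session("grpc://worker0:2222", config=config)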

// Metadata about the session.
//
// This can be used by the runtime and the Ops for debugging, monitoring, etc.
//
// The (name, version) tuple is expected to be a unique identifier for
// sessions within the same process.
//
// NOTE: This is currently used and propagated only by the direct session.
message SessionMetadata {
  string name = 1;

  // The version is optional. If set, needs to be >= 0.
  int64 version = 2;
}

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use.  If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

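  // For example, a common sketch for forcing CPU-only execution is to map
  // "GPU" to zero devices:
  //
  //   import tensorflow as tf
  //
  //   config = tf.compat.v1.ConfigProto(device_count={"GPU": 0})
  //   sess = tf.compat.v1.Session(config=config)
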
  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  //
  // If you create an ordinary session, e.g., from Python or C++,
  // then there is exactly one intra op thread pool per process.
  // The first session created determines the number of threads in this pool.
  // All subsequent sessions reuse/share this one global pool.
  //
  // There are notable exceptions to the default behavior described above:
  // 1. There is an environment variable for overriding this thread pool,
  //    named TF_OVERRIDE_GLOBAL_THREADPOOL.
  // 2. When connecting to a server, such as a remote `tf.train.Server`
  //    instance, this option will be ignored altogether.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  // Negative means all operations are performed in the caller's thread.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

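  // A minimal sketch of pinning both pool sizes (values are illustrative):
  //
  //   import tensorflow as tf
  //
  //   config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=4,
  //                                     inter_op_parallelism_threads=2)
  //   sess = tf.compat.v1.Session(config=config)
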
  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  // regular compute) and one small pool (for periodic, low priority work);
  // using the small pool is currently the mechanism for limiting the inter-op
  // parallelism of the low priority work.  Note that it does not limit the
  // parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  // serving use cases.
  // - It is also generally recommended to set the global_name field of this
  // proto, to avoid creating multiple large pools. It is typically better to
  // run the non-low-priority work, even across sessions, in a single large
  // pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;

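  // A hedged sketch of the two-pool setup described above (pool names and
  // sizes are illustrative):
  //
  //   import tensorflow as tf
  //
  //   config = tf.compat.v1.ConfigProto()
  //   config.session_inter_op_thread_pool.add(num_threads=16,
  //                                           global_name="shared_large_pool")
  //   config.session_inter_op_thread_pool.add(num_threads=1)
  //   sess = tf.compat.v1.Session(config=config)
  //   # Route a low-priority run onto the small pool (index 1):
  //   opts = tf.compat.v1.RunOptions(inter_op_thread_pool=1)
  //   # sess.run(fetches, options=opts)
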
  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices which do not
  // match the filters. Each filter can be partially specified, e.g. "/job:ps",
  // "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the op,
  // or
  //   2. no GPU devices are known or registered,
  // or
  //   3. it needs to be co-located with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

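  // A common sketch combining the two options above: fall back to CPU when an
  // op has no GPU kernel, and log every placement decision:
  //
  //   import tensorflow as tf
  //
  //   config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
  //                                     log_device_placement=True)
  //   sess = tf.compat.v1.Session(config=config)
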
  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session.  If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions. However, when clusterspec propagation is
  // enabled, this field is ignored and sessions are always isolated.
  bool isolate_session_state = 15;

  // When true, WorkerSessions are created with device attributes from the
  // full cluster.
  // This is helpful when a worker wants to partition a graph
  // (for example during a PartitionedCallOp).
  bool share_cluster_devices_in_session = 17;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // Task name for group resolution.
    string collective_group_leader = 1;

    // We removed the flag client_handles_error_formatting. Marking the tag
    // number as reserved.
    // TODO(shikharagarwal): Should we just remove this tag so that it can be
    // used in the future for another purpose?
    reserved 2;

    // Which executor to use. The default executor will be used
    // if this is an empty string or "DEFAULT".
    string executor_type = 3;

    // Guidance to formatting of large RecvBuf fields for transfer.
    // Any positive value sets the max chunk size.  0 defaults to 4096.
    // Any negative value indicates no max, i.e. one chunk only.
    int32 recv_buf_max_chunk = 4;

    // If true, and supported by the platform, the runtime will attempt to
    // use NUMA affinity where applicable.  One consequence will be the
    // existence of as many CPU devices as there are available NUMA nodes.
    bool use_numa_affinity = 5;

    // If true, make collective op execution order sequential and deterministic
    // for potentially concurrent collective instances.
    bool collective_deterministic_sequential_execution = 6;

    // If true, use NCCL for CollectiveOps.  This feature is highly
    // experimental.
    bool collective_nccl = 7;

    // In the following, session state means the value of a variable, elements
    // in a hash table, or any other resource, accessible by worker sessions
    // held by a TF server.
    //
    // When ClusterSpec propagation is enabled, the value of
    // isolate_session_state is ignored when deciding whether to share session
    // states in a TF server (for backwards compatibility reasons).
    // - If share_session_state_in_clusterspec_propagation is true, the session
    // states are shared.
    // - If share_session_state_in_clusterspec_propagation is false, session
    // states are isolated.
    //
    // When clusterspec propagation is not used, the value of
    // share_session_state_in_clusterspec_propagation is ignored when deciding
    // whether to share session states in a TF server.
    // - If isolate_session_state is true, session states are isolated.
    // - If isolate_session_state is false, session states are shared.
    //
    // TODO(b/129330037): Add a single API that consistently treats
    // isolate_session_state and ClusterSpec propagation.
    bool share_session_state_in_clusterspec_propagation = 8;

    // If using a direct session, disable spinning while waiting for work in
    // the thread pool. This may result in higher latency for completing ops,
    // but in the case where there is a lot of spinning may result in lower
    // CPU usage.
    bool disable_thread_spinning = 9;

    // This was promoted to a non-experimental API. Please use
    // ConfigProto.share_cluster_devices_in_session instead.
    bool share_cluster_devices_in_session = 10;

    // Metadata about the session.
    //
    // If set, this can be used by the runtime and the Ops for debugging,
    // monitoring, etc.
    //
    // NOTE: This is currently used and propagated only by the direct session.
    SessionMetadata session_metadata = 11;

    // If true, the session may treat the graph as being static for optimization
    // purposes.
    //
    // If this option is set to true when a session is created, the full
    // GraphDef must be passed in a single call to Session::Create(), and
    // Session::Extend() may not be supported.
    bool optimize_for_static_graph = 12;

    // This field will eventually be deprecated and replaced by
    // mlir_bridge_rollout (b/166038521).
    //
    // Whether to enable the MLIR-based TF->XLA bridge.
    //
    // This is a replacement for the existing bridge, and is not ready for
    // production usage yet.
    // If this option is set to true when a session is created, MLIR is used to
    // perform the set of graph transformations to put the graph in a form that
    // can be executed with delegation of some computations to an accelerator.
    // This builds on the model of XLA where a subset of the graph is
    // encapsulated and attached to a "compile" operation, whose result is fed
    // to an "execute" operation. The kernel for these operations is responsible
    // for lowering the encapsulated graph to a particular device.
    bool enable_mlir_bridge = 13;

    // An enum that describes the state of the MLIR bridge rollout.
    enum MlirBridgeRollout {
      // If this field is left unspecified, the MLIR bridge may be selectively
      // enabled on a per graph basis.
      MLIR_BRIDGE_ROLLOUT_UNSPECIFIED = 0;
      // Enabling the MLIR bridge enables it for all graphs in this session.
      MLIR_BRIDGE_ROLLOUT_ENABLED = 1;
      // Disabling the MLIR bridge disables it for all graphs in this session.
      MLIR_BRIDGE_ROLLOUT_DISABLED = 2;
      // Enable the MLIR bridge on a per graph basis based on an analysis of
      // the features used in the graph. If the features used by the graph are
      // supported by the MLIR bridge, the MLIR bridge will be used to run the
      // graph.
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED = 3;
      // Enable the MLIR bridge in a fallback mode on a per graph basis based
      // on an analysis of the features used in the graph.
      // Running the MLIR bridge in fallback mode means that on success it is
      // executed and commits all changes to the TF graph; on failure it makes
      // no changes and lets the old bridge process the TF graph.
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED = 4;
    }
    // This field is under development; for now use enable_mlir_bridge
    // (b/166038521).
    //
    // Whether to enable the MLIR-based TF->XLA bridge.
    MlirBridgeRollout mlir_bridge_rollout = 17;

    // Whether to enable the MLIR-based Graph optimizations.
    //
    // This will become a part of the standard TensorFlow graph optimization
    // pipeline; currently it is only used for gradual migration and testing of
    // new passes that are replacing existing optimizations in Grappler.
    bool enable_mlir_graph_optimization = 16;

    // If true, the session will not store an additional copy of the graph for
    // each subgraph.
    //
    // If this option is set to true when a session is created, the
    // `RunOptions.output_partition_graphs` options must not be set.
    bool disable_output_partition_graphs = 14;

    // Minimum number of batches run through the XLA graph before the XLA fusion
    // autotuner is enabled. The default value of zero disables the autotuner.
    //
    // The XLA fusion autotuner can improve performance by executing a heuristic
    // search on the compiler parameters.
    int64 xla_fusion_autotuner_thresh = 15;

    // Whether runtime execution uses TFRT.
    bool use_tfrt = 18;

    // Distributed coordination service to be enabled if set.
    // Currently only effective in multi-client setups.
    string coordination_service = 19;

    // Whether the remote devices in the cluster should be fetched during setup
    // of a multi-client cluster. If enabled, the workers will run an extra
    // device information exchange step during startup and the workers'
    // EagerContexts will become aware of remote devices in the cluster as well.
    bool fetch_remote_devices_in_multi_client = 20;

    // Next: 21
  }

  Experimental experimental = 16;

  // Next: 18
}

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

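  // A hedged tracing sketch: request a full trace for one step and read the
  // collected timings back from RunMetadata (sess and fetches assumed):
  //
  //   import tensorflow as tf
  //
  //   opts = tf.compat.v1.RunOptions(
  //       trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
  //   meta = tf.compat.v1.RunMetadata()
  //   sess.run(fetches, options=opts, run_metadata=meta)
  //   # meta.step_stats now holds the traced events for this step.
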
  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  // To use the caller thread, set this to -1; this executes Session::Run()
  // on the caller thread and thus avoids a context switch. Using the
  // caller thread to execute Session::Run() should be done ONLY for simple
  // graphs, where the overhead of an additional context switch is
  // comparable with the overhead of Session::Run().
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // output via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // If non-zero, declares that this graph is going to use collective
    // ops and must synchronize step_ids with any other graph with this
    // same group_key value (in a distributed computation where tasks
    // run disjoint graphs).
    int64 collective_graph_key = 1;
    // If true, then operations (using the inter-op pool) across all
    // session::run() calls will be centrally scheduled, optimizing for (median
    // and tail) latency.
    // Consider using this option for CPU-bound workloads like inference.
    bool use_run_handler_pool = 2;
    // Options for the run handler thread pool.
    message RunHandlerPoolOptions {
      // Priority of the request. The run handler thread pool will schedule ops
      // based on the priority number. A larger number means higher priority.
      int64 priority = 1;
    }
    RunHandlerPoolOptions run_handler_pool_options = 3;
  }

  Experimental experimental = 8;

  reserved 4;
}

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;

  message FunctionGraphs {
    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
    repeated GraphDef partition_graphs = 1;

    GraphDef pre_optimization_graph = 2;
    GraphDef post_optimization_graph = 3;
  }
  // This is only populated for graphs that are run as functions in TensorFlow
  // V2. There will be an entry below for each function that is traced.
  // The main use cases of the post_optimization_graph and the partition_graphs
  // are to give the caller insight into the graphs that were actually run by
  // the runtime. Additional information (such as that in step_stats) will match
  // these graphs.
  // We also include the pre_optimization_graph since it is usually easier to
  // read, and is helpful in situations where the caller wants to get a high
  // level idea of what the built graph looks like (since the various graph
  // optimization passes might change the structure of the graph significantly).
  repeated FunctionGraphs function_graphs = 4;
}

// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // A tensor name. The value of this tensor will be substituted for
  // the tensor named in `to_tensor`.
  string from_tensor = 1;

  // A tensor name. The value of this tensor will be bound to the
  // value of the tensor named in `from_tensor`.
  string to_tensor = 2;
}

// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Tensors to be fed in the callable. Each feed is the name of a tensor.
  repeated string feed = 1;

  // Fetches. A list of tensor names. The caller of the callable expects a
  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
  // order of specified fetches does not change the execution order.
  repeated string fetch = 2;

  // Target Nodes. A list of node names. The named nodes will be run by the
  // callable but their outputs will not be returned.
  repeated string target = 3;

  // Options that will be applied to each run.
  RunOptions run_options = 4;

  // Tensors to be connected in the callable. Each TensorConnection denotes
  // a pair of tensors in the graph, between which an edge will be created
  // in the callable.
  repeated TensorConnection tensor_connection = 5;

  // The Tensor objects fed in the callable and fetched from the callable
  // are expected to be backed by host (CPU) memory by default.
  //
  // The options below allow changing that - feeding tensors backed by
  // device memory, or returning tensors that are backed by device memory.
  //
  // The maps below map the name of a feed/fetch tensor (which appears in
  // 'feed' or 'fetch' fields above), to the fully qualified name of the device
  // owning the memory backing the contents of the tensor.
  //
  // For example, creating a callable with the following options:
  //
  // CallableOptions {
  //   feed: "a:0"
  //   feed: "b:0"
  //
  //   fetch: "x:0"
  //   fetch: "y:0"
  //
  //   feed_devices: {
  //     "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  //
  //   fetch_devices: {
  //     "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  // }
  //
  // means that the Callable expects:
  // - The first argument ("a:0") is a Tensor backed by GPU memory.
  // - The second argument ("b:0") is a Tensor backed by host memory.
  // and of its return values:
  // - The first output ("x:0") will be backed by host memory.
  // - The second output ("y:0") will be backed by GPU memory.
  //
  // FEEDS:
  // It is the responsibility of the caller to ensure that the memory of the fed
  // tensors will be correctly initialized and synchronized before it is
  // accessed by operations executed during the call to Session::RunCallable().
  //
  // This is typically ensured by using the TensorFlow memory allocators
  // (Device::GetAllocator()) to create the Tensor to be fed.
  //
  // Alternatively, for CUDA-enabled GPU devices, this typically means that the
  // operation that produced the contents of the tensor has completed, i.e., the
  // CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or
  // cuStreamSynchronize()).
  map<string, string> feed_devices = 6;
  map<string, string> fetch_devices = 7;

  // By default, RunCallable() will synchronize the GPU stream before returning
  // fetched tensors on a GPU device, to ensure that the values in those tensors
  // have been produced. This simplifies interacting with the tensors, but
  // potentially incurs a performance hit.
  //
  // If this option is set to true, the caller is responsible for ensuring
  // that the values in the fetched tensors have been produced before they are
  // used. The caller can do this by invoking `Device::Sync()` on the underlying
  // device(s), or by feeding the tensors back to the same Session using
  // `feed_devices` with the same corresponding device name.
  bool fetch_skip_sync = 8;

  // Next: 9
}
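
// A hedged sketch of building these options from Python (config_pb2 is the
// generated module for this file; note that the callable-creation entry point
// on Session is not part of the stable public API):
//
//   from tensorflow.core.protobuf import config_pb2
//
//   opts = config_pb2.CallableOptions(feed=["a:0"], fetch=["x:0"])
//   opts.feed_devices["a:0"] = "/job:localhost/replica:0/task:0/device:GPU:0"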