syntax = "proto3";

package tensorflow;

option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";

message GPUOptions {
  // Fraction of the available GPU memory to allocate for each process.
  // 1 means to allocate all of the GPU memory, 0.5 means the process
  // allocates up to ~50% of the available GPU memory.
  //
  // GPU memory is pre-allocated unless the allow_growth option is enabled.
  //
  // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
  // the amount of memory available on the GPU device by using host memory as a
  // swap space. Accessing memory not available on the device will be
  // significantly slower as that would require memory transfer between the host
  // and the device. Options to reduce the memory requirement should be
  // considered before enabling this option as this may come with a negative
  // performance impact. Oversubscription using the unified memory requires
  // Pascal class or newer GPUs and it is currently only supported on the Linux
  // operating system. See
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
  // for the detailed requirements.
  double per_process_gpu_memory_fraction = 1;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code. If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices. For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
  // then one would specify this field as "5,3". This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine. This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts. Users are required to use vendor
  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "platform GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;
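
  // For illustration (the values below are arbitrary), a GPUOptions message
  // in text format such as
  //
  //   per_process_gpu_memory_fraction: 0.5
  //   allow_growth: true
  //   visible_device_list: "5,3"
  //
  // caps the process at roughly half of each visible GPU's memory, grows
  // allocations on demand instead of pre-allocating, and maps visible GPUs 5
  // and 3 to "/device:GPU:0" and "/device:GPU:1" respectively.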

  // In the event polling loop sleep this many microseconds between
  // PollEvents calls, when the queue is not empty. If value is not
  // set or set to 0, gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // This field is deprecated and ignored.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as the pinned memory. But in cases where the inference is
  // incomplete, this option can significantly speed up the cross-device memory
  // copy performance as long as it fits in memory.
  // Note that this option is not something that should be
  // enabled by default for unknown or very large models, since all Cuda pinned
  // memory is unpageable; having too much pinned memory might negatively impact
  // the overall host system performance.
  bool force_gpu_compatible = 8;

  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create a single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;
    }

    // The multi virtual device settings. If empty (not set), it will create a
    // single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the "memory_limit"
    // list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

    // If true, uses CUDA unified memory for memory allocations. If
    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
    // memory is used regardless of the value for this field. See comments for
    // per_process_gpu_memory_fraction field for more details and requirements
    // of the unified memory. This option is useful to oversubscribe memory if
    // multiple processes are sharing a single GPU while individually using less
    // than 1.0 per process memory fraction.
    bool use_unified_memory = 2;

    // If > 1, the number of device-to-device copy streams to create
    // for each GPUDevice. Default value is 0, which is automatically
    // converted to 1.
    int32 num_dev_to_dev_copy_streams = 3;

    // If non-empty, defines a good GPU ring order on a single worker based on
    // device interconnect. This assumes that all workers have the same GPU
    // topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
    // This ring order is used by the RingReducer implementation of
    // CollectiveReduce, and serves as an override to automatic ring order
    // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
    string collective_ring_order = 4;

    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
    // keep track of when GPU memory is freed and when kernels actually
    // complete so that we can know when a nominally free memory chunk
    // is really not subject to pending use.
    bool timestamped_allocator = 5;

    // reserved id: 6

    // Parameters for GPUKernelTracker. By default no kernel tracking is done.
    // Note that timestamped_allocator is only effective if some tracking is
    // specified.
    //
    // If kernel_tracker_max_interval = n > 0, then a tracking event
    // is inserted after every n kernels without an event.
    int32 kernel_tracker_max_interval = 7;
    // If kernel_tracker_max_bytes = n > 0, then a tracking event is
    // inserted after every series of kernels allocating a sum of
    // memory >= n. If one kernel allocates b * n bytes, then one
    // event will be inserted after it, but it will count as b against
    // the pending limit.
    int32 kernel_tracker_max_bytes = 8;
    // If kernel_tracker_max_pending > 0 then no more than this many
    // tracking events can be outstanding at a time. An attempt to
    // launch an additional kernel will stall until an event
    // completes.
    int32 kernel_tracker_max_pending = 9;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  Experimental experimental = 9;
}

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined, with constant nodes. To avoid inserting too large constants,
  // the size of each constant created can be limited. If this value is zero, a
  // default limit of 10 MiB will be applied. If constant folding optimization
  // is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;
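
  // For illustration (the limit below is arbitrary), an OptimizerOptions
  // message in text format such as
  //
  //   do_constant_folding: true
  //   max_folded_constant_in_bytes: 1048576
  //
  // enables constant folding but caps each constant created by folding at
  // 1 MiB instead of the default 10 MiB.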

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimization performed at L1:
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

  // Control the use of the compiler/jit. Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive. Higher values may reduce opportunities for parallelism
    // and may use more memory. (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
}

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process. In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
}

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  //   - a global threadpool associated with this name is looked
  //     up or created. This allows, for example, sharing one threadpool across
  //     many sessions (e.g., like the default behavior, if
  //     inter_op_parallelism_threads is not configured), but still partitioning
  //     into a large and small pool.
  //   - if the threadpool for this global_name already exists, then it is an
  //     error if the existing pool was created using a different num_threads
  //     value than is specified on this call.
  //   - threadpools created this way are never garbage collected.
  string global_name = 2;
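
  // For illustration (pool name and size are arbitrary), two sessions whose
  // ConfigProtos both contain
  //
  //   session_inter_op_thread_pool {
  //     num_threads: 2
  //     global_name: "low_priority_pool"
  //   }
  //
  // will look up or create one shared 2-thread pool named "low_priority_pool"
  // rather than each creating its own pool.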
}

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;

  // The compression algorithm to be used. One of "deflate", "gzip".
  string compression_algorithm = 2;

  // If compression_algorithm is set, the compression level to be used.
  // From 0 (no compression), up to 3.
  int32 compression_level = 3;

  // Setting cache_rpc_response to true will enable sender side caching of
  // response for RecvTensorAsync and RecvBufAsync to allow the receiver to
  // retry requests. This is only necessary when the network fabric is
  // experiencing a significant error rate. Without it we'll fail a step on a
  // network error, while with it we'll be able to complete long steps (like
  // complex initializations) in the face of some network errors during
  // RecvTensor.
  bool cache_rpc_response = 4;

  // Disables TCP connection sharing when opening a new RPC channel.
  bool disable_session_connection_sharing = 5;
}

// Metadata about the session.
//
// This can be used by the runtime and the Ops for debugging, monitoring, etc.
//
// The (name, version) tuple is expected to be a unique identifier for
// sessions within the same process.
//
// NOTE: This is currently used and propagated only by the direct session.
message SessionMetadata {
  string name = 1;

  // The version is optional. If set, needs to be >= 0.
  int64 version = 2;
}

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use. If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  //
  // If you create an ordinary session, e.g., from Python or C++,
  // then there is exactly one intra op thread pool per process.
  // The first session created determines the number of threads in this pool.
  // All subsequent sessions reuse/share this one global pool.
  //
  // There are notable exceptions to the default behavior described above:
  // 1. There is an environment variable for overriding this thread pool,
  //    named TF_OVERRIDE_GLOBAL_THREADPOOL.
  // 2. When connecting to a server, such as a remote `tf.train.Server`
  //    instance, then this option will be ignored altogether.
  int32 intra_op_parallelism_threads = 2;
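
  // For illustration (values are arbitrary), a ConfigProto in text format
  // such as
  //
  //   device_count { key: "GPU" value: 0 }
  //   intra_op_parallelism_threads: 4
  //
  // limits the session to zero GPU devices (i.e. CPU only) and parallelizes
  // individual ops over at most 4 threads.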

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  // Negative means all operations are performed in caller's thread.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  //   regular compute) and one small pool (for periodic, low priority work);
  //   using the small pool is currently the mechanism for limiting the inter-op
  //   parallelism of the low priority work. Note that it does not limit the
  //   parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  //   serving use cases.
  // - It is also generally recommended to set the global_name field of this
  //   proto, to avoid creating multiple large pools. It is typically better to
  //   run the non-low-priority work, even across sessions, in a single large
  //   pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices which do not
  // match the filters. Each filter can be partially specified, e.g. "/job:ps"
  // "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the OP
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. need to co-locate with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;
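
  // For illustration, a ConfigProto with
  //
  //   allow_soft_placement: true
  //   log_device_placement: true
  //
  // lets ops that have no GPU implementation fall back to CPU instead of
  // failing, and logs the device assigned to each op.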

  // Global timeout for all blocking operations in this session. If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions. However, when clusterspec propagation is
  // enabled, this field is ignored and sessions are always isolated.
  bool isolate_session_state = 15;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // Task name for group resolution.
    string collective_group_leader = 1;

    // We removed the flag client_handles_error_formatting. Marking the tag
    // number as reserved.
    // TODO(shikharagarwal): Should we just remove this tag so that it can be
    // used in the future for another purpose?
    reserved 2;

    // Which executor to use. The default executor will be used
    // if this is an empty string or "DEFAULT".
    string executor_type = 3;

    // Guidance to formatting of large RecvBuf fields for transfer.
    // Any positive value sets the max chunk size. 0 defaults to 4096.
    // Any negative value indicates no max, i.e. one chunk only.
    int32 recv_buf_max_chunk = 4;

    // If true, and supported by the platform, the runtime will attempt to
    // use NUMA affinity where applicable. One consequence will be the
    // existence of as many CPU devices as there are available NUMA nodes.
    bool use_numa_affinity = 5;

    // If true, make collective op execution order sequential and deterministic
    // for potentially concurrent collective instances.
    bool collective_deterministic_sequential_execution = 6;

    // If true, use NCCL for CollectiveOps. This feature is highly
    // experimental.
    bool collective_nccl = 7;

    // In the following, session state means the value of a variable, elements
    // in a hash table, or any other resource, accessible by worker sessions
    // held by a TF server.
    //
    // When ClusterSpec propagation is enabled, the value of
    // isolate_session_state is ignored when deciding whether to share session
    // states in a TF server (for backwards compatibility reasons).
    // - If share_session_state_in_clusterspec_propagation is true, the session
    //   states are shared.
    // - If share_session_state_in_clusterspec_propagation is false, session
    //   states are isolated.
    //
    // When clusterspec propagation is not used, the value of
    // share_session_state_in_clusterspec_propagation is ignored when deciding
    // whether to share session states in a TF server.
    // - If isolate_session_state is true, session states are isolated.
    // - If isolate_session_state is false, session states are shared.
    //
    // TODO(b/129330037): Add a single API that consistently treats
    // isolate_session_state and ClusterSpec propagation.
    bool share_session_state_in_clusterspec_propagation = 8;

    // If using a direct session, disable spinning while waiting for work in
    // the thread pool. This may result in higher latency for completing ops,
    // but, in cases where there is a lot of spinning, may result in lower
    // CPU usage.
    bool disable_thread_spinning = 9;

    // When true, WorkerSessions are created with device attributes from the
    // full cluster.
    // This is helpful when a worker wants to partition a graph
    // (for example during a PartitionedCallOp).
    bool share_cluster_devices_in_session = 10;

    // Metadata about the session.
    //
    // If set, this can be used by the runtime and the Ops for debugging,
    // monitoring, etc.
    //
    // NOTE: This is currently used and propagated only by the direct session.
    SessionMetadata session_metadata = 11;

    // If true, the session may treat the graph as being static for optimization
    // purposes.
    //
    // If this option is set to true when a session is created, the full
    // GraphDef must be passed in a single call to Session::Create(), and
    // Session::Extend() may not be supported.
    bool optimize_for_static_graph = 12;

    // Whether to enable the MLIR-based TF->XLA bridge.
    //
    // This is a replacement for the existing bridge, and is not ready for
    // production usage yet.
    // If this option is set to true when a session is created, MLIR is used to
    // perform the set of graph transformations to put the graph in a form that
    // can be executed with delegation of some computations to an accelerator.
    // This builds on the model of XLA where a subset of the graph is
    // encapsulated and attached to a "compile" operation, whose result is fed
    // to an "execute" operation. The kernel for these operations is responsible
    // for lowering the encapsulated graph to a particular device.
    bool enable_mlir_bridge = 13;

    // If true, the session will not store an additional copy of the graph for
    // each subgraph.
    //
    // If this option is set to true when a session is created, the
    // `RunOptions.output_partition_graphs` option must not be set.
    bool disable_output_partition_graphs = 14;

    // Minimum number of batches run through the XLA graph before XLA fusion
    // autotuner is enabled. Default value of zero disables the autotuner.
    //
    // The XLA fusion autotuner can improve performance by executing a heuristic
    // search on the compiler parameters.
    int64 xla_fusion_autotuner_thresh = 15;
  };

  Experimental experimental = 16;

  // Next: 17
}

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  // To use the caller thread, set this to -1 - this uses the caller thread
  // to execute Session::Run() and thus avoids a context switch. Using the
  // caller thread to execute Session::Run() should be done ONLY for simple
  // graphs, where the overhead of an additional context switch is
  // comparable with the overhead of Session::Run().
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // outputted via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;
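
  // For illustration (values are arbitrary), a RunOptions message such as
  //
  //   trace_level: FULL_TRACE
  //   timeout_in_ms: 60000
  //   inter_op_thread_pool: -1
  //
  // requests full tracing for the step, times the step out after 60 seconds,
  // and (when session_inter_op_thread_pool is configured) executes
  // Session::Run() on the caller thread.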

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // If non-zero, declares that this graph is going to use collective
    // ops and must synchronize step_ids with any other graph with this
    // same group_key value (in a distributed computation where tasks
    // run disjoint graphs).
    int64 collective_graph_key = 1;
    // If true, then operations (using the inter-op pool) across all
    // session::run() calls will be centrally scheduled, optimizing for (median
    // and tail) latency.
    // Consider using this option for CPU-bound workloads like inference.
    bool use_run_handler_pool = 2;
  };

  Experimental experimental = 8;

  reserved 4;
}

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;

  message FunctionGraphs {
    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
    repeated GraphDef partition_graphs = 1;

    GraphDef pre_optimization_graph = 2;
    GraphDef post_optimization_graph = 3;
  }
  // This is only populated for graphs that are run as functions in TensorFlow
  // V2. There will be an entry below for each function that is traced.
  // The main use cases of the post_optimization_graph and the partition_graphs
  // are to give the caller insight into the graphs that were actually run by
  // the runtime. Additional information (such as that in step_stats) will match
  // these graphs.
  // We also include the pre_optimization_graph since it is usually easier to
  // read, and is helpful in situations where the caller wants to get a high
  // level idea of what the built graph looks like (since the various graph
  // optimization passes might change the structure of the graph significantly).
  repeated FunctionGraphs function_graphs = 4;
}

// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // A tensor name. The value of this tensor will be substituted for
  // the tensor named in `to_tensor`.
  string from_tensor = 1;

  // A tensor name. The value of this tensor will be bound to the
  // value of the tensor named in `from_tensor`.
  string to_tensor = 2;
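
  // For example (the tensor names are purely illustrative), a connection with
  //
  //   from_tensor: "weights/read:0"
  //   to_tensor: "input:0"
  //
  // causes the value of "weights/read:0" to be used in place of "input:0"
  // wherever "input:0" is consumed in the callable.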
}

// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Tensors to be fed in the callable. Each feed is the name of a tensor.
  repeated string feed = 1;

  // Fetches. A list of tensor names. The caller of the callable expects a
  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
  // order of specified fetches does not change the execution order.
  repeated string fetch = 2;

  // Target Nodes. A list of node names. The named nodes will be run by the
  // callable but their outputs will not be returned.
  repeated string target = 3;

  // Options that will be applied to each run.
  RunOptions run_options = 4;

  // Tensors to be connected in the callable. Each TensorConnection denotes
  // a pair of tensors in the graph, between which an edge will be created
  // in the callable.
  repeated TensorConnection tensor_connection = 5;

  // The Tensor objects fed in the callable and fetched from the callable
  // are expected to be backed by host (CPU) memory by default.
  //
  // The options below allow changing that - feeding tensors backed by
  // device memory, or returning tensors that are backed by device memory.
  //
  // The maps below map the name of a feed/fetch tensor (which appears in
  // 'feed' or 'fetch' fields above) to the fully qualified name of the device
  // owning the memory backing the contents of the tensor.
  //
  // For example, creating a callable with the following options:
  //
  //   CallableOptions {
  //     feed: "a:0"
  //     feed: "b:0"
  //
  //     fetch: "x:0"
  //     fetch: "y:0"
  //
  //     feed_devices: {
  //       "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //     }
  //
  //     fetch_devices: {
  //       "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //     }
  //   }
  //
  // means that the Callable expects:
  // - The first argument ("a:0") is a Tensor backed by GPU memory.
  // - The second argument ("b:0") is a Tensor backed by host memory.
  // and of its return values:
  // - The first output ("x:0") will be backed by host memory.
  // - The second output ("y:0") will be backed by GPU memory.
  //
  // FEEDS:
  // It is the responsibility of the caller to ensure that the memory of the fed
  // tensors will be correctly initialized and synchronized before it is
  // accessed by operations executed during the call to Session::RunCallable().
  //
  // This is typically ensured by using the TensorFlow memory allocators
  // (Device::GetAllocator()) to create the Tensor to be fed.
  //
  // Alternatively, for CUDA-enabled GPU devices, this typically means that the
  // operation that produced the contents of the tensor has completed, i.e., the
  // CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or
  // cuStreamSynchronize()).
  map<string, string> feed_devices = 6;
  map<string, string> fetch_devices = 7;

  // By default, RunCallable() will synchronize the GPU stream before returning
  // fetched tensors on a GPU device, to ensure that the values in those tensors
  // have been produced. This simplifies interacting with the tensors, but
  // potentially incurs a performance hit.
  //
  // If this option is set to true, the caller is responsible for ensuring
  // that the values in the fetched tensors have been produced before they are
  // used. The caller can do this by invoking `Device::Sync()` on the underlying
  // device(s), or by feeding the tensors back to the same Session using
  // `feed_devices` with the same corresponding device name.
  bool fetch_skip_sync = 8;

  // Next: 9
}