1syntax = "proto3"; 2 3package tensorflow; 4 5import "tensorflow/core/framework/cost_graph.proto"; 6import "tensorflow/core/framework/graph.proto"; 7import "tensorflow/core/framework/step_stats.proto"; 8import "tensorflow/core/protobuf/cluster.proto"; 9import "tensorflow/core/protobuf/coordination_config.proto"; 10import "tensorflow/core/protobuf/debug.proto"; 11import "tensorflow/core/protobuf/rewriter_config.proto"; 12 13option cc_enable_arenas = true; 14option java_outer_classname = "ConfigProtos"; 15option java_multiple_files = true; 16option java_package = "org.tensorflow.framework"; 17option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto"; 18 19message GPUOptions { 20 // Fraction of the available GPU memory to allocate for each process. 21 // 1 means to allocate all of the GPU memory, 0.5 means the process 22 // allocates up to ~50% of the available GPU memory. 23 // 24 // GPU memory is pre-allocated unless the allow_growth option is enabled. 25 // 26 // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe 27 // the amount of memory available on the GPU device by using host memory as a 28 // swap space. Accessing memory not available on the device will be 29 // significantly slower as that would require memory transfer between the host 30 // and the device. Options to reduce the memory requirement should be 31 // considered before enabling this option as this may come with a negative 32 // performance impact. Oversubscription using the unified memory requires 33 // Pascal class or newer GPUs and it is currently only supported on the Linux 34 // operating system. See 35 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements 36 // for the detailed requirements. 37 double per_process_gpu_memory_fraction = 1; 38 39 // If true, the allocator does not pre-allocate the entire specified 40 // GPU memory region, instead starting small and growing as needed. 41 bool allow_growth = 4; 42 43 // The type of GPU allocation strategy to use. 44 // 45 // Allowed values: 46 // "": The empty string (default) uses a system-chosen default 47 // which may change over time. 48 // 49 // "BFC": A "Best-fit with coalescing" algorithm, simplified from a 50 // version of dlmalloc. 51 string allocator_type = 2; 52 53 // Delay deletion of up to this many bytes to reduce the number of 54 // interactions with gpu driver code. If 0, the system chooses 55 // a reasonable default (several MBs). 56 int64 deferred_deletion_bytes = 3; 57 58 // A comma-separated list of GPU ids that determines the 'visible' 59 // to 'virtual' mapping of GPU devices. For example, if TensorFlow 60 // can see 8 GPU devices in the process, and one wanted to map 61 // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", 62 // then one would specify this field as "5,3". This field is similar in 63 // spirit to the CUDA_VISIBLE_DEVICES environment variable, except 64 // it applies to the visible GPU devices in the process. 65 // 66 // NOTE: 67 // 1. The GPU driver provides the process with the visible GPUs 68 // in an order which is not guaranteed to have any correlation to 69 // the *physical* GPU id in the machine. This field is used for 70 // remapping "visible" to "virtual", which means this operates only 71 // after the process starts. Users are required to use vendor 72 // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the 73 // physical to visible device mapping prior to invoking TensorFlow. 74 // 2. 

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code. If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices. For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
  // then one would specify this field as "5,3". This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine. This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts. Users are required to use vendor
  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "platform GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;
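
  // Illustrative only: the "5,3" mapping described above, written as a proto
  // text format setting.
  //
  //   visible_device_list: "5,3"
  //
  // With this setting, visible GPU 5 appears as "/device:GPU:0" and visible
  // GPU 3 appears as "/device:GPU:1" inside the process.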

  // In the event polling loop, sleep this many microseconds between
  // PollEvents calls, when the queue is not empty. If the value is not
  // set or set to 0, it gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // This field is deprecated and ignored.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as the pinned memory. But in cases where the inference is
  // incomplete, this option can significantly speed up the cross-device memory
  // copy performance as long as it fits the memory.
  // Note that this option is not something that should be
  // enabled by default for unknown or very large models, since all Cuda pinned
  // memory is unpageable; having too much pinned memory might negatively impact
  // the overall host system performance.
  bool force_gpu_compatible = 8;

  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create a single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;

      // Priority values to use with the virtual devices. Use the cuda function
      // cudaDeviceGetStreamPriorityRange to query for the valid range of values
      // for priority.
      //
      // On a P4000 GPU with cuda 10.1, the priority range reported was 0 for
      // least priority and -1 for greatest priority.
      //
      // If this field is not specified, then the virtual devices will be
      // created with the default. If this field has values set, then its size
      // must match the size of memory_limit_mb above.
      repeated int32 priority = 2;

      // Virtual Device ordinal number determines the device ID of the device.
      // A Virtual device with a lower ordinal number always receives a
      // smaller device id. The physical device id and location in the
      // virtual device list are used to break ties.
      repeated int32 device_ordinal = 3;
    }

    // The multi virtual device settings. If empty (not set), it will create a
    // single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order of the virtual devices determined by
    // device_ordinal and the location in the virtual device list.
    //
    // For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
    //   virtual_devices { memory_limit: 3GB memory_limit: 4GB }
    // will create 4 virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with 3GB memory
    //   /device:GPU:3 -> visible GPU 0 with 4GB memory
    //
    // but
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB
    //                     device_ordinal: 10 device_ordinal: 20}
    //   virtual_devices { memory_limit: 3GB memory_limit: 4GB
    //                     device_ordinal: 10 device_ordinal: 20}
    // will create 4 virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory (ordinal 10)
    //   /device:GPU:1 -> visible GPU 0 with 3GB memory (ordinal 10)
    //   /device:GPU:2 -> visible GPU 1 with 2GB memory (ordinal 20)
    //   /device:GPU:3 -> visible GPU 0 with 4GB memory (ordinal 20)
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

    // If true, uses CUDA unified memory for memory allocations. If
    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
    // memory is used regardless of the value for this field. See comments for
    // per_process_gpu_memory_fraction field for more details and requirements
    // of the unified memory. This option is useful to oversubscribe memory if
    // multiple processes are sharing a single GPU while individually using less
    // than 1.0 per process memory fraction.
    bool use_unified_memory = 2;

    // If > 1, the number of device-to-device copy streams to create
    // for each GPUDevice. Default value is 0, which is automatically
    // converted to 1.
    int32 num_dev_to_dev_copy_streams = 3;

    // If non-empty, defines a good GPU ring order on a single worker based on
    // device interconnect. This assumes that all workers have the same GPU
    // topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
    // This ring order is used by the RingReducer implementation of
    // CollectiveReduce, and serves as an override to automatic ring order
    // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
    string collective_ring_order = 4;

    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
    // keep track of when GPU memory is freed and when kernels actually
    // complete so that we can know when a nominally free memory chunk
    // is really not subject to pending use.
    bool timestamped_allocator = 5;

    // reserved id: 6

    // Parameters for GPUKernelTracker. By default no kernel tracking is done.
    // Note that timestamped_allocator is only effective if some tracking is
    // specified.
    //
    // If kernel_tracker_max_interval = n > 0, then a tracking event
    // is inserted after every n kernels without an event.
    int32 kernel_tracker_max_interval = 7;
    // If kernel_tracker_max_bytes = n > 0, then a tracking event is
    // inserted after every series of kernels allocating a sum of
    // memory >= n. If one kernel allocates b * n bytes, then one
    // event will be inserted after it, but it will count as b against
    // the pending limit.
    int32 kernel_tracker_max_bytes = 8;
    // If kernel_tracker_max_pending > 0 then no more than this many
    // tracking events can be outstanding at a time. An attempt to
    // launch an additional kernel will stall until an event
    // completes.
    int32 kernel_tracker_max_pending = 9;
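
    // Illustrative only: a sketch of how the tracking knobs above might be
    // combined in proto text format. The numbers are arbitrary examples; the
    // point is that timestamped_allocator has no effect unless at least one
    // kernel_tracker_* limit is also set.
    //
    //   timestamped_allocator: true
    //   kernel_tracker_max_interval: 8
    //   kernel_tracker_max_pending: 4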

    // BFC Allocator can return an allocated chunk of memory up to 2x the
    // requested size. For virtual devices with tight memory constraints, and
    // proportionately large allocation requests, this can lead to a significant
    // reduction in available memory. The threshold below controls when a chunk
    // should be split if the chunk size exceeds the requested memory size. It is
    // expressed as a fraction of total available memory for the tf device. For
    // example, setting it to 0.05 would imply a chunk needs to be split if its
    // size exceeds the requested memory by 5% of the total virtual device/gpu
    // memory size.
    double internal_fragmentation_fraction = 10;

    // When true, use the CUDA cudaMallocAsync API instead of the TF GPU
    // allocator.
    bool use_cuda_malloc_async = 11;

    // By default, BFCAllocator may sleep when it runs out of memory, in the
    // hopes that another thread will free up memory in the meantime. Setting
    // this to true disables the sleep; instead we'll OOM immediately.
    bool disallow_retry_on_allocation_failure = 12;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  Experimental experimental = 9;
}

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  // Note: the optimization Level L1 will override this setting to true. So in
  // order to disable common subexpression elimination the opt_level has to be
  // set to L0.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  // Note: the optimization Level L1 will override this setting to true. So in
  // order to disable constant folding the opt_level has to be set to L0.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined, with constant nodes. To avoid inserting too large constants,
  // the size of each constant created can be limited. If this value is zero, a
  // default limit of 10 MiB will be applied. If constant folding optimization
  // is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimization performed at L1:
    //   1. Common subexpression elimination
    //   2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

  // Control the use of the compiler/jit. Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive. Higher values may reduce opportunities for parallelism
    // and may use more memory. (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;

  // CPU code will be autoclustered only if global_jit_level >= ON_1 and either:
  //  - this flag is true, or
  //  - TF_XLA_FLAGS contains --tf_xla_cpu_global_jit=true.
  bool cpu_global_jit = 7;
}
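
// Illustrative only: an OptimizerOptions sketch in proto text format that
// turns off the built-in graph optimizations. Because Level L1 force-enables
// common subexpression elimination and constant folding, opt_level must be
// lowered to L0 before those two flags take effect. The values are an
// example, not a recommendation.
//
//   optimizer_options {
//     opt_level: L0
//     do_common_subexpression_elimination: false
//     do_constant_folding: false
//     do_function_inlining: true
//   }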

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process. In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
}
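
// Illustrative only: a GraphOptions sketch in proto text format that collects
// a cost model after a short warm-up and annotates nodes with inferred
// shapes. The step counts are arbitrary example values.
//
//   graph_options {
//     build_cost_model: 10
//     build_cost_model_after: 5
//     infer_shapes: true
//     optimizer_options { opt_level: L1 global_jit_level: ON_1 }
//   }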

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  //   - a global threadpool associated with this name is looked
  //     up or created. This allows, for example, sharing one threadpool across
  //     many sessions (e.g., like the default behavior, if
  //     inter_op_parallelism_threads is not configured), but still partitioning
  //     into a large and small pool.
  //   - if the threadpool for this global_name already exists, then it is an
  //     error if the existing pool was created using a different num_threads
  //     value than is specified on this call.
  //   - threadpools created this way are never garbage collected.
  string global_name = 2;
}

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;

  // The compression algorithm to be used. One of "deflate", "gzip".
  string compression_algorithm = 2;

  // If compression_algorithm is set, the compression level to be used.
  // From 0 (no compression), up to 3.
  int32 compression_level = 3;

  // Setting cache_rpc_response to true will enable sender side caching of
  // responses for RecvTensorAsync and RecvBufAsync to allow the receiver to
  // retry requests. This is only necessary when the network fabric is
  // experiencing a significant error rate. Without it we'll fail a step on a
  // network error, while with it we'll be able to complete long steps (like
  // complex initializations) in the face of some network errors during
  // RecvTensor.
  bool cache_rpc_response = 4;

  // Disables TCP connection sharing when opening a new RPC channel.
  bool disable_session_connection_sharing = 5;

  // Setting num_channels_per_target > 0 allows use of multiple channels to
  // communicate to the same target. This can be used to improve the aggregate
  // throughput on high speed links (e.g. 100G) where a single connection is
  // not sufficient to maximize link utilization. Note that a single RPC only
  // goes on a single channel, so this only helps in situations where there are
  // multiple transfers to the same target overlapping in time.
  int32 num_channels_per_target = 6;
}
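
// Illustrative only: an RPCOptions sketch in proto text format that enables
// gzip compression at a moderate level and opens two channels per target to
// better fill a fast link. The values are example settings, not defaults.
//
//   rpc_options {
//     compression_algorithm: "gzip"
//     compression_level: 2
//     num_channels_per_target: 2
//   }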

// Metadata about the session.
//
// This can be used by the runtime and the Ops for debugging, monitoring, etc.
//
// The (name, version) tuple is expected to be a unique identifier for
// sessions within the same process.
//
// NOTE: This is currently used and propagated only by the direct session.
message SessionMetadata {
  string name = 1;

  // The version is optional. If set, needs to be >= 0.
  int64 version = 2;
}

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use. If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  //
  // If you create an ordinary session, e.g., from Python or C++,
  // then there is exactly one intra op thread pool per process.
  // The first session created determines the number of threads in this pool.
  // All subsequent sessions reuse/share this one global pool.
  //
  // There are notable exceptions to the default behavior described above:
  // 1. There is an environment variable for overriding this thread pool,
  //    named TF_OVERRIDE_GLOBAL_THREADPOOL.
  // 2. When connecting to a server, such as a remote `tf.train.Server`
  //    instance, then this option will be ignored altogether.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  // Negative means all operations are performed in caller's thread.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads (see the illustrative
  // sketch after this field):
  // - For example, a session may be configured to have one large pool (for
  //   regular compute) and one small pool (for periodic, low priority work);
  //   using the small pool is currently the mechanism for limiting the inter-op
  //   parallelism of the low priority work. Note that it does not limit the
  //   parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  //   serving use cases.
  // - It is also generally recommended to set the global_name field of this
  //   proto, to avoid creating multiple large pools. It is typically better to
  //   run the non-low-priority work, even across sessions, in a single large
  //   pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
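
  // Illustrative only: one way the two-pool setup described above could look
  // in proto text format. Pool 0 is a small pool for periodic low-priority
  // work; pool 1 is a large pool given a global_name so other sessions in the
  // process can reuse it. Pool sizes and the name are arbitrary examples.
  //
  //   session_inter_op_thread_pool { num_threads: 1 }
  //   session_inter_op_thread_pool {
  //     num_threads: 16
  //     global_name: "shared_large_pool"
  //   }
  //
  // A RunOptions.inter_op_thread_pool value of 0 or 1 then selects which of
  // these pools a particular Run call uses.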

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices which do not
  // match the filters. Each filter can be partially specified, e.g. "/job:ps"
  // "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the OP
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. it needs to be co-located with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session. If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions. However, when clusterspec propagation is
  // enabled, this field is ignored and sessions are always isolated.
  bool isolate_session_state = 15;

  // When true, WorkerSessions are created with device attributes from the
  // full cluster.
  // This is helpful when a worker wants to partition a graph
  // (for example during a PartitionedCallOp).
  bool share_cluster_devices_in_session = 17;
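
  // Illustrative only: a small but representative ConfigProto in proto text
  // format combining the fields above. The thread counts and device limit are
  // placeholders chosen for the example, not recommended defaults.
  //
  //   device_count { key: "GPU" value: 1 }
  //   intra_op_parallelism_threads: 4
  //   inter_op_parallelism_threads: 2
  //   allow_soft_placement: true
  //   log_device_placement: true
  //   gpu_options { allow_growth: true }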

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // Task name for group resolution.
    string collective_group_leader = 1;

    // We removed the flag client_handles_error_formatting. Marking the tag
    // number as reserved.
    // TODO(shikharagarwal): Should we just remove this tag so that it can be
    // used in the future for another purpose?
    reserved 2;

    // Which executor to use. The default executor will be used
    // if this is an empty string or "DEFAULT".
    string executor_type = 3;

    // Guidance to formatting of large RecvBuf fields for transfer.
    // Any positive value sets the max chunk size. 0 defaults to 4096.
    // Any negative value indicates no max, i.e. one chunk only.
    int32 recv_buf_max_chunk = 4;

    // If true, and supported by the platform, the runtime will attempt to
    // use NUMA affinity where applicable. One consequence will be the
    // existence of as many CPU devices as there are available NUMA nodes.
    bool use_numa_affinity = 5;

    // If true, make collective op execution order sequential and deterministic
    // for potentially concurrent collective instances.
    bool collective_deterministic_sequential_execution = 6;

    // If true, use NCCL for CollectiveOps. This feature is highly
    // experimental.
    bool collective_nccl = 7;

    // In the following, session state means the value of a variable, elements
    // in a hash table, or any other resource, accessible by worker sessions
    // held by a TF server.
    //
    // When ClusterSpec propagation is enabled, the value of
    // isolate_session_state is ignored when deciding whether to share session
    // states in a TF server (for backwards compatibility reasons).
    // - If share_session_state_in_clusterspec_propagation is true, the session
    //   states are shared.
    // - If share_session_state_in_clusterspec_propagation is false, session
    //   states are isolated.
    //
    // When clusterspec propagation is not used, the value of
    // share_session_state_in_clusterspec_propagation is ignored when deciding
    // whether to share session states in a TF server.
    // - If isolate_session_state is true, session states are isolated.
    // - If isolate_session_state is false, session states are shared.
    //
    // TODO(b/129330037): Add a single API that consistently treats
    // isolate_session_state and ClusterSpec propagation.
    bool share_session_state_in_clusterspec_propagation = 8;

    // If using a direct session, disable spinning while waiting for work in
    // the thread pool. This may result in higher latency for completing ops,
    // but in the case where there is a lot of spinning may result in lower
    // CPU usage.
    bool disable_thread_spinning = 9;

    // This was promoted to a non-experimental API. Please use
    // ConfigProto.share_cluster_devices_in_session instead.
    bool share_cluster_devices_in_session = 10;

    // Metadata about the session.
    //
    // If set, this can be used by the runtime and the Ops for debugging,
    // monitoring, etc.
    //
    // NOTE: This is currently used and propagated only by the direct session.
    SessionMetadata session_metadata = 11;

    // If true, the session may treat the graph as being static for optimization
    // purposes.
    //
    // If this option is set to true when a session is created, the full
    // GraphDef must be passed in a single call to Session::Create(), and
    // Session::Extend() may not be supported.
    bool optimize_for_static_graph = 12;
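
    // Illustrative only: an Experimental block in proto text format that tags
    // the session with metadata and declares the graph static. The name and
    // version are placeholder values invented for the example.
    //
    //   experimental {
    //     session_metadata { name: "serving_session" version: 1 }
    //     optimize_for_static_graph: true
    //   }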

    // This field will eventually be deprecated and replaced by
    // mlir_bridge_rollout (b/166038521).
    //
    // Whether to enable the MLIR-based TF->XLA bridge.
    //
    // This is a replacement to the existing bridge, and not ready for
    // production usage yet.
    // If this option is set to true when a session is created, MLIR is used to
    // perform the set of graph transformations to put the graph in a form that
    // can be executed with delegation of some computations to an accelerator.
    // This builds on the model of XLA where a subset of the graph is
    // encapsulated and attached to a "compile" operation, whose result is fed
    // to an "execute" operation. The kernel for these operations is responsible
    // for lowering the encapsulated graph to a particular device.
    bool enable_mlir_bridge = 13;

    // An enum that describes the state of the MLIR bridge rollout.
    enum MlirBridgeRollout {
      // If this field is left unspecified, the MLIR bridge may be selectively
      // enabled on a per graph basis.
      MLIR_BRIDGE_ROLLOUT_UNSPECIFIED = 0;
      // Enabling the MLIR bridge enables it for all graphs in this session.
      MLIR_BRIDGE_ROLLOUT_ENABLED = 1;
      // Disabling the MLIR bridge disables it for all graphs in this session.
      MLIR_BRIDGE_ROLLOUT_DISABLED = 2;
      // Enable the MLIR bridge on a per graph basis based on an analysis of
      // the features used in the graph. If the features used by the graph are
      // supported by the MLIR bridge, the MLIR bridge will be used to run the
      // graph.
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED = 3;
      // Enable the MLIR bridge in a fallback mode on a per graph basis based
      // on an analysis of the features used in the graph.
      // Running the MLIR bridge in the fallback mode means that it is
      // executed and it commits all the changes to the TF graph in case
      // of success; in case of failure it does not, and the old bridge
      // processes the TF graph instead.
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED = 4;
    }
    // This field is under development; for now use enable_mlir_bridge
    // (b/166038521).
    //
    // Whether to enable the MLIR-based TF->XLA bridge.
    MlirBridgeRollout mlir_bridge_rollout = 17;

    // Whether to enable the MLIR-based Graph optimizations.
    //
    // This will become a part of standard Tensorflow graph optimization
    // pipeline; currently this is only used for gradual migration and testing
    // new passes that are replacing existing optimizations in Grappler.
    bool enable_mlir_graph_optimization = 16;

    // If true, the session will not store an additional copy of the graph for
    // each subgraph.
    //
    // If this option is set to true when a session is created, the
    // `RunOptions.output_partition_graphs` options must not be set.
    bool disable_output_partition_graphs = 14;

    // Minimum number of batches run through the XLA graph before the XLA fusion
    // autotuner is enabled. Default value of zero disables the autotuner.
    //
    // The XLA fusion autotuner can improve performance by executing a heuristic
    // search on the compiler parameters.
    int64 xla_fusion_autotuner_thresh = 15;

    // Whether runtime execution uses TFRT.
    bool use_tfrt = 18;

    // The field "coordination_service" was previously specified as a string;
    // this has been replaced with a message below.
    reserved 19;

    // We removed the flag fetch_remote_devices_in_multi_client. Marking the tag
    // number as reserved.
    reserved 20;

    // Whether functional control flow op lowering should be disabled. This is
    // useful when executing within a portable runtime where control flow op
    // kernels may not be loaded due to selective registration.
    bool disable_functional_ops_lowering = 21;

    // Provides a hint to XLA auto clustering to prefer forming a single large
    // cluster that encompasses most of the graph.
    bool xla_prefer_single_graph_cluster = 22;

    // Distributed coordination service configurations.
    CoordinationServiceConfig coordination_config = 23;

    // Next: 24
  }

  Experimental experimental = 16;

  // Next: 18
}

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  // To use the caller thread set this to -1 - this uses the caller thread
  // to execute Session::Run() and thus avoids a context switch. Using the
  // caller thread to execute Session::Run() should be done ONLY for simple
  // graphs, where the overhead of an additional context switch is
  // comparable with the overhead of Session::Run().
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // outputted via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // If non-zero, declares that this graph is going to use collective
    // ops and must synchronize step_ids with any other graph with this
    // same group_key value (in a distributed computation where tasks
    // run disjoint graphs).
    int64 collective_graph_key = 1;
    // If true, then operations (using the inter-op pool) across all
    // session::run() calls will be centrally scheduled, optimizing for (median
    // and tail) latency.
    // Consider using this option for CPU-bound workloads like inference.
    bool use_run_handler_pool = 2;
    // Options for run handler thread pool.
    message RunHandlerPoolOptions {
      // Priority of the request. The run handler thread pool will schedule ops
      // based on the priority number. A larger number means higher priority.
      int64 priority = 1;
    }
    RunHandlerPoolOptions run_handler_pool_options = 3;
  }

  Experimental experimental = 8;

  reserved 4;
}
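
// Illustrative only: a RunOptions sketch in proto text format for a single
// traced step with a deadline. inter_op_thread_pool: 1 selects the second
// entry of session_inter_op_thread_pool if one was configured; the timeout is
// an arbitrary example value.
//
//   trace_level: FULL_TRACE
//   timeout_in_ms: 60000
//   inter_op_thread_pool: 1
//   report_tensor_allocations_upon_oom: true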

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;

  message FunctionGraphs {
    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
    repeated GraphDef partition_graphs = 1;

    GraphDef pre_optimization_graph = 2;
    GraphDef post_optimization_graph = 3;
  }
  // This is only populated for graphs that are run as functions in TensorFlow
  // V2. There will be an entry below for each function that is traced.
  // The main use cases of the post_optimization_graph and the partition_graphs
  // are to give the caller insight into the graphs that were actually run by
  // the runtime. Additional information (such as those in step_stats) will
  // match these graphs.
  // We also include the pre_optimization_graph since it is usually easier to
  // read, and is helpful in situations where the caller wants to get a high
  // level idea of what the built graph looks like (since the various graph
  // optimization passes might change the structure of the graph significantly).
  repeated FunctionGraphs function_graphs = 4;

  // Metadata about the session.
  SessionMetadata session_metadata = 5;
}

// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // A tensor name. The value of this tensor will be substituted for
  // the tensor named in `to_tensor`.
  string from_tensor = 1;

  // A tensor name. The value of this tensor will be bound to the
  // value of the tensor named in `from_tensor`.
  string to_tensor = 2;
}
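
// Illustrative only: a TensorConnection in proto text format that makes a
// placeholder read its value from another tensor already in the graph. The
// tensor names are placeholders invented for the example.
//
//   tensor_connection {
//     from_tensor: "weights/read:0"
//     to_tensor: "input_placeholder:0"
//   }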

// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Tensors to be fed in the callable. Each feed is the name of a tensor.
  repeated string feed = 1;

  // Fetches. A list of tensor names. The caller of the callable expects a
  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
  // order of specified fetches does not change the execution order.
  repeated string fetch = 2;

  // Target Nodes. A list of node names. The named nodes will be run by the
  // callable but their outputs will not be returned.
  repeated string target = 3;

  // Options that will be applied to each run.
  RunOptions run_options = 4;

  // Tensors to be connected in the callable. Each TensorConnection denotes
  // a pair of tensors in the graph, between which an edge will be created
  // in the callable.
  repeated TensorConnection tensor_connection = 5;

  // The Tensor objects fed in the callable and fetched from the callable
  // are expected to be backed by host (CPU) memory by default.
  //
  // The options below allow changing that - feeding tensors backed by
  // device memory, or returning tensors that are backed by device memory.
  //
  // The maps below map the name of a feed/fetch tensor (which appears in
  // 'feed' or 'fetch' fields above), to the fully qualified name of the device
  // owning the memory backing the contents of the tensor.
  //
  // For example, creating a callable with the following options:
  //
  // CallableOptions {
  //   feed: "a:0"
  //   feed: "b:0"
  //
  //   fetch: "x:0"
  //   fetch: "y:0"
  //
  //   feed_devices: {
  //     "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  //
  //   fetch_devices: {
  //     "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  // }
  //
  // means that the Callable expects:
  // - The first argument ("a:0") is a Tensor backed by GPU memory.
  // - The second argument ("b:0") is a Tensor backed by host memory.
  // and of its return values:
  // - The first output ("x:0") will be backed by host memory.
  // - The second output ("y:0") will be backed by GPU memory.
  //
  // FEEDS:
  // It is the responsibility of the caller to ensure that the memory of the fed
  // tensors will be correctly initialized and synchronized before it is
  // accessed by operations executed during the call to Session::RunCallable().
  //
  // This is typically ensured by using the TensorFlow memory allocators
  // (Device::GetAllocator()) to create the Tensor to be fed.
  //
  // Alternatively, for CUDA-enabled GPU devices, this typically means that the
  // operation that produced the contents of the tensor has completed, i.e., the
  // CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or
  // cuStreamSynchronize()).
  map<string, string> feed_devices = 6;
  map<string, string> fetch_devices = 7;

  // By default, RunCallable() will synchronize the GPU stream before returning
  // fetched tensors on a GPU device, to ensure that the values in those tensors
  // have been produced. This simplifies interacting with the tensors, but
  // potentially incurs a performance hit.
  //
  // If this option is set to true, the caller is responsible for ensuring
  // that the values in the fetched tensors have been produced before they are
  // used. The caller can do this by invoking `Device::Sync()` on the underlying
  // device(s), or by feeding the tensors back to the same Session using
  // `feed_devices` with the same corresponding device name.
  bool fetch_skip_sync = 8;

  // Next: 9
}