1syntax = "proto3"; 2 3package tensorflow; 4option cc_enable_arenas = true; 5option java_outer_classname = "ConfigProtos"; 6option java_multiple_files = true; 7option java_package = "org.tensorflow.framework"; 8// add go_package externally with copybara 9import "tensorflow/core/framework/cost_graph.proto"; 10import "tensorflow/core/framework/graph.proto"; 11import "tensorflow/core/framework/step_stats.proto"; 12import "tensorflow/core/protobuf/debug.proto"; 13import "tensorflow/core/protobuf/cluster.proto"; 14import "tensorflow/core/protobuf/rewriter_config.proto"; 15 16message GPUOptions { 17 // Fraction of the available GPU memory to allocate for each process. 18 // 1 means to allocate all of the GPU memory, 0.5 means the process 19 // allocates up to ~50% of the available GPU memory. 20 // 21 // GPU memory is pre-allocated unless the allow_growth option is enabled. 22 // 23 // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe 24 // the amount of memory available on the GPU device by using host memory as a 25 // swap space. Accessing memory not available on the device will be 26 // significantly slower as that would require memory transfer between the host 27 // and the device. Options to reduce the memory requirement should be 28 // considered before enabling this option as this may come with a negative 29 // performance impact. Oversubscription using the unified memory requires 30 // Pascal class or newer GPUs and it is currently only supported on the Linux 31 // operating system. See 32 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements 33 // for the detailed requirements. 34 double per_process_gpu_memory_fraction = 1; 35 36 // If true, the allocator does not pre-allocate the entire specified 37 // GPU memory region, instead starting small and growing as needed. 38 bool allow_growth = 4; 39 40 // The type of GPU allocation strategy to use. 41 // 42 // Allowed values: 43 // "": The empty string (default) uses a system-chosen default 44 // which may change over time. 45 // 46 // "BFC": A "Best-fit with coalescing" algorithm, simplified from a 47 // version of dlmalloc. 48 string allocator_type = 2; 49 50 // Delay deletion of up to this many bytes to reduce the number of 51 // interactions with gpu driver code. If 0, the system chooses 52 // a reasonable default (several MBs). 53 int64 deferred_deletion_bytes = 3; 54 55 // A comma-separated list of GPU ids that determines the 'visible' 56 // to 'virtual' mapping of GPU devices. For example, if TensorFlow 57 // can see 8 GPU devices in the process, and one wanted to map 58 // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", 59 // then one would specify this field as "5,3". This field is similar in 60 // spirit to the CUDA_VISIBLE_DEVICES environment variable, except 61 // it applies to the visible GPU devices in the process. 62 // 63 // NOTE: 64 // 1. The GPU driver provides the process with the visible GPUs 65 // in an order which is not guaranteed to have any correlation to 66 // the *physical* GPU id in the machine. This field is used for 67 // remapping "visible" to "virtual", which means this operates only 68 // after the process starts. Users are required to use vendor 69 // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the 70 // physical to visible device mapping prior to invoking TensorFlow. 71 // 2. In the code, the ids in this list are also called "platform GPU id"s, 72 // and the 'virtual' ids of GPU devices (i.e. the ids in the device 73 // name "/device:GPU:<id>") are also called "TF GPU id"s. Please 74 // refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h 75 // for more information. 76 string visible_device_list = 5; 77 78 // In the event polling loop sleep this many microseconds between 79 // PollEvents calls, when the queue is not empty. If value is not 80 // set or set to 0, gets set to a non-zero default. 81 int32 polling_active_delay_usecs = 6; 82 83 // This field is deprecated and ignored. 84 int32 polling_inactive_delay_msecs = 7; 85 86 // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow, 87 // enabling this option forces all CPU tensors to be allocated with Cuda 88 // pinned memory. Normally, TensorFlow will infer which tensors should be 89 // allocated as the pinned memory. But in case where the inference is 90 // incomplete, this option can significantly speed up the cross-device memory 91 // copy performance as long as it fits the memory. 92 // Note that this option is not something that should be 93 // enabled by default for unknown or very large models, since all Cuda pinned 94 // memory is unpageable, having too much pinned memory might negatively impact 95 // the overall host system performance. 96 bool force_gpu_compatible = 8; 97 98 message Experimental { 99 // Configuration for breaking down a visible GPU into multiple "virtual" 100 // devices. 101 message VirtualDevices { 102 // Per "virtual" device memory limit, in MB. The number of elements in 103 // the list is the number of virtual devices to create on the 104 // corresponding visible GPU (see "virtual_devices" below). 105 // If empty, it will create single virtual device taking all available 106 // memory from the device. 107 // 108 // For the concept of "visible" and "virtual" GPU, see the comments for 109 // "visible_device_list" above for more information. 110 repeated float memory_limit_mb = 1; 111 } 112 113 // The multi virtual device settings. If empty (not set), it will create 114 // single virtual device on each visible GPU, according to the settings 115 // in "visible_device_list" above. Otherwise, the number of elements in the 116 // list must be the same as the number of visible GPUs (after 117 // "visible_device_list" filtering if it is set), and the string represented 118 // device names (e.g. /device:GPU:<id>) will refer to the virtual 119 // devices and have the <id> field assigned sequentially starting from 0, 120 // according to the order they appear in this list and the "memory_limit" 121 // list inside each element. For example, 122 // visible_device_list = "1,0" 123 // virtual_devices { memory_limit: 1GB memory_limit: 2GB } 124 // virtual_devices {} 125 // will create three virtual devices as: 126 // /device:GPU:0 -> visible GPU 1 with 1GB memory 127 // /device:GPU:1 -> visible GPU 1 with 2GB memory 128 // /device:GPU:2 -> visible GPU 0 with all available memory 129 // 130 // NOTE: 131 // 1. It's invalid to set both this and "per_process_gpu_memory_fraction" 132 // at the same time. 133 // 2. Currently this setting is per-process, not per-session. Using 134 // different settings in different sessions within same process will 135 // result in undefined behavior. 136 repeated VirtualDevices virtual_devices = 1; 137 138 // If true, uses CUDA unified memory for memory allocations. If 139 // per_process_gpu_memory_fraction option is greater than 1.0, then unified 140 // memory is used regardless of the value for this field. See comments for 141 // per_process_gpu_memory_fraction field for more details and requirements 142 // of the unified memory. This option is useful to oversubscribe memory if 143 // multiple processes are sharing a single GPU while individually using less 144 // than 1.0 per process memory fraction. 145 bool use_unified_memory = 2; 146 147 // If > 1, the number of device-to-device copy streams to create 148 // for each GPUDevice. Default value is 0, which is automatically 149 // converted to 1. 150 int32 num_dev_to_dev_copy_streams = 3; 151 152 // If non-empty, defines a good GPU ring order on a single worker based on 153 // device interconnect. This assumes that all workers have the same GPU 154 // topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4". 155 // This ring order is used by the RingReducer implementation of 156 // CollectiveReduce, and serves as an override to automatic ring order 157 // generation in OrderTaskDeviceMap() during CollectiveParam resolution. 158 string collective_ring_order = 4; 159 160 // If true then extra work is done by GPUDevice and GPUBFCAllocator to 161 // keep track of when GPU memory is freed and when kernels actually 162 // complete so that we can know when a nominally free memory chunk 163 // is really not subject to pending use. 164 bool timestamped_allocator = 5; 165 166 // If > 0 limit the number of pending kernels on any compute 167 // stream to this number. 168 int32 pending_cap = 6; 169 } 170 171 // Everything inside experimental is subject to change and is not subject 172 // to API stability guarantees in 173 // https://www.tensorflow.org/guide/version_compat. 174 Experimental experimental = 9; 175}; 176 177// Options passed to the graph optimizer 178message OptimizerOptions { 179 // If true, optimize the graph using common subexpression elimination. 180 bool do_common_subexpression_elimination = 1; 181 182 // If true, perform constant folding optimization on the graph. 183 bool do_constant_folding = 2; 184 185 // Constant folding optimization replaces tensors whose values can be 186 // predetermined, with constant nodes. To avoid inserting too large constants, 187 // the size of each constant created can be limited. If this value is zero, a 188 // default limit of 10 MiB will be applied. If constant folding optimization 189 // is disabled, this value is ignored. 190 int64 max_folded_constant_in_bytes = 6; 191 192 // If true, perform function inlining on the graph. 193 bool do_function_inlining = 4; 194 195 // Optimization level 196 enum Level { 197 // L1 is the default level. 198 // Optimization performed at L1 : 199 // 1. Common subexpression elimination 200 // 2. Constant folding 201 L1 = 0; 202 203 // No optimizations 204 L0 = -1; 205 } 206 207 // Overall optimization level. The actual optimizations applied will be the 208 // logical OR of the flags that this level implies and any flags already set. 209 Level opt_level = 3; 210 211 // Control the use of the compiler/jit. Experimental. 212 enum GlobalJitLevel { 213 DEFAULT = 0; // Default setting ("off" now, but later expected to be "on") 214 OFF = -1; 215 // The following settings turn on compilation, with higher values being 216 // more aggressive. Higher values may reduce opportunities for parallelism 217 // and may use more memory. (At present, there is no distinction, but this 218 // is expected to change.) 219 ON_1 = 1; 220 ON_2 = 2; 221 } 222 GlobalJitLevel global_jit_level = 5; 223} 224 225message GraphOptions { 226 // Removed, use optimizer_options below. 227 reserved "skip_common_subexpression_elimination"; 228 reserved 1; 229 230 // If true, use control flow to schedule the activation of Recv nodes. 231 // (Currently ignored.) 232 bool enable_recv_scheduling = 2; 233 234 // Options controlling how graph is optimized. 235 OptimizerOptions optimizer_options = 3; 236 237 // The number of steps to run before returning a cost model detailing 238 // the memory usage and performance of each node of the graph. 0 means 239 // no cost model. 240 int64 build_cost_model = 4; 241 242 // The number of steps to skip before collecting statistics for the 243 // cost model. 244 int64 build_cost_model_after = 9; 245 246 // Annotate each Node with Op output shape data, to the extent it can 247 // be statically inferred. 248 bool infer_shapes = 5; 249 250 // Only place the subgraphs that are run, rather than the entire graph. 251 // 252 // This is useful for interactive graph building, where one might 253 // produce graphs that cannot be placed during the debugging 254 // process. In particular, it allows the client to continue work in 255 // a session after adding a node to a graph whose placement 256 // constraints are unsatisfiable. 257 bool place_pruned_graph = 6; 258 259 // If true, transfer float values between processes as bfloat16. 260 bool enable_bfloat16_sendrecv = 7; 261 262 // If > 0, record a timeline every this many steps. 263 // EXPERIMENTAL: This currently has no effect in MasterSession. 264 int32 timeline_step = 8; 265 266 // Options that control the type and amount of graph rewriting. 267 // Not currently configurable via the public Python API (i.e. there is no API 268 // stability guarantee if you import RewriterConfig explicitly). 269 RewriterConfig rewrite_options = 10; 270}; 271 272message ThreadPoolOptionProto { 273 // The number of threads in the pool. 274 // 275 // 0 means the system picks a value based on where this option proto is used 276 // (see the declaration of the specific field for more info). 277 int32 num_threads = 1; 278 279 // The global name of the threadpool. 280 // 281 // If empty, then the threadpool is made and used according to the scope it's 282 // in - e.g., for a session threadpool, it is used by that session only. 283 // 284 // If non-empty, then: 285 // - a global threadpool associated with this name is looked 286 // up or created. This allows, for example, sharing one threadpool across 287 // many sessions (e.g., like the default behavior, if 288 // inter_op_parallelism_threads is not configured), but still partitioning 289 // into a large and small pool. 290 // - if the threadpool for this global_name already exists, then it is an 291 // error if the existing pool was created using a different num_threads 292 // value as is specified on this call. 293 // - threadpools created this way are never garbage collected. 294 string global_name = 2; 295}; 296 297message RPCOptions { 298 // If true, always use RPC to contact the session target. 299 // 300 // If false (the default option), TensorFlow may use an optimized 301 // transport for client-master communication that avoids the RPC 302 // stack. This option is primarily for used testing the RPC stack. 303 bool use_rpc_for_inprocess_master = 1; 304 305 // The compression algorithm to be used. One of "deflate", "gzip". 306 string compression_algorithm = 2; 307 308 // If compression_algorithm is set, the compression level to be used. 309 // From 0 (no compression), up to 3. 310 int32 compression_level = 3; 311}; 312 313// Session configuration parameters. 314// The system picks appropriate values for fields that are not set. 315message ConfigProto { 316 // Map from device type name (e.g., "CPU" or "GPU" ) to maximum 317 // number of devices of that type to use. If a particular device 318 // type is not found in the map, the system picks an appropriate 319 // number. 320 map<string, int32> device_count = 1; 321 322 // The execution of an individual op (for some op types) can be 323 // parallelized on a pool of intra_op_parallelism_threads. 324 // 0 means the system picks an appropriate number. 325 int32 intra_op_parallelism_threads = 2; 326 327 // Nodes that perform blocking operations are enqueued on a pool of 328 // inter_op_parallelism_threads available in each process. 329 // 330 // 0 means the system picks an appropriate number. 331 // 332 // Note that the first Session created in the process sets the 333 // number of threads for all future sessions unless use_per_session_threads is 334 // true or session_inter_op_thread_pool is configured. 335 int32 inter_op_parallelism_threads = 5; 336 337 // If true, use a new set of threads for this session rather than the global 338 // pool of threads. Only supported by direct sessions. 339 // 340 // If false, use the global threads created by the first session, or the 341 // per-session thread pools configured by session_inter_op_thread_pool. 342 // 343 // This option is deprecated. The same effect can be achieved by setting 344 // session_inter_op_thread_pool to have one element, whose num_threads equals 345 // inter_op_parallelism_threads. 346 bool use_per_session_threads = 9; 347 348 // This option is experimental - it may be replaced with a different mechanism 349 // in the future. 350 // 351 // Configures session thread pools. If this is configured, then RunOptions for 352 // a Run call can select the thread pool to use. 353 // 354 // The intended use is for when some session invocations need to run in a 355 // background pool limited to a small number of threads: 356 // - For example, a session may be configured to have one large pool (for 357 // regular compute) and one small pool (for periodic, low priority work); 358 // using the small pool is currently the mechanism for limiting the inter-op 359 // parallelism of the low priority work. Note that it does not limit the 360 // parallelism of work spawned by a single op kernel implementation. 361 // - Using this setting is normally not needed in training, but may help some 362 // serving use cases. 363 // - It is also generally recommended to set the global_name field of this 364 // proto, to avoid creating multiple large pools. It is typically better to 365 // run the non-low-priority work, even across sessions, in a single large 366 // pool. 367 repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12; 368 369 // Assignment of Nodes to Devices is recomputed every placement_period 370 // steps until the system warms up (at which point the recomputation 371 // typically slows down automatically). 372 int32 placement_period = 3; 373 374 // When any filters are present sessions will ignore all devices which do not 375 // match the filters. Each filter can be partially specified, e.g. "/job:ps" 376 // "/job:worker/replica:3", etc. 377 repeated string device_filters = 4; 378 379 // Options that apply to all GPUs. 380 GPUOptions gpu_options = 6; 381 382 // Whether soft placement is allowed. If allow_soft_placement is true, 383 // an op will be placed on CPU if 384 // 1. there's no GPU implementation for the OP 385 // or 386 // 2. no GPU devices are known or registered 387 // or 388 // 3. need to co-locate with reftype input(s) which are from CPU. 389 bool allow_soft_placement = 7; 390 391 // Whether device placements should be logged. 392 bool log_device_placement = 8; 393 394 // Options that apply to all graphs. 395 GraphOptions graph_options = 10; 396 397 // Global timeout for all blocking operations in this session. If non-zero, 398 // and not overridden on a per-operation basis, this value will be used as the 399 // deadline for all blocking operations. 400 int64 operation_timeout_in_ms = 11; 401 402 // Options that apply when this session uses the distributed runtime. 403 RPCOptions rpc_options = 13; 404 405 // Optional list of all workers to use in this session. 406 ClusterDef cluster_def = 14; 407 408 // If true, any resources such as Variables used in the session will not be 409 // shared with other sessions. 410 bool isolate_session_state = 15; 411 412 // Everything inside Experimental is subject to change and is not subject 413 // to API stability guarantees in 414 // https://www.tensorflow.org/guide/version_compat. 415 message Experimental { 416 // Task name for group resolution. 417 string collective_group_leader = 1; 418 419 // We removed the flag client_handles_error_formatting. Marking the tag 420 // number as reserved. 421 // TODO(shikharagarwal): Should we just remove this tag so that it can be 422 // used in future for other purpose? 423 reserved 2; 424 425 // Which executor to use, the default executor will be used 426 // if it is an empty string or "DEFAULT" 427 string executor_type = 3; 428 429 // Guidance to formatting of large RecvBuf fields for transfer. 430 // Any positive value sets the max chunk size. 0 defaults to 4096. 431 // Any negative value indicates no max, i.e. one chunk only. 432 int32 recv_buf_max_chunk = 4; 433 434 // If true, and supported by the platform, the runtime will attempt to 435 // use NUMA affinity where applicable. One consequence will be the 436 // existence of as many CPU devices as there are available NUMA nodes. 437 bool use_numa_affinity = 5; 438 439 // If true, make collective op execution order sequential and deterministic 440 // for potentially concurrent collective instances. 441 bool collective_deterministic_sequential_execution = 6; 442 443 // If true, use NCCL for CollectiveOps. This feature is highly 444 // experimental. 445 bool collective_nccl = 7; 446 }; 447 448 Experimental experimental = 16; 449 450 // Next: 17 451}; 452 453// Options for a single Run() call. 454message RunOptions { 455 // TODO(pbar) Turn this into a TraceOptions proto which allows 456 // tracing to be controlled in a more orthogonal manner? 457 enum TraceLevel { 458 NO_TRACE = 0; 459 SOFTWARE_TRACE = 1; 460 HARDWARE_TRACE = 2; 461 FULL_TRACE = 3; 462 } 463 TraceLevel trace_level = 1; 464 465 // Time to wait for operation to complete in milliseconds. 466 int64 timeout_in_ms = 2; 467 468 // The thread pool to use, if session_inter_op_thread_pool is configured. 469 // To use the caller thread set this to -1 - this uses the caller thread 470 // to execute Session::Run() and thus avoids a context switch. Using the 471 // caller thread to execute Session::Run() should be done ONLY for simple 472 // graphs, where the overhead of an additional context switch is 473 // comparable with the overhead of Session::Run(). 474 int32 inter_op_thread_pool = 3; 475 476 // Whether the partition graph(s) executed by the executor(s) should be 477 // outputted via RunMetadata. 478 bool output_partition_graphs = 5; 479 480 // EXPERIMENTAL. Options used to initialize DebuggerState, if enabled. 481 DebugOptions debug_options = 6; 482 483 // When enabled, causes tensor allocation information to be included in 484 // the error message when the Run() call fails because the allocator ran 485 // out of memory (OOM). 486 // 487 // Enabling this option can slow down the Run() call. 488 bool report_tensor_allocations_upon_oom = 7; 489 490 // Everything inside Experimental is subject to change and is not subject 491 // to API stability guarantees in 492 // https://www.tensorflow.org/guide/version_compat. 493 message Experimental { 494 // If non-zero, declares that this graph is going to use collective 495 // ops and must synchronize step_ids with any other graph with this 496 // same group_key value (in a distributed computation where tasks 497 // run disjoint graphs). 498 int64 collective_graph_key = 1; 499 // If true, then operations (using the inter-op pool) across all 500 // session::run() calls will be centrally scheduled, optimizing for (median 501 // and tail) latency. 502 // Consider using this option for CPU-bound workloads like inference. 503 bool use_run_handler_pool = 2; 504 }; 505 506 Experimental experimental = 8; 507 508 reserved 4; 509} 510 511// Metadata output (i.e., non-Tensor) for a single Run() call. 512message RunMetadata { 513 // Statistics traced for this step. Populated if tracing is turned on via the 514 // "RunOptions" proto. 515 // EXPERIMENTAL: The format and set of events may change in future versions. 516 StepStats step_stats = 1; 517 518 // The cost graph for the computation defined by the run call. 519 CostGraphDef cost_graph = 2; 520 521 // Graphs of the partitions executed by executors. 522 repeated GraphDef partition_graphs = 3; 523 524 message FunctionGraphs { 525 // TODO(nareshmodi): Include some sort of function/cache-key identifier? 526 repeated GraphDef partition_graphs = 1; 527 528 GraphDef pre_optimization_graph = 2; 529 GraphDef post_optimization_graph = 3; 530 } 531 // This is only populated for graphs that are run as functions in TensorFlow 532 // V2. There will be an entry below for each function that is traced. 533 // The main use cases of the post_optimization_graph and the partition_graphs 534 // is to give the caller insight into the graphs that were actually run by the 535 // runtime. Additional information (such as those in step_stats) will match 536 // these graphs. 537 // We also include the pre_optimization_graph since it is usually easier to 538 // read, and is helpful in situations where the caller wants to get a high 539 // level idea of what the built graph looks like (since the various graph 540 // optimization passes might change the structure of the graph significantly). 541 repeated FunctionGraphs function_graphs = 4; 542} 543 544// Defines a connection between two tensors in a `GraphDef`. 545message TensorConnection { 546 // A tensor name. The value of this tensor will be substituted for 547 // the tensor named in `to_tensor`. 548 string from_tensor = 1; 549 550 // A tensor name. The value of this tensor will be bound to the 551 // value of the tensor named in `from_tensor`. 552 string to_tensor = 2; 553} 554 555// Defines a subgraph in another `GraphDef` as a set of feed points and nodes 556// to be fetched or executed. 557// 558// Compare with the arguments to `Session::Run()`. 559message CallableOptions { 560 // Tensors to be fed in the callable. Each feed is the name of a tensor. 561 repeated string feed = 1; 562 563 // Fetches. A list of tensor names. The caller of the callable expects a 564 // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The 565 // order of specified fetches does not change the execution order. 566 repeated string fetch = 2; 567 568 // Target Nodes. A list of node names. The named nodes will be run by the 569 // callable but their outputs will not be returned. 570 repeated string target = 3; 571 572 // Options that will be applied to each run. 573 RunOptions run_options = 4; 574 575 // Tensors to be connected in the callable. Each TensorConnection denotes 576 // a pair of tensors in the graph, between which an edge will be created 577 // in the callable. 578 repeated TensorConnection tensor_connection = 5; 579 580 // The Tensor objects fed in the callable and fetched from the callable 581 // are expected to be backed by host (CPU) memory by default. 582 // 583 // The options below allow changing that - feeding tensors backed by 584 // device memory, or returning tensors that are backed by device memory. 585 // 586 // The maps below map the name of a feed/fetch tensor (which appears in 587 // 'feed' or 'fetch' fields above), to the fully qualified name of the device 588 // owning the memory backing the contents of the tensor. 589 // 590 // For example, creating a callable with the following options: 591 // 592 // CallableOptions { 593 // feed: "a:0" 594 // feed: "b:0" 595 // 596 // fetch: "x:0" 597 // fetch: "y:0" 598 // 599 // feed_devices: { 600 // "a:0": "/job:localhost/replica:0/task:0/device:GPU:0" 601 // } 602 // 603 // fetch_devices: { 604 // "y:0": "/job:localhost/replica:0/task:0/device:GPU:0" 605 // } 606 // } 607 // 608 // means that the Callable expects: 609 // - The first argument ("a:0") is a Tensor backed by GPU memory. 610 // - The second argument ("b:0") is a Tensor backed by host memory. 611 // and of its return values: 612 // - The first output ("x:0") will be backed by host memory. 613 // - The second output ("y:0") will be backed by GPU memory. 614 // 615 // FEEDS: 616 // It is the responsibility of the caller to ensure that the memory of the fed 617 // tensors will be correctly initialized and synchronized before it is 618 // accessed by operations executed during the call to Session::RunCallable(). 619 // 620 // This is typically ensured by using the TensorFlow memory allocators 621 // (Device::GetAllocator()) to create the Tensor to be fed. 622 // 623 // Alternatively, for CUDA-enabled GPU devices, this typically means that the 624 // operation that produced the contents of the tensor has completed, i.e., the 625 // CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or 626 // cuStreamSynchronize()). 627 map<string, string> feed_devices = 6; 628 map<string, string> fetch_devices = 7; 629 630 // By default, RunCallable() will synchronize the GPU stream before returning 631 // fetched tensors on a GPU device, to ensure that the values in those tensors 632 // have been produced. This simplifies interacting with the tensors, but 633 // potentially incurs a performance hit. 634 // 635 // If this options is set to true, the caller is responsible for ensuring 636 // that the values in the fetched tensors have been produced before they are 637 // used. The caller can do this by invoking `Device::Sync()` on the underlying 638 // device(s), or by feeding the tensors back to the same Session using 639 // `feed_devices` with the same corresponding device name. 640 bool fetch_skip_sync = 8; 641 642 // Next: 9 643} 644