/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_TPU_TPU_GLOBAL_INIT_H_
#define TENSORFLOW_CORE_TPU_TPU_GLOBAL_INIT_H_

#include "absl/strings/string_view.h"
#include "tensorflow/core/common_runtime/device_set.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/protobuf/tpu/topology.pb.h"
#include "tensorflow/core/public/session.h"

namespace tensorflow {

// Initializes the TPU system globally. The resulting initialization state can
// then be shared by different sessions running on the same TPUs within the
// same process. This API is provided for multi-tenant use cases, where
// multiple sessions in one process use the same set of TPUs.
//
// Returns an error status if initialization fails; on success, returns the
// TPU TopologyProto through the output parameter.
//
// REQUIRES:
// * Call this API before running any session that uses TPUs.
// * If you initialize via this API, do not also use the TPU configuration ops
//   in your graph. Doing both causes whichever initialization path runs
//   second to return an error.
//
// DISTRIBUTED SETUP:
// To properly initialize a TPU topology that extends beyond donut level, the
// caller must provide the following arguments correctly (see the usage sketch
// at the end of this file):
//
// 1. job_name
// The name of the job in the distributed setting. For example, if devices are
// named '/job:tpu_worker/replica:0/task:0/...', then "tpu_worker" is the
// desired job_name.
//
// 2. session_target
// The target string used to create a Session that runs the distributed TPU
// initialization graph. Generally this is the master session target of the
// cluster.
//
// 3. device_set
// The GLOBAL set of devices in the distributed setting, including the proper
// "TPU_SYSTEM" devices across all tasks.
// For example, for a 4x2 setup with 2 TPU workers, device_set should contain
// the two "TPU_SYSTEM" devices, one from each task, alongside the other,
// non-"TPU_SYSTEM" devices.
Status InitializeTPUSystemGlobally(absl::string_view job_name,
                                   absl::string_view session_target,
                                   const DeviceSet& device_set, Env* env,
                                   tpu::TopologyProto* tpu_topology);

Status InitializeTPUSystemGlobally(Env* env, tpu::TopologyProto* tpu_topology);

Status InitializeTPUSystemGlobally();

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_TPU_TPU_GLOBAL_INIT_H_
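
// -----------------------------------------------------------------------------
// Illustrative usage sketch (not part of this header). The job name
// "tpu_worker", the gRPC session target, and the helper function below are
// hypothetical placeholders for a two-worker distributed setup; only the
// InitializeTPUSystemGlobally declarations above come from this header.
//
//   namespace tensorflow {
//
//   Status InitDistributedTPU(const std::vector<Device*>& all_devices,
//                             tpu::TopologyProto* topology) {
//     // Build the GLOBAL DeviceSet: add every device from every task, which
//     // includes one "TPU_SYSTEM" device per TPU worker. DeviceSet does not
//     // own the devices; they must outlive this call.
//     DeviceSet device_set;
//     for (Device* d : all_devices) device_set.AddDevice(d);
//
//     // Both the job name and the master target are assumptions; substitute
//     // the values from your own cluster configuration.
//     return InitializeTPUSystemGlobally(
//         /*job_name=*/"tpu_worker",
//         /*session_target=*/"grpc://tpu_worker_master:8470",
//         device_set, Env::Default(), topology);
//   }
//
//   }  // namespace tensorflow
//
// For a single-host (single-donut) setup, the simpler overloads suffice, and
// no job name, session target, or device set needs to be supplied:
//
//   tensorflow::tpu::TopologyProto topology;
//   TF_RETURN_IF_ERROR(tensorflow::InitializeTPUSystemGlobally(
//       tensorflow::Env::Default(), &topology));
// -----------------------------------------------------------------------------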