/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_

#include <vector>

#include "absl/base/thread_annotations.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace gpu {

// An implementation of the XLA GenericTransferManager that handles
// GPU-specific infeed and outfeed.
class GpuTransferManager : public GenericTransferManager {
 public:
  GpuTransferManager(se::Platform::Id id, unsigned pointer_size);
  ~GpuTransferManager() override;

  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                 const LiteralSlice& literal) override;
  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                    MutableBorrowingLiteral literal) override;
  Status ReadDynamicShapes(se::Stream* stream, ShapedBuffer* device_buffer,
                           Shape* device_shape) override;

 private:
  GpuTransferManager(const GpuTransferManager&) = delete;
  GpuTransferManager& operator=(const GpuTransferManager&) = delete;

  // This class keeps a pool of pinned memory
  // (StreamExecutor::HostMemoryAllocate()) that serves ReadDynamicShapes().
  // This is a bit of a hack: Callers like TensorFlow already have a full
  // pinned-memory allocator, and in theory we could use it here and elsewhere
  // in XLA. But because GpuTransferManager is a singleton, we can't really
  // access that.
  //
  // To keep things relatively simple, our allocator does the following.
  //
  //  - Allocate one chunk of 128 KiB of pinned memory.
  //  - Divide the chunk into 128-byte buffers.
  //  - During ReadDynamicShapes(), "check out" one buffer for each dynamic
  //    subshape and copy that subshape into the buffer. If a subshape doesn't
  //    fit or there are no free buffers, fall back to an unpinned memcpy.
  //
  // A 128-byte buffer is large enough to hold a shape of rank
  // 128 / sizeof(int32) = 32, which is much larger than we normally see in
  // XLA programs. A 128 KiB chunk is large enough to hold
  // 128 KiB / 128 B = 1024 dynamically-shaped buffers, which is also far more
  // than we should need, even when running multiple programs in parallel.
  //
  // This pool is lazily initialized on first use. It would be better to
  // initialize it in the constructor, but doing so poses a challenge in the
  // presence of multiple GPUs: we need a StreamExecutor in order to allocate
  // pinned memory. We don't care which GPU's SE we use, because SE allocates
  // pinned memory with the PORTABLE flag, making it available to all CUDA
  // contexts. But we do need to avoid calling platform->ExecutorForDevice()
  // for a device that we're not "supposed" to use, because this would create
  // a CUDA context for that device, consuming significant resources on the
  // GPU; see b/228207839.
  //
  // Lazy initialization works around this, because by the time we're called
  // we have a stream, and therefore an already-initialized StreamExecutor.
  void EnsurePinnedBuffersAllocated(se::StreamExecutor* executor)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);

  static constexpr int64_t kPinnedChunkBytes = 128 * 1024;
  static constexpr int64_t kPinnedBufferBytes = 128;

  absl::Mutex mu_;

  // The StreamExecutor on which our pinned memory was allocated. We use this
  // when freeing the pinned memory. Lazily initialized.
  se::StreamExecutor* pinned_chunk_se_ ABSL_GUARDED_BY(mu_) = nullptr;

  // Chunk of pinned memory of size kPinnedChunkBytes. The pointers in
  // pinned_buffers_ point into this chunk. Lazily initialized.
  char* pinned_chunk_ ABSL_GUARDED_BY(mu_) = nullptr;

  // Host buffers for reading dynamic shapes. Each buffer has size
  // kPinnedBufferBytes. Lazily initialized.
  std::vector<void*> pinned_buffers_ ABSL_GUARDED_BY(mu_);
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_
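
// Illustrative sketch only (the real definition lives in the corresponding
// .cc file): how the lazily-initialized pinned-buffer pool described above
// could be set up. It assumes HostMemoryAllocate(size) hands back a raw
// pinned host pointer, as the pool comment suggests, and that the caller
// already holds mu_, per the ABSL_EXCLUSIVE_LOCKS_REQUIRED annotation.
//
//   void GpuTransferManager::EnsurePinnedBuffersAllocated(
//       se::StreamExecutor* executor) {
//     if (pinned_chunk_ != nullptr) {
//       return;  // An earlier ReadDynamicShapes() call already built the pool.
//     }
//     // Remember which executor owns the allocation so we can free it later.
//     pinned_chunk_se_ = executor;
//     pinned_chunk_ = reinterpret_cast<char*>(
//         executor->HostMemoryAllocate(kPinnedChunkBytes));
//     // Carve the chunk into kPinnedChunkBytes / kPinnedBufferBytes = 1024
//     // fixed-size buffers that ReadDynamicShapes() checks out one at a time.
//     for (int64_t offset = 0; offset < kPinnedChunkBytes;
//          offset += kPinnedBufferBytes) {
//       pinned_buffers_.push_back(pinned_chunk_ + offset);
//     }
//   }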