/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_

#include <vector>

#include "absl/base/thread_annotations.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace gpu {

// An implementation of the XLA GenericTransferManager that handles
// GPU-specific infeed, outfeed, and dynamic-shape reads.
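//
// A typical way callers reach this class is through the platform-registered
// TransferManager (an illustrative sketch, not a prescribed usage contract;
// `platform`, `executor`, and `literal` are placeholders supplied by the
// caller):
//
//   TF_ASSIGN_OR_RETURN(TransferManager* tm,
//                       TransferManager::GetForPlatform(platform));
//   TF_RETURN_IF_ERROR(tm->TransferLiteralToInfeed(executor, literal));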
class GpuTransferManager : public GenericTransferManager {
 public:
  GpuTransferManager(se::Platform::Id id, unsigned pointer_size);
  ~GpuTransferManager() override;

  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                 const LiteralSlice& literal) override;
  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                    MutableBorrowingLiteral literal) override;
  Status ReadDynamicShapes(se::Stream* stream, ShapedBuffer* device_buffer,
                           Shape* device_shape) override;

 private:
  GpuTransferManager(const GpuTransferManager&) = delete;
  GpuTransferManager& operator=(const GpuTransferManager&) = delete;

  // This class keeps a pool of pinned memory
  // (StreamExecutor::HostMemoryAllocate()) that serves ReadDynamicShapes().
  // This is a bit of a hack: Callers like TensorFlow already have a full pinned
  // memory allocator, and we could in theory use it here and elsewhere in XLA.
  // But because GpuTransferManager is a singleton, we can't really access that.
  //
  // To keep things relatively simple, our allocator does the following.
  //
  //  - Allocate one chunk of 128 KiB pinned memory.
  //  - Divide the chunk into 128-byte buffers.
  //  - During ReadDynamicShapes(), "check out" one buffer for each dynamic
  //    subshape.  Copy one subshape into one buffer.  If it doesn't fit or
  //    there are no free buffers, fall back to an unpinned memcpy; see the
  //    sketch below the declaration of EnsurePinnedBuffersAllocated().
  //
  // A 128-byte buffer is large enough to hold a shape of rank 128/sizeof(int32)
  // = 32, which is much larger than we normally see in XLA programs.  A 128 KiB
  // chunk is large enough to hold 128 KiB/128B = 1024 dynamically-shaped
  // buffers, which is also way larger than we should need, even if we're
  // running multiple programs in parallel.
  //
  // This pool is lazily initialized on first use.  It would be better to
  // initialize it in the constructor, but doing so poses a challenge in the
  // presence of multiple GPUs.  We need a StreamExecutor in order to allocate
  // pinned memory.  We don't care which GPU's SE we use, because SE allocates
  // pinned memory with the PORTABLE flag, making it available to all CUDA
  // contexts.  But we do need to avoid calling platform->ExecutorForDevice for
  // a device that we're not "supposed" to use, because this will create a CUDA
  // context for that device, consuming significant resources on the GPU,
  // b/228207839.
  //
  // Lazy initialization works around this, because at that point we have a
  // stream, and therefore we have an already-initialized StreamExecutor.
  void EnsurePinnedBuffersAllocated(se::StreamExecutor* executor)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
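  //
  // For illustration, a minimal sketch of that check-out / fall-back path
  // (this is not the actual implementation, which lives in the .cc file;
  // `metadata_size`, `device_metadata`, and `fallback_storage` are
  // hypothetical placeholder names):
  //
  //   absl::MutexLock lock(&mu_);
  //   EnsurePinnedBuffersAllocated(stream->parent());
  //   void* host_dst = nullptr;
  //   bool from_pool = false;
  //   if (!pinned_buffers_.empty() && metadata_size <= kPinnedBufferBytes) {
  //     host_dst = pinned_buffers_.back();  // "Check out" one pinned buffer.
  //     pinned_buffers_.pop_back();
  //     from_pool = true;
  //   } else {
  //     host_dst = fallback_storage.data();  // Unpinned host-memory fallback.
  //   }
  //   stream->ThenMemcpy(host_dst, device_metadata, metadata_size);
  //   // ...once the copy is done, a pooled buffer is returned to the pool:
  //   if (from_pool) pinned_buffers_.push_back(host_dst);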

  static constexpr int64_t kPinnedChunkBytes = 128 * 1024;
  static constexpr int64_t kPinnedBufferBytes = 128;

  absl::Mutex mu_;

  // The StreamExecutor on which our pinned memory was allocated.  We use this
  // when freeing the pinned memory.  Lazily initialized.
  se::StreamExecutor* pinned_chunk_se_ ABSL_GUARDED_BY(mu_) = nullptr;

  // Chunk of pinned memory of size kPinnedChunkBytes.  The pointers in
  // pinned_buffers_ point into this chunk.  Lazily initialized.
  char* pinned_chunk_ ABSL_GUARDED_BY(mu_) = nullptr;

  // Host buffers for reading dynamic shapes.  Each buffer has size
  // kPinnedBufferBytes.  Lazily initialized.
  std::vector<void*> pinned_buffers_ ABSL_GUARDED_BY(mu_);
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_