/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/cleanup/cleanup.h"
#include "absl/synchronization/mutex.h"
#include "llvm/IR/DataLayout.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"

namespace xla {
namespace gpu {

// TODO(b/30467474) Once GPU infeed implementation settles, consider
// folding back the cpu and gpu infeed implementations into a generic
// one if possible.
GpuTransferManager::GpuTransferManager(se::Platform::Id id,
                                       unsigned pointer_size)
    : GenericTransferManager(id, pointer_size) {}

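// Frees the pinned host-memory chunk that ReadDynamicShapes may have
// allocated for metadata transfers.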
GpuTransferManager::~GpuTransferManager() {
  if (pinned_chunk_se_) {
    pinned_chunk_se_->HostMemoryDeallocate(pinned_chunk_);
  }
}

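// Infeed is delegated to the per-StreamExecutor InfeedManager, created on
// first use by GetOrCreateInfeedManager.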
Status GpuTransferManager::TransferLiteralToInfeed(
    se::StreamExecutor* executor, const LiteralSlice& literal) {
  return gpu::GetOrCreateInfeedManager(executor)->TransferLiteralToInfeed(
      executor, literal);
}

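// Outfeed mirrors infeed: the per-StreamExecutor OutfeedManager performs the
// actual device-to-host transfer.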
Status GpuTransferManager::TransferLiteralFromOutfeed(
    se::StreamExecutor* executor, MutableBorrowingLiteral literal) {
  return gpu::GetOrCreateOutfeedManager(executor)->TransferLiteralFromOutfeed(
      executor, literal);
}

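// Lazily allocates a single kPinnedChunkBytes chunk of pinned host memory on
// `executor` and carves it into kPinnedBufferBytes-sized buffers for
// ReadDynamicShapes to check out. Expected to be called with mu_ held, as the
// sole caller below does.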
void GpuTransferManager::EnsurePinnedBuffersAllocated(
    se::StreamExecutor* executor) {
  if (pinned_chunk_ != nullptr) {
    return;
  }

  pinned_chunk_se_ = executor;
  pinned_chunk_ =
      reinterpret_cast<char*>(executor->HostMemoryAllocate(kPinnedChunkBytes));
  static_assert(kPinnedChunkBytes % kPinnedBufferBytes == 0,
                "assumption of loop below");
  for (char* buf = pinned_chunk_; buf < pinned_chunk_ + kPinnedChunkBytes;
       buf += kPinnedBufferBytes) {
    pinned_buffers_.push_back(buf);
  }
}

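// Patches the dynamic dimension sizes of `device_shape` with the values the
// executed program wrote into device memory: for every non-static subshape,
// the trailing metadata of the corresponding buffer is copied back to the
// host (through pinned memory when possible) and written into the shape's
// dimensions.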
Status GpuTransferManager::ReadDynamicShapes(se::Stream* stream,
                                             ShapedBuffer* device_buffer,
                                             Shape* device_shape) {
  DCHECK(device_shape->is_dynamic());
  Shape original_device_shape = *device_shape;

  TF_ASSIGN_OR_RETURN(auto compiler,
                      Compiler::GetForPlatform(stream->parent()->platform()));
  auto shape_size_fn = compiler->ShapeSizeBytesFunction();

  // First, figure out which parts of `device_shape` are dynamic and where the
  // dynamic shapes live in GPU memory. We'll copy the bytes at the
  // DeviceMemoryBase into the Shape*'s dimensions.
  std::vector<std::pair<se::DeviceMemoryBase, Shape*>> copies;

  TF_RETURN_IF_ERROR(device_buffer->buffers().ForEachMutableElementWithStatus(
      [&](const ShapeIndex& index, se::DeviceMemoryBase* buffer) {
        const Shape& buffer_shape =
            ShapeUtil::GetSubshape(*device_shape, index);
        if (buffer_shape.IsTuple()) {
          return OkStatus();
        }
        Shape& device_sub_shape =
            *ShapeUtil::GetMutableSubshape(device_shape, index);
        if (device_sub_shape.is_static()) {
          return OkStatus();
        }

        // Read the dynamic shape metadata from device memory. The dimension
        // sizes are stored at the end of the buffer, after the array data.
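        // For example (assuming the default dense layout and no padding), an
        // f32[<=3,4] buffer holds 3*4*4 = 48 bytes of array data followed by
        // one int32 size per dimension, so offset = 48 and metadata_size = 8.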
        Shape buffer_shape_static = ShapeUtil::MakeStaticShape(buffer_shape);
        const int64_t offset = shape_size_fn(buffer_shape_static);
        int64_t metadata_size = shape_size_fn(buffer_shape) - offset;
        if (metadata_size == 0) {
          return InvalidArgument("Dynamic shape metadata size should not be 0");
        }

        auto buffer_8 = se::DeviceMemory<uint8_t>(*buffer);
        auto metadata_buffer =
            stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size);
        copies.push_back(std::make_pair(metadata_buffer, &device_sub_shape));

        return OkStatus();
      }));

  // Check out pinned memory for each buffer we want to copy. If there aren't
  // enough pinned buffers available, or if one of our buffers is so big it
  // doesn't fit, allocate an entry for it in fallback_buffers.
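  // (Pinned, i.e. page-locked, host memory lets the device-to-host copies
  // below run as truly asynchronous DMAs; copies into unpinned fallback
  // buffers typically go through an extra staging step, hence the warning
  // about slowness.)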
  std::vector<int32_t*> h2d_memcpy_dsts;
  std::vector<void*> checked_out_buffers;
  std::vector<std::unique_ptr<char[]>> fallback_buffers;

  // Return checked-out buffers at the end of this function.
  absl::Cleanup cleanup = [&] {
    absl::MutexLock lock(&mu_);
    pinned_buffers_.insert(pinned_buffers_.end(), checked_out_buffers.begin(),
                           checked_out_buffers.end());
  };

  {
    absl::MutexLock lock(&mu_);
    EnsurePinnedBuffersAllocated(stream->parent());

    for (const auto& src_dst : copies) {
      se::DeviceMemoryBase src = src_dst.first;
      if (!pinned_buffers_.empty() && src.size() <= kPinnedBufferBytes) {
        void* buf = pinned_buffers_.back();
        pinned_buffers_.pop_back();
        checked_out_buffers.push_back(buf);
        h2d_memcpy_dsts.push_back(reinterpret_cast<int32_t*>(buf));
      } else {
        LOG_FIRST_N(WARNING, 10)
            << "Unable to copy dynamic shape buffer of size " << src.size()
            << " to host using pinned memory. Falling back to unpinned "
               "memory, which may be slow.";
        fallback_buffers.push_back(std::make_unique<char[]>(src.size()));
        h2d_memcpy_dsts.push_back(
            reinterpret_cast<int32_t*>(fallback_buffers.back().get()));
      }
    }
  }

  // Enqueue the device-to-host copies of the shape metadata into the
  // h2d_memcpy_dsts buffers.
  for (int i = 0; i < copies.size(); i++) {
    se::DeviceMemoryBase src = copies[i].first;
    void* dst = h2d_memcpy_dsts[i];
    stream->ThenMemcpy(dst, src, src.size());
  }

  // Wait for all the async copies to complete, then write into device_shape.
  TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
  for (int i = 0; i < copies.size(); i++) {
    Shape* dst_shape = copies[i].second;
    int32_t* dst = h2d_memcpy_dsts[i];
    for (int64_t j = 0; j < dst_shape->rank(); j++) {
      dst_shape->mutable_dimensions()[j] = dst[j];
    }
  }

  device_shape->clear_dynamic_dimensions();
  TF_RET_CHECK(ShapeUtil::DynamicShapeIsCompatible(*device_shape,
                                                   original_device_shape));
  return OkStatus();
}

}  // namespace gpu
}  // namespace xla

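// Factory functions registered below. The pointer size handed to the transfer
// manager is taken from the backend's LLVM data layout (address space 0),
// which GenericTransferManager uses when computing byte sizes for tuple
// shapes.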
static std::unique_ptr<xla::TransferManager> CreateNVPTXTransferManager() {
  return std::make_unique<xla::gpu::GpuTransferManager>(
      /*id=*/stream_executor::cuda::kCudaPlatformId,
      /*pointer_size=*/llvm::DataLayout(xla::gpu::nvptx::DataLayout())
          .getPointerSize(0 /* default address space */));
}

static std::unique_ptr<xla::TransferManager> CreateAMDGPUTransferManager() {
  return std::make_unique<xla::gpu::GpuTransferManager>(
      /*id=*/stream_executor::rocm::kROCmPlatformId,
      /*pointer_size=*/llvm::DataLayout(xla::gpu::amdgpu::DataLayout())
          .getPointerSize(0 /* default address space */));
}
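
// Registers the factories above with TransferManager's platform registry at
// static-initialization time, so TransferManager::GetForPlatform can hand
// back a GpuTransferManager for the CUDA and ROCm platforms.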
static bool InitModule() {
  xla::TransferManager::RegisterTransferManager(
      stream_executor::cuda::kCudaPlatformId, &CreateNVPTXTransferManager);
  xla::TransferManager::RegisterTransferManager(
      stream_executor::rocm::kROCmPlatformId, &CreateAMDGPUTransferManager);
  return true;
}
static bool module_initialized = InitModule();