/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/cleanup/cleanup.h"
#include "absl/synchronization/mutex.h"
#include "llvm/IR/DataLayout.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"

namespace xla {
namespace gpu {

// TODO(b/30467474) Once the GPU infeed implementation settles, consider
// folding the CPU and GPU infeed implementations back into a generic one
// if possible.
GpuTransferManager::GpuTransferManager(se::Platform::Id id,
                                       unsigned pointer_size)
    : GenericTransferManager(id, pointer_size) {}

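// Frees the pinned host-memory chunk allocated by
// EnsurePinnedBuffersAllocated(), if one was ever created.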
GpuTransferManager::~GpuTransferManager() {
  if (pinned_chunk_se_) {
    pinned_chunk_se_->HostMemoryDeallocate(pinned_chunk_);
  }
}

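// Delegates infeed transfers to the InfeedManager associated with `executor`.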
Status GpuTransferManager::TransferLiteralToInfeed(
    se::StreamExecutor* executor, const LiteralSlice& literal) {
  return gpu::GetOrCreateInfeedManager(executor)->TransferLiteralToInfeed(
      executor, literal);
}

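// Delegates outfeed transfers to the OutfeedManager associated with
// `executor`.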
Status GpuTransferManager::TransferLiteralFromOutfeed(
    se::StreamExecutor* executor, MutableBorrowingLiteral literal) {
  return gpu::GetOrCreateOutfeedManager(executor)->TransferLiteralFromOutfeed(
      executor, literal);
}

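// Lazily allocates one pinned host-memory chunk on first use and carves it
// into kPinnedBufferBytes-sized staging buffers for ReadDynamicShapes().
// Called with mu_ held.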
void GpuTransferManager::EnsurePinnedBuffersAllocated(
    se::StreamExecutor* executor) {
  if (pinned_chunk_ != nullptr) {
    return;
  }

  pinned_chunk_se_ = executor;
  pinned_chunk_ =
      reinterpret_cast<char*>(executor->HostMemoryAllocate(kPinnedChunkBytes));
  static_assert(kPinnedChunkBytes % kPinnedBufferBytes == 0,
                "assumption of loop below");
  for (char* buf = pinned_chunk_; buf < pinned_chunk_ + kPinnedChunkBytes;
       buf += kPinnedBufferBytes) {
    pinned_buffers_.push_back(buf);
  }
}

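// Copies the dynamic-dimension metadata of every dynamically shaped buffer in
// `device_buffer` back to the host and patches the corresponding dimensions
// of `device_shape`, which becomes a static shape on success.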
Status GpuTransferManager::ReadDynamicShapes(se::Stream* stream,
                                             ShapedBuffer* device_buffer,
                                             Shape* device_shape) {
  DCHECK(device_shape->is_dynamic());
  Shape original_device_shape = *device_shape;

  TF_ASSIGN_OR_RETURN(auto compiler,
                      Compiler::GetForPlatform(stream->parent()->platform()));
  auto shape_size_fn = compiler->ShapeSizeBytesFunction();

  // First, figure out which parts of `device_shape` are dynamic and where the
  // dynamic shapes live in GPU memory.  We'll copy the bytes at the
  // DeviceMemoryBase into the Shape*'s dimensions.
  std::vector<std::pair<se::DeviceMemoryBase, Shape*>> copies;

  TF_RETURN_IF_ERROR(device_buffer->buffers().ForEachMutableElementWithStatus(
      [&](const ShapeIndex& index, se::DeviceMemoryBase* buffer) {
        const Shape& buffer_shape =
            ShapeUtil::GetSubshape(*device_shape, index);
        if (buffer_shape.IsTuple()) {
          return OkStatus();
        }
        Shape& device_sub_shape =
            *ShapeUtil::GetMutableSubshape(device_shape, index);
        if (device_sub_shape.is_static()) {
          return OkStatus();
        }

        // Read the dynamic shape metadata from the device stream.  The dynamic
        // shape itself is stored at the end of the buffer.
        Shape buffer_shape_static = ShapeUtil::MakeStaticShape(buffer_shape);
        const int64_t offset = shape_size_fn(buffer_shape_static);
        int64_t metadata_size = shape_size_fn(buffer_shape) - offset;
        if (metadata_size == 0) {
          return InvalidArgument("Dynamic shape metadata size should not be 0");
        }

        auto buffer_8 = se::DeviceMemory<uint8_t>(*buffer);
        auto metadata_buffer =
            stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size);
        copies.push_back(std::make_pair(metadata_buffer, &device_sub_shape));

        return OkStatus();
      }));

  // Check out pinned memory for each buffer we want to copy.  If there aren't
  // enough pinned buffers available, or if one of our buffers is so big it
  // doesn't fit, allocate an entry for it in fallback_buffers.
  std::vector<int32_t*> h2d_memcpy_dsts;
  std::vector<void*> checked_out_buffers;
  std::vector<std::unique_ptr<char[]>> fallback_buffers;

  // Return checked-out buffers at the end of this function.
  absl::Cleanup cleanup = [&] {
    absl::MutexLock lock(&mu_);
    pinned_buffers_.insert(pinned_buffers_.end(), checked_out_buffers.begin(),
                           checked_out_buffers.end());
  };

  {
    absl::MutexLock lock(&mu_);
    EnsurePinnedBuffersAllocated(stream->parent());

    for (const auto& src_dst : copies) {
      se::DeviceMemoryBase src = src_dst.first;
      if (!pinned_buffers_.empty() && src.size() <= kPinnedBufferBytes) {
        void* buf = pinned_buffers_.back();
        pinned_buffers_.pop_back();
        checked_out_buffers.push_back(buf);
        h2d_memcpy_dsts.push_back(reinterpret_cast<int32_t*>(buf));
      } else {
        LOG_FIRST_N(WARNING, 10)
            << "Unable to copy dynamic shape buffer of size " << src.size()
            << " to host using pinned memory.  Falling back to unpinned "
               "memory, which may be slow.";
        fallback_buffers.push_back(std::make_unique<char[]>(src.size()));
        h2d_memcpy_dsts.push_back(
            reinterpret_cast<int32_t*>(fallback_buffers.back().get()));
      }
    }
  }

  // Copy into the h2d_memcpy_dsts.
  for (int i = 0; i < copies.size(); i++) {
    se::DeviceMemoryBase src = copies[i].first;
    void* dst = h2d_memcpy_dsts[i];
    stream->ThenMemcpy(dst, src, src.size());
  }

  // Wait for all the async copies to complete, then write into device_shape.
  TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
  for (int i = 0; i < copies.size(); i++) {
    Shape* dst_shape = copies[i].second;
    int32_t* dst = h2d_memcpy_dsts[i];
    for (int64_t j = 0; j < dst_shape->rank(); j++) {
      dst_shape->mutable_dimensions()[j] = dst[j];
    }
  }

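  // Every dynamic dimension now has a concrete extent, so mark the shape
  // static and verify it is compatible with the original dynamic shape.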
  device_shape->clear_dynamic_dimensions();
  TF_RET_CHECK(ShapeUtil::DynamicShapeIsCompatible(*device_shape,
                                                   original_device_shape));
  return OkStatus();
}

}  // namespace gpu
}  // namespace xla

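// Factory functions registered with the TransferManager registry below.  The
// pointer size is taken from the pointer width of each backend's LLVM data
// layout.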
static std::unique_ptr<xla::TransferManager> CreateNVPTXTransferManager() {
  return std::make_unique<xla::gpu::GpuTransferManager>(
      /*id=*/stream_executor::cuda::kCudaPlatformId,
      /*pointer_size=*/llvm::DataLayout(xla::gpu::nvptx::DataLayout())
          .getPointerSize(0 /* default address space */));
}

static std::unique_ptr<xla::TransferManager> CreateAMDGPUTransferManager() {
  return std::make_unique<xla::gpu::GpuTransferManager>(
      /*id=*/stream_executor::rocm::kROCmPlatformId,
      /*pointer_size=*/llvm::DataLayout(xla::gpu::amdgpu::DataLayout())
          .getPointerSize(0 /* default address space */));
}

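// Registers the GPU transfer manager for both the CUDA and ROCm platforms at
// static-initialization time.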
static bool InitModule() {
  xla::TransferManager::RegisterTransferManager(
      stream_executor::cuda::kCudaPlatformId, &CreateNVPTXTransferManager);
  xla::TransferManager::RegisterTransferManager(
      stream_executor::rocm::kROCmPlatformId, &CreateAMDGPUTransferManager);
  return true;
}
static bool module_initialized = InitModule();