/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_util.h"

#include <algorithm>
#include <cmath>

#include "tensorflow/core/common_runtime/copy_tensor.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/tensor_coding.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/util/util.h"

// IMPLEMENTATION NOTE:
//
// 1. Within this module, we intentionally LOG(FATAL) if any stream
//    involved in memcpy becomes !stream->ok(), because the TF process
//    today (1/2016) cannot properly recover from such an error.
//
// 2. When a 0-size tensor is being copied, we should not schedule a
//    ThenMemcpy since there are no bytes to move. However, we must still
//    ensure causal ordering by arranging for the copy-done callback to
//    happen after all activities scheduled on the given stream have
//    finished.

// If this needs to be runtime configurable, consider adding options to
// ConfigProto.
const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
extern bool FLAGS_brain_gpu_record_mem_types;

namespace tensorflow {

using se::DeviceMemoryBase;
using se::Stream;

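// Validates that a copy between `src` and `dst` is possible on `device`, and
// on success returns the device's GpuDeviceInfo in `*dev_info` and its main
// compute stream in `*stream`. `dst` may be null when only the source side
// needs validation (e.g. SetProtoFromGPU).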
Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
                   const Tensor* dst,
                   const DeviceBase::GpuDeviceInfo** dev_info,
                   se::Stream** stream) {
  if (device == nullptr) {
    return errors::Internal("Unexpected null device.");
  }
  auto di = device->tensorflow_gpu_device_info();
  if (di == nullptr) {
    return errors::Internal("Unexpected null device info.");
  }
  *dev_info = di;
  if (ctx == nullptr) {
    return errors::Internal("Unexpected null device context.");
  }
  auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream();
  if (gs == nullptr) {
    return errors::Internal("No gpu stream is available.");
  }
  *stream = gs;
  if (dst != nullptr) {
    if (src.dtype() != dst->dtype()) {
      return errors::Internal("Can't copy a tensor of ",
                              DataTypeString(src.dtype()),
                              " into a tensor of ",
                              DataTypeString(dst->dtype()));
    }
    if (src.TotalBytes() != dst->TotalBytes()) {
      return errors::Internal("Can't copy ", src.TotalBytes(),
                              " bytes of a tensor into another with ",
                              dst->TotalBytes(), " bytes buffer.");
    }
    if ((src.TotalBytes() > 0) && !src.IsInitialized()) {
      return errors::Internal("Src tensor is not initialized.");
    }
    if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) {
      return errors::Internal("Dst tensor is not initialized.");
    }
  }
  if (!DMAHelper::CanUseDMA(&src)) {
    return errors::Internal("GPU copy from non-DMA ",
                            DataTypeString(src.dtype()), " tensor");
  }
  return Status::OK();
}

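// Returns the base address of the tensor's backing buffer (const and
// non-const variants).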
void* GetBase(const Tensor* src) {
  return const_cast<void*>(DMAHelper::base(src));
}

void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }

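// Fills `proto` from a GPU-resident `tensor` asynchronously: the dtype and
// shape are set immediately, the tensor contents are DMAed into a pinned
// host staging buffer on the device-to-host stream, and `done` is invoked
// once the bytes have been moved into the proto.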
/*static*/
void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
                              const DeviceContext* device_context,
                              TensorProto* proto, bool is_dead,
                              StatusCallback done) {
  VLOG(1) << "SetProtoFromGPU device_context " << device_context;
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  se::Stream* send_stream = nullptr;
  Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info,
                         &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  auto send_device_to_host_stream =
      static_cast<const GPUDeviceContext*>(device_context)
          ->device_to_host_stream();
  if (send_device_to_host_stream == nullptr) {
    done(errors::Internal("No send gpu copy-out-stream is available."));
    return;
  }
  // Wait for the sender's main stream to make sure the data are available.
  send_device_to_host_stream->ThenWaitFor(send_stream);

  // Tensor values need to be copied from GPU to CPU RAM so that
  // we can build the protobuf response for a RecvTensor RPC.
  // "device context" identifies the stream where the _Send op executed.
  proto->set_dtype(tensor.dtype());
  tensor.shape().AsProto(proto->mutable_tensor_shape());

  // Prepare a proto with the right data buf size, and DMA the data
  // over from the GPU buffer. Note that 0-size tensors do not have a
  // backing buffer.
  Allocator* alloc = nullptr;
  char* buf = nullptr;
  const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes();
  if (total_bytes > 0) {
    tracing::ScopedAnnotation annotation("SetProtoFromGPU");
    alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0);
    buf = alloc->Allocate<char>(total_bytes);
    if (LogMemory::IsEnabled()) {
      LogMemory::RecordRawAllocation("SetProtoFromGPU",
                                     LogMemory::PROTO_BUFFER_STEP_ID,
                                     total_bytes, buf, alloc);
    }
    void* src_ptr = GetBase(&tensor);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    send_device_to_host_stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes);
  }
  // Use of tensor may outlive stack scope, so keep a ref.
  TensorReference tensor_ref(tensor);
  dev_info->event_mgr->ThenExecute(
      send_device_to_host_stream,
      [send_device_to_host_stream, done, proto, buf, total_bytes, alloc,
       tensor_ref]() {
        if (!send_device_to_host_stream->ok()) {
          LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
        }
        tensor_ref.Unref();
        if (total_bytes > 0) {
          port::CopyFromArray(proto->mutable_tensor_content(), buf,
                              total_bytes);
          if (LogMemory::IsEnabled()) {
            LogMemory::RecordRawDeallocation("SetProtoFromGPU",
                                             LogMemory::PROTO_BUFFER_STEP_ID,
                                             buf, alloc, false);
          }
          alloc->Deallocate<char>(buf, total_bytes);
        }
        done(Status::OK());
      });
}

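// Copies `input` on device `src` to `output` on device `dst` using the
// sender's device-to-device stream, and invokes `done` once the memcpy has
// completed.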
// static
void GPUUtil::DeviceToDeviceCopy(
    DeviceContext* send_dev_context, DeviceContext* recv_dev_context,
    Device* src, Device* dst, AllocatorAttributes src_alloc_attr,
    AllocatorAttributes dst_alloc_attr, const Tensor* input, Tensor* output,
    int dev_to_dev_stream_index, StatusCallback done) {
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  se::Stream* send_stream = nullptr;
  Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info,
                         &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }
  auto send_device_to_device_stream =
      static_cast<const GPUDeviceContext*>(send_dev_context)
          ->device_to_device_stream(dev_to_dev_stream_index);
  if (send_device_to_device_stream == nullptr) {
    done(errors::Internal("No send gpu copy-out-stream is available."));
    return;
  }
  // Wait for the main stream on the sender to make sure the result is
  // available.
  send_device_to_device_stream->ThenWaitFor(send_stream);

  const int64 total_bytes = input->TotalBytes();
  if (total_bytes > 0) {
    void* src_ptr = GetBase(input);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    void* dst_ptr = GetBase(output);
    DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
    auto recv_stream =
        static_cast<const GPUDeviceContext*>(recv_dev_context)->stream();
    if (recv_stream == nullptr) {
      done(errors::Internal("No recv gpu stream is available."));
      return;
    }
    // Since we want to use the memory from recv_stream in the
    // send_device_to_device_stream, add a dependency to make sure the memory
    // is truly free.
    // TODO(zhengxq): remove this dependency when we switch to a better way
    // to make sure the memory is free.
    send_device_to_device_stream->ThenWaitFor(recv_stream);

    VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
    send_device_to_device_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr,
                                             total_bytes);
  }

  // Use of input may outlive stack scope, so keep a ref.
  TensorReference input_ref(*input);
  dev_info->event_mgr->ThenExecute(
      send_device_to_device_stream,
      [done, send_device_to_device_stream, input_ref]() {
        input_ref.Unref();
        if (!send_device_to_device_stream->ok()) {
          LOG(FATAL) << "GPU->GPU Memcpy failed";
        }
        done(Status::OK());
      });
  send_dev_context->MaintainLifetimeOnStream(input,
                                             send_device_to_device_stream);
}

static CopyTensor::Registration register_gpu_gpu_copy(
    DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy);

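// Asynchronously copies `gpu_tensor` into the host-resident `cpu_tensor` on
// the device-to-host stream, and invokes `done` when the copy has finished.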
// static
void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
                                 const DeviceContext* device_context,
                                 const Tensor* gpu_tensor, Tensor* cpu_tensor,
                                 StatusCallback done) {
  VLOG(1) << "CopyGPUTensorToCPU";
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  se::Stream* send_stream = nullptr;
  Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
                         &dev_info, &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  auto send_device_to_host_stream =
      static_cast<const GPUDeviceContext*>(device_context)
          ->device_to_host_stream();
  if (send_device_to_host_stream == nullptr) {
    done(errors::Internal("No send gpu copy-out-stream is available."));
    return;
  }
  // Wait for the sender's main stream to make sure the data are available.
  send_device_to_host_stream->ThenWaitFor(send_stream);

  const int64 total_bytes = gpu_tensor->TotalBytes();
  if (total_bytes > 0) {
    void* src_ptr = GetBase(gpu_tensor);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    void* dst_ptr = GetBase(cpu_tensor);
    send_device_to_host_stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes);
  }
  // Use of the input may outlive stack scope, so keep a ref.
  TensorReference input_ref(*gpu_tensor);
  dev_info->event_mgr->ThenExecute(
      send_device_to_host_stream,
      [send_device_to_host_stream, done, input_ref]() {
        if (!send_device_to_host_stream->ok()) {
          LOG(FATAL) << "GPU->CPU Memcpy failed";
        }
        input_ref.Unref();
        done(Status::OK());
      });
}

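// Asynchronously copies the host-resident `cpu_tensor` into `gpu_tensor` on
// the host-to-device stream, and invokes `done` when the copy has finished.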
/* static */
void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                 const DeviceContext* device_context,
                                 Device* gpu_device, Tensor* gpu_tensor,
                                 StatusCallback done) {
  VLOG(1) << "CopyCPUTensorToGPU";
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  se::Stream* recv_stream = nullptr;
  Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
                         &dev_info, &recv_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  auto recv_host_to_device_stream =
      static_cast<const GPUDeviceContext*>(device_context)
          ->host_to_device_stream();
  if (recv_host_to_device_stream == nullptr) {
    done(errors::Internal("No recv gpu copy-in-stream is available."));
    return;
  }
  // Wait for the recv-stream to make sure the buffer is truly available.
  recv_host_to_device_stream->ThenWaitFor(recv_stream);

  const int64 total_bytes = cpu_tensor->TotalBytes();
  // Note that 0-size tensors have no backing buffer.
  if (total_bytes > 0) {
    void* src_ptr = GetBase(cpu_tensor);
    void* dst_ptr = GetBase(gpu_tensor);
    DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
    recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
  }
  // Use of cpu_tensor may outlive stack scope, so keep a ref.
  TensorReference input_ref(*cpu_tensor);
  dev_info->event_mgr->ThenExecute(
      recv_host_to_device_stream,
      [recv_host_to_device_stream, done, input_ref]() {
        input_ref.Unref();
        if (!recv_host_to_device_stream->ok()) {
          LOG(FATAL) << "CPU->GPU Memcpy failed";
        }
        done(Status::OK());
      });
}

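// Blocks until all work previously enqueued on the device's main compute
// stream has completed.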
Status GPUUtil::Sync(Device* gpu_device) {
  VLOG(1) << "GPUUtil::Sync";
  auto* dev_info = gpu_device->tensorflow_gpu_device_info();
  if (!dev_info) {
    return errors::Internal("Failed to find dest device GPUDeviceInfo");
  }
  return dev_info->stream->BlockHostUntilDone();
}

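// Blocks until all activity on the device, across all of its streams, has
// completed.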
Status GPUUtil::SyncAll(Device* gpu_device) {
  VLOG(1) << "GPUUtil::SyncAll";
  auto* dev_info = gpu_device->tensorflow_gpu_device_info();
  if (!dev_info) {
    return errors::Internal("Failed to find dest device GPUDeviceInfo");
  }
  if (!dev_info->stream->parent()->SynchronizeAllActivity() ||
      !dev_info->stream->ok()) {
    return errors::Internal("GPU sync failed");
  }
  return Status::OK();
}

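// Returns a debug string with the tensor's base pointer and a dump of up to
// FLAGS_brain_gpu_util_debug_string_maxlen leading bytes, copying them from
// device memory first when the tensor lives on a GPU.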
string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
  string ret;
  CHECK(tensor);
  const int64 num_bytes = std::min<int64>(
      FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes());
  void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr;
  strings::Appendf(&ret, "%p:", ptr);
  if (num_bytes > 0) {
    auto* dev_info = device->tensorflow_gpu_device_info();
    if (!dev_info) {
      strings::StrAppend(
          &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes));
    } else {
      string buf;
      buf.resize(num_bytes);
      DeviceMemoryBase gpu_ptr(ptr, num_bytes);
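      // The returned status of the synchronous copy is ignored; this is
      // best-effort debug output.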
      auto s = dev_info->stream->parent()->SynchronousMemcpyD2H(
          gpu_ptr, num_bytes, gtl::string_as_array(&buf));
      strings::StrAppend(&ret,
                         PrintMemory(gtl::string_as_array(&buf), num_bytes));
    }
  }
  return ret;
}

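// Computes a checksum of a GPU-resident tensor by first copying it to host
// memory, blocking until the copy completes.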
// TODO(pbar) Checksum is called from places without a valid device context.
uint64 GPUUtil::Checksum(Device* gpu_device,
                         const DeviceContext* device_context,
                         const Tensor& tensor) {
  Tensor copy(tensor.dtype(), tensor.shape());
  Status s;
  Notification n;
  CopyGPUTensorToCPU(gpu_device, device_context, &tensor, &copy,
                     [&s, &n](Status status) {
                       s.Update(status);
                       n.Notify();
                     });
  n.WaitForNotification();
  CHECK(s.ok()) << s;
  return Checksum(copy);
}

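// Computes a checksum over the raw bytes of a host-resident tensor. The
// buffer is also reinterpreted as floats, and the function CHECK-fails if
// any of them is NaN.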
uint64 GPUUtil::Checksum(const Tensor& tensor) {
  const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor));
  size_t num_bytes = tensor.TotalBytes();
  size_t num_floats = num_bytes / sizeof(float);
  for (size_t i = 0; i < num_floats; ++i) {
    CHECK(!std::isnan(fptr[i])) << " i " << i;
  }
  // TODO(tucker): consider using crc32c instead.
  return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)),
                tensor.TotalBytes(), 0);
}

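// Enqueues a device-to-device memcpy between two tensors on the same GPU on
// the device's main compute stream. Note that `done` is invoked as soon as
// the copy has been enqueued, not when it completes.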
// static
void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device,
                                     const DeviceContext* device_context,
                                     const Tensor* src_gpu_tensor,
                                     Tensor* dst_gpu_tensor,
                                     StatusCallback done) {
  VLOG(1) << "CopyGPUTensorToSameGPU";
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  se::Stream* send_stream = nullptr;
  Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor,
                         dst_gpu_tensor, &dev_info, &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  const int64 total_bytes = src_gpu_tensor->TotalBytes();
  if (total_bytes > 0) {
    void* src_ptr = GetBase(src_gpu_tensor);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    void* dst_ptr = GetBase(dst_gpu_tensor);
    DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
    send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
  }

  done(Status::OK());
}

}  // namespace tensorflow