/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"

#include <cmath>  // for std::nanf, used by GPUNanResetAllocator below
#include <cstddef>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"

#define MASK_WORDS 2
#define MASK_BYTES (MASK_WORDS * sizeof(int64))

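// Layout of a debug allocation (see GPUDebugAllocator::AllocateRaw below):
// each base allocation is padded with a guard band of MASK_BYTES on both
// sides,
//
//   [ before_mask ][ client-visible bytes ][ after_mask ]
//   ^ base pointer  ^ pointer returned to the client
//
// With MASK_WORDS == 2 and 8-byte int64 words, each guard band is 16 bytes.
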
namespace tensorflow {
namespace {

// Returns a heap-allocated mask of MASK_WORDS copies of `word`.
int64* NewMask(int64 word) {
  int64* m = new int64[MASK_WORDS];
  for (int i = 0; i < MASK_WORDS; ++i) {
    m[i] = word;
  }
  return m;
}

// Guard patterns: 0xab... bytes precede each allocation, 0xcd... bytes
// follow it.
int64* before_mask = NewMask(0xabababababababab);
int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);

// Copies the guard band at `ptr` back to the host and compares it with
// `mask`; returns true iff every word still matches. Both helpers use
// blocking StreamExecutor copies, so they are only suitable for a
// debugging allocator.
bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  int64 tmp[MASK_WORDS];

  Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp);
  if (!result.ok()) {
    LOG(FATAL) << "Could not copy debug mask, " << result;
  }

  bool ok = true;
  for (int i = 0; i < MASK_WORDS; ++i) {
    if (mask[i] != tmp[i]) {
      ok = false;
      LOG(ERROR) << "i=" << i
                 << " mask=" << reinterpret_cast<const void*>(mask[i])
                 << " field=" << reinterpret_cast<const void*>(tmp[i]);
    }
  }

  return ok;
}

// Writes `mask` into the guard band at `ptr` on the device.
void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr);
  if (!result.ok()) {
    LOG(FATAL) << "Could not copy debug mask, " << result;
  }
}

}  // namespace

// -----------------------------------------------------------------------------
// GPUDebugAllocator
// -----------------------------------------------------------------------------
GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
                                     PlatformGpuId platform_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ =
      GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
}

GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }

void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  // Grow the request to make room for the header and footer guard bands.
  num_bytes += (2 * MASK_BYTES);
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Return the pointer after the header.
  void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;

  // Write the header at allocated_ptr.
  InitMask(stream_exec_, allocated_ptr, before_mask);

  // Write the footer at the end.
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  InitMask(stream_exec_,
           static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES,
           after_mask);
  return rv;
}
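
// Worked example (illustrative, assuming the base allocator records the
// request verbatim): with MASK_BYTES == 16, AllocateRaw(64, 100) forwards a
// 132-byte request, so req_size == 132. The before_mask occupies
// allocated_ptr[0, 16), the client sees allocated_ptr + 16, and the
// after_mask occupies allocated_ptr[116, 132), right after the 100 client
// bytes.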

void GPUDebugAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";

    // Backtrack to the beginning of the header.
    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
  }
  // Deallocate the memory.
  base_allocator_->DeallocateRaw(ptr);
}

bool GPUDebugAllocator::TracksAllocationSizes() { return true; }

size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
  auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) -
                                                 MASK_BYTES);
  return req_size - 2 * MASK_BYTES;
}
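
// Continuing the example above: RequestedSize(ptr) backtracks to
// allocated_ptr == ptr - 16, reads the recorded padded size (132), and
// strips both guard bands: 132 - 2 * 16 == 100, the client's original
// request.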

size_t GPUDebugAllocator::AllocatedSize(const void* ptr) {
  return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) -
                                        MASK_BYTES);
}

int64 GPUDebugAllocator::AllocationId(const void* ptr) {
  return base_allocator_->AllocationId(static_cast<const char*>(ptr) -
                                       MASK_BYTES);
}

absl::optional<AllocatorStats> GPUDebugAllocator::GetStats() {
  return base_allocator_->GetStats();
}

void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }

bool GPUDebugAllocator::CheckHeader(void* ptr) {
  return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES,
                   before_mask);
}

bool GPUDebugAllocator::CheckFooter(void* ptr) {
  char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES;
  size_t req_size = base_allocator_->RequestedSize(original_ptr);
  return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES,
                   after_mask);
}
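
// Minimal usage sketch (hypothetical; CreateSomeGpuAllocator is a
// placeholder, not part of this file):
//
//   Allocator* base = CreateSomeGpuAllocator();
//   GPUDebugAllocator debug(base, platform_gpu_id);  // takes ownership
//   void* p = debug.AllocateRaw(64 /*alignment*/, 1024 /*num_bytes*/);
//   // ... launch kernels that write through p ...
//   debug.DeallocateRaw(p);  // CHECK-fails if a guard band was clobbered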

// -----------------------------------------------------------------------------
// GPUNanResetAllocator
// -----------------------------------------------------------------------------
GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
                                           PlatformGpuId platform_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ =
      GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
}

GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }

void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Initialize the buffer to NaNs. The element count rounds req_size up to a
  // whole number of floats, but only req_size bytes are copied to the device.
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                          std::nanf(""));
  se::DeviceMemory<float> nan_ptr{
      se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};

  Status result =
      stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
  if (!result.ok()) {
    LOG(ERROR) << "Could not initialize to NaNs, " << result;
  }

  return allocated_ptr;
}
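
// Illustrative arithmetic: for req_size == 10 bytes the vector holds
// (10 + 4 - 1) / 4 == 3 floats (a ceiling division), while the H2D copy
// moves exactly req_size == 10 bytes, so the last two bytes of the third
// host-side NaN never reach the device.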

void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    // Reset the buffer to NaNs so stale reads after free are obvious.
    size_t req_size = base_allocator_->RequestedSize(ptr);
    std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                            std::nanf(""));
    se::DeviceMemory<float> nan_ptr{
        se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
    Status result =
        stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
    if (!result.ok()) {
      LOG(ERROR) << "Could not reset to NaNs, " << result;
    }
  }

  // Deallocate the memory.
  base_allocator_->DeallocateRaw(ptr);
}

size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
  return base_allocator_->RequestedSize(ptr);
}

size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
  return base_allocator_->AllocatedSize(ptr);
}

absl::optional<AllocatorStats> GPUNanResetAllocator::GetStats() {
  return base_allocator_->GetStats();
}

void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }
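
// Sketch of stacking the two allocators (hypothetical wiring;
// CreateSomeGpuAllocator is a placeholder): the debug allocator guards both
// ends of every buffer, and the NaN-reset allocator poisons the payload so
// that reads of uninitialized or freed memory show up as NaNs downstream.
//
//   Allocator* base = CreateSomeGpuAllocator();
//   Allocator* guarded = new GPUDebugAllocator(base, platform_gpu_id);
//   Allocator* poisoned = new GPUNanResetAllocator(guarded, platform_gpu_id);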

}  // namespace tensorflow