/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"

#include <cmath>
#include <cstddef>
#include <vector>

#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"

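// Every allocation handed out by GPUDebugAllocator is bracketed by two guard
// regions of MASK_WORDS 64-bit words each; MASK_BYTES is the byte size of one
// such region.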
#define MASK_WORDS 2
#define MASK_BYTES (MASK_WORDS * sizeof(int64))

namespace tensorflow {
namespace {

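// Returns a heap-allocated mask of MASK_WORDS int64 words, each set to
// `word`. The masks live for the lifetime of the process and are never freed.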
int64* NewMask(int64 word) {
  int64* m = new int64[MASK_WORDS];
  for (int i = 0; i < MASK_WORDS; ++i) {
    m[i] = word;
  }
  return m;
}

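// Distinct byte patterns for the header and footer guards, so a clobbered
// header can be told apart from a clobbered footer in the logs.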
int64* before_mask = NewMask(0xabababababababab);
int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);

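// Copies MASK_BYTES from the device address `ptr` back to the host and
// compares them word-by-word against the expected host-side mask, logging
// every mismatching word. Returns true iff the guard region is intact.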
bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  int64 tmp[MASK_WORDS];

  Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp);
  if (!result.ok()) {
    LOG(FATAL) << "Could not copy debug mask, " << result;
  }

  bool ok = true;
  for (int i = 0; i < MASK_WORDS; ++i) {
    // Track each word's result separately so only genuinely mismatching
    // words are logged, not every word after the first failure.
    bool word_ok = (mask[i] == tmp[i]);
    ok &= word_ok;
    if (!word_ok) {
      LOG(ERROR) << "i=" << i
                 << " mask=" << reinterpret_cast<const void*>(mask[i])
                 << " field=" << reinterpret_cast<const void*>(tmp[i]);
    }
  }

  return ok;
}

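// Writes the host-side mask into the MASK_BYTES of device memory at `ptr`,
// stamping a fresh guard region.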
void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr);
  if (!result.ok()) {
    LOG(FATAL) << "Could not copy debug mask, " << result;
  }
}

}  // namespace

// -----------------------------------------------------------------------------
// GPUDebugAllocator
// -----------------------------------------------------------------------------
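// GPUDebugAllocator wraps another Allocator and pads every allocation with a
// guard mask on each side. The masks are re-checked on deallocation (and on
// demand via CheckHeader/CheckFooter), so out-of-bounds writes are caught at
// free time rather than silently corrupting a neighboring allocation.
//
// Illustrative use (a sketch; in practice the GPU device factory constructs
// and owns these allocators):
//
//   Allocator* base = ...;  // e.g. a BFC GPU allocator
//   GPUDebugAllocator debug(base, platform_gpu_id);  // takes ownership
//   void* p = debug.AllocateRaw(/*alignment=*/64, /*num_bytes=*/1024);
//   ...
//   debug.DeallocateRaw(p);  // CHECK-fails if either guard was overwritten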
GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
                                     PlatformGpuId platform_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                           platform_gpu_id)
                     .ValueOrDie();
}

GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }

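// Extends the request by 2 * MASK_BYTES and stamps a guard mask immediately
// before and after the region returned to the caller:
//
//   [ before_mask | num_bytes of user data ... | after_mask ]
//                 ^-- returned pointer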
void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  num_bytes += (2 * MASK_BYTES);
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Return the pointer after the header.
  void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;

  // Write the header at allocated_ptr.
  InitMask(stream_exec_, allocated_ptr, before_mask);

  // Write the footer at the end.
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  InitMask(stream_exec_,
           static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES,
           after_mask);
  return rv;
}

void GPUDebugAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";

    // Backtrack to the beginning of the header.
    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
  }
  // Deallocate the memory.
  base_allocator_->DeallocateRaw(ptr);
}

bool GPUDebugAllocator::TracksAllocationSizes() const { return true; }

size_t GPUDebugAllocator::RequestedSize(const void* ptr) const {
  auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) -
                                                 MASK_BYTES);
  return req_size - 2 * MASK_BYTES;
}

size_t GPUDebugAllocator::AllocatedSize(const void* ptr) const {
  return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) -
                                        MASK_BYTES);
}

int64 GPUDebugAllocator::AllocationId(const void* ptr) const {
  return base_allocator_->AllocationId(static_cast<const char*>(ptr) -
                                       MASK_BYTES);
}

absl::optional<AllocatorStats> GPUDebugAllocator::GetStats() {
  return base_allocator_->GetStats();
}

void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }

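// CheckHeader/CheckFooter re-read the guard masks around a live allocation
// from the device, so callers can probe for corruption at any point during
// the buffer's lifetime, not just at deallocation.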
bool GPUDebugAllocator::CheckHeader(void* ptr) {
  return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES,
                   before_mask);
}

bool GPUDebugAllocator::CheckFooter(void* ptr) {
  char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES;
  size_t req_size = base_allocator_->RequestedSize(original_ptr);
  return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES,
                   after_mask);
}

// -----------------------------------------------------------------------------
// GPUNanResetAllocator
// -----------------------------------------------------------------------------
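// GPUNanResetAllocator wraps another Allocator and fills each buffer with
// quiet NaNs both when it is handed out and again when it is returned.
// Computations that read memory they never wrote (uninitialized buffers or
// use-after-free) then tend to surface as NaNs in their outputs instead of
// producing plausible-looking garbage.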
GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
                                           PlatformGpuId platform_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                           platform_gpu_id)
                     .ValueOrDie();
}

GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }

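// Note the fill interprets the buffer as floats regardless of the dtype it
// will eventually hold; the host staging vector rounds req_size up to a whole
// number of floats, while the copy itself transfers exactly req_size bytes.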
void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Initialize the buffer to NaNs.
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                          std::nanf(""));
  se::DeviceMemory<float> nan_ptr{
      se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};

  Status result =
      stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
  if (!result.ok()) {
    LOG(ERROR) << "Could not initialize to NaNs, " << result;
  }

  return allocated_ptr;
}

void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    // Reset the buffer to NaNs before returning it, so stale reads of freed
    // memory are conspicuous.
    size_t req_size = base_allocator_->RequestedSize(ptr);
    std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                            std::nanf(""));
    se::DeviceMemory<float> nan_ptr{
        se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
    Status result =
        stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
    if (!result.ok()) {
      LOG(ERROR) << "Could not reset to NaNs, " << result;
    }
  }

  // Deallocate the memory.
  base_allocator_->DeallocateRaw(ptr);
}

size_t GPUNanResetAllocator::RequestedSize(const void* ptr) const {
  return base_allocator_->RequestedSize(ptr);
}

size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) const {
  return base_allocator_->AllocatedSize(ptr);
}

absl::optional<AllocatorStats> GPUNanResetAllocator::GetStats() {
  return base_allocator_->GetStats();
}

void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }

}  // namespace tensorflow