/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"

#include "tensorflow/core/platform/logging.h"

#if GOOGLE_CUDA && GOOGLE_TENSORRT
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT

namespace tensorflow {
namespace tensorrt {

// std::align is not supported, so this method mimics its behavior.
//
// NOTE(aaroey): according to the TensorRT API,
// nvinfer1::IGpuAllocator::allocate() uses the uint64_t type for its size and
// alignment parameters, so we use the same type here for compatibility.
void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
  QCHECK_GT(alignment, 0ul) << "alignment must be greater than 0.";
  QCHECK_EQ(0, alignment & (alignment - 1)) << "Alignment must be power of 2.";
  QCHECK_GT(size, 0ul) << "size must be greater than 0.";
  QCHECK(ptr) << "ptr must not be nullptr.";
  QCHECK_GT(space, 0ul) << "space must be greater than 0.";
  const uintptr_t ptr_val = reinterpret_cast<uintptr_t>(ptr);
  QCHECK_GE(ptr_val + space, ptr_val) << "Provided space overflows.";

  if (size > space) return nullptr;
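  // Round ptr_val up to the next multiple of alignment. Since alignment is a
  // power of 2, -alignment in unsigned arithmetic equals ~(alignment - 1),
  // i.e. a mask that clears the low-order bits.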
  const uintptr_t aligned_ptr_val = ((ptr_val + alignment - 1) & -alignment);
  if (aligned_ptr_val > ptr_val + space - size) return nullptr;
  ptr = reinterpret_cast<void*>(aligned_ptr_val);
  const uintptr_t diff = aligned_ptr_val - ptr_val;
  space -= diff;
  return ptr;
}
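
// A minimal usage sketch (illustrative only, not part of this file's API):
// Align behaves like std::align, bumping a cursor forward inside a raw
// buffer. The buffer, size, and alignment below are arbitrary example values.
//
//   char buffer[64];
//   void* cursor = buffer;
//   uint64_t space = sizeof(buffer);
//   if (void* aligned = Align(/*alignment=*/16, /*size=*/32, cursor, space)) {
//     // `cursor` now points at a 16-byte-aligned address with at least 32
//     // bytes available, and `space` has been reduced by the front padding.
//   }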

}  // namespace tensorrt
}  // namespace tensorflow

#if GOOGLE_CUDA && GOOGLE_TENSORRT

namespace tensorflow {
namespace tensorrt {

void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
                                   uint32_t flags) {
  if (size == 0) return nullptr;
  // Workaround for the allocator alignment requirement: certain CUDA API
  // calls require GPU memory to be aligned to
  // cudaDeviceProp::textureAlignment. See issue #20856.
  alignment = 512;
  assert((alignment & (alignment - 1)) == 0);  // Zero or a power of 2.
  uint64_t total_size = size + alignment;
  // TODO(aaroey): AllocateRaw takes a size_t size as input, so it'll produce
  // an unexpected result when TRT tries to allocate more bytes than size_t
  // can carry. Fix this.
  //
  // Fail immediately if the allocation fails, rather than waiting 10 seconds
  // and failing then anyway.
  // TensorRT 7 can also switch to a different algorithm for a layer if an
  // algorithm uses too much memory. If we don't fail immediately, building
  // the engine can be *very* slow with TensorRT 7 when GPU memory is limited.
  AllocationAttributes attributes;
  attributes.retry_on_failure = false;
  void* mem = allocator_->AllocateRaw(alignment, total_size, attributes);
  if (!mem) return nullptr;

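  // Align within the over-allocated block. If the aligned pointer differs
  // from the one returned by AllocateRaw, record the mapping so free() can
  // recover the original pointer.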
  void* alloc_mem = mem;
  QCHECK(Align(alignment, size, mem, total_size));
  if (mem != alloc_mem) {
    QCHECK(mem_map_.insert({mem, alloc_mem}).second);
  }
  VLOG(2) << "Allocated " << total_size << " bytes of memory @" << alloc_mem
          << "; aligned to " << size << " bytes @" << mem << " with alignment "
          << alignment;
  return mem;
}
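
// Usage sketch (an assumption about a typical TensorRT integration, not code
// from this file): TRTDeviceAllocator implements nvinfer1::IGpuAllocator, so
// TensorRT calls allocate()/free() once the allocator is registered on a
// builder or runtime, e.g.:
//
//   TRTDeviceAllocator trt_allocator(device_allocator);  // Allocator* from TF
//   builder->setGpuAllocator(&trt_allocator);            // nvinfer1::IBuilder*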

TRTDeviceAllocator::TRTDeviceAllocator(Allocator* allocator)
    : allocator_(allocator) {
  VLOG(1) << "Using " << allocator->Name() << " allocator from TensorFlow";
}

void TRTDeviceAllocator::free(void* memory) {
  VLOG(2) << "Deallocating @ " << memory;
  // If the allocation was adjusted for alignment, restore the original
  // pointer before deallocating.
  if (memory) {
    auto alloc_mem = mem_map_.find(memory);
    if (alloc_mem != mem_map_.end()) {
      memory = alloc_mem->second;
      mem_map_.erase(alloc_mem->first);
    }
    allocator_->DeallocateRaw(memory);
  }
}
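
// Round-trip sketch (names and sizes illustrative): a pointer handed out by
// allocate() may have been bumped forward for alignment, and free() looks it
// up in mem_map_ to recover the base address before calling DeallocateRaw():
//
//   void* p = trt_allocator.allocate(/*size=*/1024, /*alignment=*/0,
//                                    /*flags=*/0);
//   // ... TensorRT reads and writes through p ...
//   trt_allocator.free(p);  // DeallocateRaw sees the original base pointer.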

}  // namespace tensorrt
}  // namespace tensorflow

#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT