1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
17
18 #include "tensorflow/core/platform/logging.h"
19
20 #if GOOGLE_CUDA
21 #if GOOGLE_TENSORRT
22 #include "cuda/include/cuda_runtime_api.h"
23 #endif // GOOGLE_TENSORRT
24 #endif // GOOGLE_CUDA
25
26 namespace tensorflow {
27 namespace tensorrt {
28
// std::align is not supported, so this method mimics its behavior.
30 //
31 // NOTE(aaroey): according to the TensorRT API,
32 // nvinfer1::IGpuAllocator::allocate() uses uint64_t type for size and alignment
33 // parameters, so here we use the same type to make it compatible.
Align(uint64_t alignment,uint64_t size,void * & ptr,uint64_t & space)34 void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
35 QCHECK_GT(alignment, 0ul) << "alignment must be greater than 0.";
36 QCHECK_EQ(0, alignment & (alignment - 1)) << "Alignment must be power of 2.";
37 QCHECK_GT(size, 0ul) << "size must be greater than 0.";
38 QCHECK(ptr) << "ptr must not be nullptr.";
39 QCHECK_GT(space, 0ul) << "space must be greater than 0.";
40 const uintptr_t ptr_val = reinterpret_cast<uintptr_t>(ptr);
41 QCHECK_GE(ptr_val + space, ptr_val) << "Provided space overflows.";
42
43 if (size > space) return nullptr;
44 const uintptr_t aligned_ptr_val = ((ptr_val + alignment - 1) & -alignment);
45 if (aligned_ptr_val > ptr_val + space - size) return nullptr;
46 ptr = reinterpret_cast<void*>(aligned_ptr_val);
47 const uintptr_t diff = aligned_ptr_val - ptr_val;
48 space -= diff;
49 return ptr;
50 }
51
52 } // namespace tensorrt
53 } // namespace tensorflow
54
55 #if GOOGLE_CUDA
56 #if GOOGLE_TENSORRT
57
58 namespace tensorflow {
59 namespace tensorrt {
60
allocate(uint64_t size,uint64_t alignment,uint32_t flags)61 void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
62 uint32_t flags) {
63 assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
64 void* memory;
65 cudaMalloc(&memory, size);
66 return memory;
67 }
68
free(void * memory)69 void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
70
allocate(uint64_t size,uint64_t alignment,uint32_t flags)71 void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
72 uint32_t flags) {
73 if (size == 0) return nullptr;
74 // WAR for allocator alignment requirement. Certain cuda API calls require GPU
75 // memory with alignment to cudaDeviceProp::textureAlignment.
76 // See issue #20856
77 alignment = 512;
78 assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
79 uint64_t total_size = size + alignment;
80 // TODO(aaroey): AllocateRaw takes size_t size as input, so it'll produce
81 // unexpected result when TRT tries to allocate more bytes than size_t can
82 // carry. Fix this.
83 void* mem = allocator_->AllocateRaw(alignment, total_size);
84 if (!mem) return nullptr;
85
86 void* alloc_mem = mem;
87 QCHECK(Align(alignment, size, mem, total_size));
88 if (mem != alloc_mem) {
89 QCHECK(mem_map_.insert({mem, alloc_mem}).second);
90 }
91 VLOG(2) << "Allocated " << total_size << " bytes memory @" << alloc_mem
92 << "; aligned to " << size << " bytes @" << mem << " with alignment "
93 << alignment;
94 return mem;
95 }
96
TRTDeviceAllocator(Allocator * allocator)97 TRTDeviceAllocator::TRTDeviceAllocator(Allocator* allocator)
98 : allocator_(allocator) {
99 VLOG(1) << "Using " << allocator->Name() << " allocator from TensorFlow";
100 }
101
free(void * memory)102 void TRTDeviceAllocator::free(void* memory) {
103 VLOG(2) << "Deallocating @ " << memory;
104 // allocated memory adjusted for alignment, restore the original pointer
105 if (memory) {
106 auto alloc_mem = mem_map_.find(memory);
107 if (alloc_mem != mem_map_.end()) {
108 memory = alloc_mem->second;
109 mem_map_.erase(alloc_mem->first);
110 }
111 allocator_->DeallocateRaw(memory);
112 }
113 }
114
115 } // namespace tensorrt
116 } // namespace tensorflow
117
118 #endif // GOOGLE_TENSORRT
119 #endif // GOOGLE_CUDA
120