1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // Creates some GPU activity to test functionalities of gpuperfcounter/gputrace.
17 #include "tensorflow/core/profiler/backends/gpu/cuda_test.h"
18
19 #if GOOGLE_CUDA
20 #include <stdio.h>
21
22 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
23 #include "third_party/gpus/cuda/include/driver_types.h"
24 #endif
25
26 #include "tensorflow/core/platform/test.h"
27
28 namespace tensorflow {
29 namespace profiler {
30 namespace test {
31
32 #if GOOGLE_CUDA
namespace {

// Trivial kernel: each thread emits one line via device-side printf
// (debug/test-only mechanism; serialized and slow by design).
__global__ void simple_print() { printf("hello, world!\n"); }

// Kernel with no body; generates launch activity without any work.
__global__ void empty() {}

// Doubles the integer stored at `addr`; used to force a device-side
// memory access (e.g. to trigger a unified-memory page migration).
__global__ void access(int *addr) { *addr *= 2; }

// Scratch device buffer shared by the MemCopy* helpers below.
unsigned *g_device_copy;

// One-word device buffers on GPU 0 and GPU 1 for the peer-to-peer helpers.
unsigned *gpu0_buf, *gpu1_buf;

}  // namespace
49 #endif // GOOGLE_CUDA
50
// Launches `iters` single-thread printf kernels on the default stream to
// create observable GPU activity. Fails the enclosing test when the build
// has no CUDA support.
void PrintfKernel(int iters) {
#if GOOGLE_CUDA
  int launched = 0;
  while (launched < iters) {
    simple_print<<<1, 1>>>();
    ++launched;
  }
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
60
// Launches `iters` empty single-thread kernels on the default stream; the
// launches themselves are the activity being profiled. Fails the enclosing
// test when the build has no CUDA support.
void EmptyKernel(int iters) {
#if GOOGLE_CUDA
  int launched = 0;
  while (launched < iters) {
    empty<<<1, 1>>>();
    ++launched;
  }
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
70
// Launches a single-thread kernel that doubles the value at `addr`.
// `addr` must be device-accessible memory (the caller below passes
// cudaMallocManaged memory). Fails the test when built without CUDA.
void AccessKernel(int *addr) {
#if GOOGLE_CUDA
  access<<<dim3(1), dim3(1)>>>(addr);
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
78
// Blocks the host until all previously launched device work completes.
// The returned status is deliberately discarded, matching the unchecked
// CUDA-call style used throughout this activity-generation file.
void Synchronize() {
#if GOOGLE_CUDA
  (void)cudaDeviceSynchronize();
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
86
// Generates one host-to-device and one device-to-host unified-memory page
// migration for the profiler to observe, then verifies the kernel's result.
// Fails the enclosing test when built without CUDA.
void UnifiedMemoryHtoDAndDtoH() {
#if GOOGLE_CUDA
  int *addr = nullptr;
  cudaMallocManaged(reinterpret_cast<void **>(&addr), sizeof(int));
  // The page is now in host memory.
  *addr = 1;
  // The kernel wants to access the page. HtoD transfer happens.
  AccessKernel(addr);
  Synchronize();
  // The page is now in device memory. CPU wants to access the page. DtoH
  // transfer happens.
  EXPECT_EQ(*addr, 2);
  // Fix: release the managed allocation; the original leaked it on every
  // call, unlike the deliberately reused global buffers elsewhere in this
  // file.
  cudaFree(addr);
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
103
// Generates a synchronous host-to-device memcpy for the profiler to record.
// Allocates a fresh one-word device buffer into the shared `g_device_copy`
// global (never freed; this file exists only to create GPU activity).
void MemCopyH2D() {
#if GOOGLE_CUDA
  const unsigned host_val = 0x12345678;
  cudaMalloc(reinterpret_cast<void **>(&g_device_copy), sizeof(unsigned));
  cudaMemcpy(g_device_copy, &host_val, sizeof(host_val),
             cudaMemcpyHostToDevice);
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
114
// Generates an asynchronous host-to-device memcpy record on the default
// stream. NOTE(review): `host_val` is pageable stack memory, so the runtime
// stages the copy and the call behaves synchronously with respect to the
// host — presumably fine since the goal is only to emit a memcpy record;
// confirm if true async behavior is ever required here.
void MemCopyH2D_Async() {
#if GOOGLE_CUDA
  const unsigned host_val = 0x12345678;
  cudaMalloc(reinterpret_cast<void **>(&g_device_copy), sizeof(unsigned));
  cudaMemcpyAsync(g_device_copy, &host_val, sizeof(host_val),
                  cudaMemcpyHostToDevice);
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
125
// Generates a synchronous device-to-host memcpy for the profiler to record.
// NOTE(review): the freshly allocated device word is copied without being
// written first, so `host_val` receives uninitialized data — apparently
// intentional, since only the transfer itself matters for the trace.
void MemCopyD2H() {
#if GOOGLE_CUDA
  unsigned host_val = 0;
  cudaMalloc(reinterpret_cast<void **>(&g_device_copy), sizeof(unsigned));
  cudaMemcpy(&host_val, g_device_copy, sizeof(host_val),
             cudaMemcpyDeviceToHost);
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
136
namespace {

// Allocates a one-word buffer on each of GPU 0 and GPU 1 (into gpu0_buf and
// gpu1_buf) and enables peer access in both directions. Leaves device 1 as
// the current device on return. Assumes at least two GPUs are present;
// CUDA call results are not checked, matching the rest of this file.
void P2PMemcpyHelper() {
#if GOOGLE_CUDA
  unsigned **bufs[2] = {&gpu0_buf, &gpu1_buf};
  for (int dev = 0; dev < 2; ++dev) {
    cudaSetDevice(dev);
    cudaMalloc(reinterpret_cast<void **>(bufs[dev]), sizeof(unsigned));
    cudaDeviceEnablePeerAccess(/*peerDevice=*/1 - dev, /*flags=*/0);
  }
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}

}  // namespace
154
// Returns true iff GPU 0 and GPU 1 can access each other's memory peer to
// peer in both directions; returns false when built without CUDA.
bool MemCopyP2PAvailable() {
#if GOOGLE_CUDA
  int can_access_01 = 0;
  cudaDeviceCanAccessPeer(&can_access_01, /*device=*/0, /*peerDevice=*/1);
  int can_access_10 = 0;
  // Fix: the 1->0 query previously wrote its result into &can_access_01,
  // leaving can_access_10 permanently 0 and this function always false.
  cudaDeviceCanAccessPeer(&can_access_10, /*device=*/1, /*peerDevice=*/0);
  return can_access_01 && can_access_10;
#else
  return false;
#endif
}
166
// Generates an implicit peer-to-peer copy from GPU 0 to GPU 1:
// cudaMemcpyDefault lets the runtime infer the direction from the two
// device pointers set up by P2PMemcpyHelper.
void MemCopyP2PImplicit() {
#if GOOGLE_CUDA
  P2PMemcpyHelper();
  cudaMemcpy(gpu1_buf, gpu0_buf, sizeof(*gpu1_buf), cudaMemcpyDefault);
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
175
// Generates an explicit peer-to-peer copy from GPU 0 to GPU 1 via
// cudaMemcpyPeer, which names both devices directly instead of relying on
// pointer-based direction inference.
void MemCopyP2PExplicit() {
#if GOOGLE_CUDA
  P2PMemcpyHelper();
  cudaMemcpyPeer(/*dst=*/gpu1_buf, /*dstDevice=*/1, /*src=*/gpu0_buf,
                 /*srcDevice=*/0, sizeof(unsigned));
#else
  GTEST_FAIL() << "Build with --config=cuda";
#endif
}
185
186 } // namespace test
187 } // namespace profiler
188 } // namespace tensorflow
189