1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <vector>
21
22 #include "Debug.h"
23 #include "omptargetplugin.h"
24
25 #define TARGET_NAME CUDA
26 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
27
28 // Utility for retrieving and printing CUDA error string.
29 #ifdef OMPTARGET_DEBUG
30 #define CUDA_ERR_STRING(err) \
31 do { \
32 if (getDebugLevel() > 0) { \
33 const char *errStr = nullptr; \
34 CUresult errStr_status = cuGetErrorString(err, &errStr); \
35 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \
36 DP("Unrecognized CUDA error code: %d\n", err); \
37 else if (errStr_status == CUDA_SUCCESS) \
38 DP("CUDA error is: %s\n", errStr); \
39 else { \
40 DP("Unresolved CUDA error code: %d\n", err); \
41 DP("Unsuccessful cuGetErrorString return status: %d\n", \
42 errStr_status); \
43 } \
44 } \
45 } while (false)
46 #else // OMPTARGET_DEBUG
47 #define CUDA_ERR_STRING(err) {}
48 #endif // OMPTARGET_DEBUG
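// Usage sketch (illustrative; mirrors the pattern used throughout this file):
//   CUresult Err = cuMemAlloc(&DevicePtr, Size);
//   if (Err != CUDA_SUCCESS) {
//     DP("Error returned from cuMemAlloc\n");
//     CUDA_ERR_STRING(Err);
//   }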
49
50 #include "../../common/elf_common.c"
51
52 /// Keep entries table per device.
53 struct FuncOrGblEntryTy {
54 __tgt_target_table Table;
55 std::vector<__tgt_offload_entry> Entries;
56 };
57
58 enum ExecutionModeType {
59 SPMD, // constructors, destructors,
60 // combined constructs (`teams distribute parallel for [simd]`)
61 GENERIC, // everything else
62 NONE
63 };
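// Illustrative mapping (typical, not exhaustive): a combined construct such as
//   #pragma omp target teams distribute parallel for
// is usually compiled to an SPMD-mode kernel, while a plain
//   #pragma omp target
// region with a nested `#pragma omp parallel` runs in GENERIC mode, where an
// extra master warp drives the worker threads.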
64
65 /// Use a single entity to encode a kernel and a set of flags.
66 struct KernelTy {
67 CUfunction Func;
68
69 // execution mode of kernel
70 // 0 - SPMD mode (without master warp)
71 // 1 - Generic mode (with master warp)
72 int8_t ExecutionMode;
73
74 /// Maximal number of threads per block for this kernel.
75 int MaxThreadsPerBlock = 0;
76
77 KernelTy(CUfunction _Func, int8_t _ExecutionMode)
78 : Func(_Func), ExecutionMode(_ExecutionMode) {}
79 };
80
81 /// Device environment data
82 /// Keep this manually in sync with the deviceRTL side for now; move it to a
83 /// dedicated header file later.
84 struct omptarget_device_environmentTy {
85 int32_t debug_level;
86 };
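// Sketch of the assumed deviceRTL-side counterpart (the real declaration lives
// in the device runtime; shown here only to illustrate the contract):
//   __device__ omptarget_device_environmentTy omptarget_device_environment;
// loadBinary() below looks this symbol up by name and copies the host-side
// struct into it before any kernel is launched.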
87
88 namespace {
89 bool checkResult(CUresult Err, const char *ErrMsg) {
90 if (Err == CUDA_SUCCESS)
91 return true;
92
93 DP("%s", ErrMsg);
94 CUDA_ERR_STRING(Err);
95 return false;
96 }
97
98 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
99 CUstream Stream) {
100 CUresult Err =
101 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
102
103 if (Err != CUDA_SUCCESS) {
104 DP("Error when copying data from device to device. Pointers: src "
105 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
106 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
107 CUDA_ERR_STRING(Err);
108 return OFFLOAD_FAIL;
109 }
110
111 return OFFLOAD_SUCCESS;
112 }
113
114 // Structure contains per-device data
115 struct DeviceDataTy {
116 /// List that contains all the kernels.
117 std::list<KernelTy> KernelsList;
118
119 std::list<FuncOrGblEntryTy> FuncGblEntries;
120
121 CUcontext Context = nullptr;
122 // Device properties
123 int ThreadsPerBlock = 0;
124 int BlocksPerGrid = 0;
125 int WarpSize = 0;
126 // OpenMP properties
127 int NumTeams = 0;
128 int NumThreads = 0;
129 };
130
131 class StreamManagerTy {
132 int NumberOfDevices;
133 // The initial size of stream pool
134 int EnvNumInitialStreams;
135 // Per-device stream mutex
136 std::vector<std::unique_ptr<std::mutex>> StreamMtx;
137 // Per-device stream Id indicates the next available stream in the pool
138 std::vector<int> NextStreamId;
139 // Per-device stream pool
140 std::vector<std::vector<CUstream>> StreamPool;
141 // Reference to per-device data
142 std::vector<DeviceDataTy> &DeviceData;
143
144 // If there is no CUstream left in the pool, we resize the pool to allocate
145 // more CUstreams. This function must be called with the device mutex held,
146 // and the pool is never resized to a smaller size.
147 void resizeStreamPool(const int DeviceId, const size_t NewSize) {
148 std::vector<CUstream> &Pool = StreamPool[DeviceId];
149 const size_t CurrentSize = Pool.size();
150 assert(NewSize > CurrentSize && "new size is not larger than current size");
151
152 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
153 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
154 // Return early if we cannot switch to the right context, to avoid
155 // creating a bunch of streams that do not correspond to the right
156 // device. Offloading will fail later because the selected CUstream is
157 // nullptr.
158 return;
159 }
160
161 Pool.resize(NewSize, nullptr);
162
163 for (size_t I = CurrentSize; I < NewSize; ++I) {
164 checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
165 "Error returned from cuStreamCreate\n");
166 }
167 }
168
169 public:
170 StreamManagerTy(const int NumberOfDevices,
171 std::vector<DeviceDataTy> &DeviceData)
172 : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
173 DeviceData(DeviceData) {
174 StreamPool.resize(NumberOfDevices);
175 NextStreamId.resize(NumberOfDevices);
176 StreamMtx.resize(NumberOfDevices);
177
178 if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
179 EnvNumInitialStreams = std::stoi(EnvStr);
180
181 // Initialize the next stream id
182 std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
183
184 // Initialize stream mutex
185 for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
186 Ptr = std::make_unique<std::mutex>();
187 }
188
189 ~StreamManagerTy() {
190 // Destroy streams
191 for (int I = 0; I < NumberOfDevices; ++I) {
192 checkResult(cuCtxSetCurrent(DeviceData[I].Context),
193 "Error returned from cuCtxSetCurrent\n");
194
195 for (CUstream &S : StreamPool[I]) {
196 if (S)
197 checkResult(cuStreamDestroy(S),
198 "Error returned from cuStreamDestroy\n");
199 }
200 }
201 }
202
203 // Get a CUstream from the pool. The per-device next stream id always points
204 // to the next available CUstream. That means CUstreams [0, id-1] have been
205 // assigned, and [id, ...] are still available. If there is no CUstream left,
206 // we ask for more CUstreams from the CUDA RT. Each time a CUstream is
207 // assigned, the id increases by one.
208 // xxxxxs+++++++++
209 //      ^
210 //      id
211 // After assignment, the pool becomes the following and s is assigned.
212 // xxxxxs+++++++++
213 //       ^
214 //       id
215 CUstream getStream(const int DeviceId) {
216 const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
217 int &Id = NextStreamId[DeviceId];
218 // No CUstream left in the pool, we need to request from CUDA RT
219 if (Id == StreamPool[DeviceId].size()) {
220 // By default we double the stream pool every time
221 resizeStreamPool(DeviceId, Id * 2);
222 }
223 return StreamPool[DeviceId][Id++];
224 }
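// Growth example (assuming the default of 32 initial streams): once all 32
// streams have been handed out, the next request doubles the pool to 64, then
// 128, and so on; the pool never shrinks.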
225
226 // Return a CUstream back to the pool. As mentioned above, the per-device
227 // next stream id always points to the next available CUstream, so when we
228 // return a CUstream, we first decrease the id and then copy the CUstream
229 // back.
230 // It is worth noting that the order in which streams are returned might
231 // differ from the order in which they were assigned; that is, at some point
232 // there might be two identical CUstreams in the pool.
233 // xxax+a+++++
234 //      ^
235 //      id
236 // However, this does not matter because they are always on the two sides of
237 // id. The left one will eventually be overwritten by another returned
238 // CUstream. Therefore, after several executions, the order of the pool may
239 // differ from its initial state.
240 void returnStream(const int DeviceId, CUstream Stream) {
241 const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
242 int &Id = NextStreamId[DeviceId];
243 assert(Id > 0 && "Wrong stream ID");
244 StreamPool[DeviceId][--Id] = Stream;
245 }
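// Usage sketch (illustrative only): the plugin pairs the two calls around the
// lifetime of an asynchronous operation, e.g.
//   CUstream S = StreamManager->getStream(DeviceId);
//   cuMemcpyHtoDAsync(DstDevPtr, SrcHostPtr, Size, S); // enqueue work on S
//   cuStreamSynchronize(S);                            // wait for completion
//   StreamManager->returnStream(DeviceId, S);          // recycle the stream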
246
247 bool initializeDeviceStreamPool(const int DeviceId) {
248 assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
249
250 resizeStreamPool(DeviceId, EnvNumInitialStreams);
251
252 // Check the size of stream pool
253 if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
254 return false;
255
256 // Check whether each stream is valid
257 for (CUstream &S : StreamPool[DeviceId])
258 if (!S)
259 return false;
260
261 return true;
262 }
263 };
264
265 class DeviceRTLTy {
266 int NumberOfDevices;
267 // OpenMP environment properties
268 int EnvNumTeams;
269 int EnvTeamLimit;
270 // OpenMP requires flags
271 int64_t RequiresFlags;
272
273 static constexpr const int HardTeamLimit = 1U << 16U; // 64k
274 static constexpr const int HardThreadLimit = 1024;
275 static constexpr const int DefaultNumTeams = 128;
276 static constexpr const int DefaultNumThreads = 128;
277
278 std::unique_ptr<StreamManagerTy> StreamManager;
279 std::vector<DeviceDataTy> DeviceData;
280 std::vector<CUmodule> Modules;
281
282 // Record entry point associated with device
283 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
284 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
285 E.Entries.push_back(entry);
286 }
287
288 // Return a pointer to the entry associated with the pointer
289 const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
290 const void *Addr) const {
291 for (const __tgt_offload_entry &Itr :
292 DeviceData[DeviceId].FuncGblEntries.back().Entries)
293 if (Itr.addr == Addr)
294 return &Itr;
295
296 return nullptr;
297 }
298
299 // Return the pointer to the target entries table
300 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
301 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
302
303 if (E.Entries.empty())
304 return nullptr;
305
306 // Update table info according to the entries and return the pointer
307 E.Table.EntriesBegin = E.Entries.data();
308 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
309
310 return &E.Table;
311 }
312
313 // Clear entries table for a device
314 void clearOffloadEntriesTable(const int DeviceId) {
315 DeviceData[DeviceId].FuncGblEntries.emplace_back();
316 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
317 E.Entries.clear();
318 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
319 }
320
321 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
322 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
323
324 if (!AsyncInfoPtr->Queue)
325 AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);
326
327 return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
328 }
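// Note on the intended flow: the first asynchronous operation on a given
// __tgt_async_info lazily binds a pooled CUstream to AsyncInfoPtr->Queue;
// later operations passing the same AsyncInfoPtr reuse that stream, and
// synchronize() below returns it to the pool and clears Queue.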
329
330 public:
331 // This class should not be copied
332 DeviceRTLTy(const DeviceRTLTy &) = delete;
333 DeviceRTLTy(DeviceRTLTy &&) = delete;
334
335 DeviceRTLTy()
336 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
337 RequiresFlags(OMP_REQ_UNDEFINED) {
338
339 DP("Start initializing CUDA\n");
340
341 CUresult Err = cuInit(0);
342 if (!checkResult(Err, "Error returned from cuInit\n")) {
343 return;
344 }
345
346 Err = cuDeviceGetCount(&NumberOfDevices);
347 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
348 return;
349
350 if (NumberOfDevices == 0) {
351 DP("There are no devices supporting CUDA.\n");
352 return;
353 }
354
355 DeviceData.resize(NumberOfDevices);
356
357 // Get environment variables regarding teams
358 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
359 // OMP_TEAM_LIMIT has been set
360 EnvTeamLimit = std::stoi(EnvStr);
361 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
362 }
363 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
364 // OMP_NUM_TEAMS has been set
365 EnvNumTeams = std::stoi(EnvStr);
366 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
367 }
368
369 StreamManager =
370 std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
371 }
372
373 ~DeviceRTLTy() {
374 // Destruct the stream manager first so that its streams are destroyed before their contexts are.
375 StreamManager = nullptr;
376
377 for (CUmodule &M : Modules)
378 // Close module
379 if (M)
380 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
381
382 for (DeviceDataTy &D : DeviceData) {
383 // Destroy context
384 if (D.Context) {
385 checkResult(cuCtxSetCurrent(D.Context),
386 "Error returned from cuCtxSetCurrent\n");
387 CUdevice Device;
388 checkResult(cuCtxGetDevice(&Device),
389 "Error returned from cuCtxGetDevice\n");
390 checkResult(cuDevicePrimaryCtxRelease(Device),
391 "Error returned from cuDevicePrimaryCtxRelease\n");
392 }
393 }
394 }
395
396 // Check whether a given DeviceId is valid
397 bool isValidDeviceId(const int DeviceId) const {
398 return DeviceId >= 0 && DeviceId < NumberOfDevices;
399 }
400
401 int getNumOfDevices() const { return NumberOfDevices; }
402
403 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
404
405 int initDevice(const int DeviceId) {
406 CUdevice Device;
407
408 DP("Getting device %d\n", DeviceId);
409 CUresult Err = cuDeviceGet(&Device, DeviceId);
410 if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
411 return OFFLOAD_FAIL;
412
413 // Query the current flags of the primary context and set its flags if
414 // it is inactive
415 unsigned int FormerPrimaryCtxFlags = 0;
416 int FormerPrimaryCtxIsActive = 0;
417 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
418 &FormerPrimaryCtxIsActive);
419 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
420 return OFFLOAD_FAIL;
421
422 if (FormerPrimaryCtxIsActive) {
423 DP("The primary context is active, no change to its flags\n");
424 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
425 CU_CTX_SCHED_BLOCKING_SYNC)
426 DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
427 } else {
428 DP("The primary context is inactive, set its flags to "
429 "CU_CTX_SCHED_BLOCKING_SYNC\n");
430 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
431 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
432 return OFFLOAD_FAIL;
433 }
434
435 // Retain the per device primary context and save it to use whenever this
436 // device is selected.
437 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
438 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
439 return OFFLOAD_FAIL;
440
441 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
442 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
443 return OFFLOAD_FAIL;
444
445 // Initialize stream pool
446 if (!StreamManager->initializeDeviceStreamPool(DeviceId))
447 return OFFLOAD_FAIL;
448
449 // Query attributes to determine number of threads/block and blocks/grid.
450 int MaxGridDimX;
451 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
452 Device);
453 if (Err != CUDA_SUCCESS) {
454 DP("Error getting max grid dimension, use default value %d\n",
455 DeviceRTLTy::DefaultNumTeams);
456 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
457 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
458 DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
459 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
460 } else {
461 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
462 "at the hard limit\n",
463 MaxGridDimX, DeviceRTLTy::HardTeamLimit);
464 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
465 }
466
467 // We are only exploiting threads along the x axis.
468 int MaxBlockDimX;
469 Err = cuDeviceGetAttribute(&MaxBlockDimX,
470 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
471 if (Err != CUDA_SUCCESS) {
472 DP("Error getting max block dimension, use default value %d\n",
473 DeviceRTLTy::DefaultNumThreads);
474 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
475 } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
476 DP("Using %d CUDA threads per block\n", MaxBlockDimX);
477 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
478 } else {
479 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
480 "capping at the hard limit\n",
481 MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
482 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
483 }
484
485 // Get and set warp size
486 int WarpSize;
487 Err =
488 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
489 if (Err != CUDA_SUCCESS) {
490 DP("Error getting warp size, assume default value 32\n");
491 DeviceData[DeviceId].WarpSize = 32;
492 } else {
493 DP("Using warp size %d\n", WarpSize);
494 DeviceData[DeviceId].WarpSize = WarpSize;
495 }
496
497 // Adjust teams to the env variables
498 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
499 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
500 EnvTeamLimit);
501 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
502 }
503
504 INFO(DeviceId,
505 "Device supports up to %d CUDA blocks and %d threads with a "
506 "warp size of %d\n",
507 DeviceData[DeviceId].BlocksPerGrid,
508 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
509
510 // Set default number of teams
511 if (EnvNumTeams > 0) {
512 DP("Default number of teams set according to environment %d\n",
513 EnvNumTeams);
514 DeviceData[DeviceId].NumTeams = EnvNumTeams;
515 } else {
516 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
517 DP("Default number of teams set according to library's default %d\n",
518 DeviceRTLTy::DefaultNumTeams);
519 }
520
521 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
522 DP("Default number of teams exceeds device limit, capping at %d\n",
523 DeviceData[DeviceId].BlocksPerGrid);
524 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
525 }
526
527 // Set default number of threads
528 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
529 DP("Default number of threads set according to library's default %d\n",
530 DeviceRTLTy::DefaultNumThreads);
531 if (DeviceData[DeviceId].NumThreads >
532 DeviceData[DeviceId].ThreadsPerBlock) {
533 DP("Default number of threads exceeds device limit, capping at %d\n",
534 DeviceData[DeviceId].ThreadsPerBlock);
535 DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
536 }
537
538 return OFFLOAD_SUCCESS;
539 }
540
541 __tgt_target_table *loadBinary(const int DeviceId,
542 const __tgt_device_image *Image) {
543 // Set the context we are using
544 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
545 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
546 return nullptr;
547
548 // Clear the offload table as we are going to create a new one.
549 clearOffloadEntriesTable(DeviceId);
550
551 // Create the module and extract the function pointers.
552 CUmodule Module;
553 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
554 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
555 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
556 return nullptr;
557
558 DP("CUDA module successfully loaded!\n");
559
560 Modules.push_back(Module);
561
562 // Find the symbols in the module by name.
563 const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
564 const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
565
566 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
567 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
568 if (!E->addr) {
569 // We return nullptr when something like this happens; the host should
570 // always have something in the address field to uniquely identify the
571 // target region.
572 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
573 return nullptr;
574 }
575
576 if (E->size) {
577 __tgt_offload_entry Entry = *E;
578 CUdeviceptr CUPtr;
579 size_t CUSize;
580 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
581 // We keep this style here because we need the name
582 if (Err != CUDA_SUCCESS) {
583 DP("Loading global '%s' (Failed)\n", E->name);
584 CUDA_ERR_STRING(Err);
585 return nullptr;
586 }
587
588 if (CUSize != E->size) {
589 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
590 CUSize, E->size);
591 return nullptr;
592 }
593
594 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
595 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
596
597 Entry.addr = (void *)(CUPtr);
598
599 // Note: In the current implementation declare target variables
600 // can either be link or to. This means that once unified
601 // memory is activated via the requires directive, the variable
602 // can be used directly from the host in both cases.
603 // TODO: when variables types other than to or link are added,
604 // the below condition should be changed to explicitly
605 // check for to and link variables types:
606 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
607 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
608 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
609 // If unified memory is present any target link or to variables
610 // can access host addresses directly. There is no longer a
611 // need for device copies.
612 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
613 DP("Copy linked variable host address (" DPxMOD
614 ") to device address (" DPxMOD ")\n",
615 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
616 }
617
618 addOffloadEntry(DeviceId, Entry);
619
620 continue;
621 }
622
623 CUfunction Func;
624 Err = cuModuleGetFunction(&Func, Module, E->name);
625 // We keep this style here because we need the name
626 if (Err != CUDA_SUCCESS) {
627 DP("Loading '%s' (Failed)\n", E->name);
628 CUDA_ERR_STRING(Err);
629 return nullptr;
630 }
631
632 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
633 DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
634
635 // default value GENERIC (in case symbol is missing from cubin file)
636 int8_t ExecModeVal = ExecutionModeType::GENERIC;
637 std::string ExecModeNameStr(E->name);
638 ExecModeNameStr += "_exec_mode";
639 const char *ExecModeName = ExecModeNameStr.c_str();
640
641 CUdeviceptr ExecModePtr;
642 size_t CUSize;
643 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
644 if (Err == CUDA_SUCCESS) {
645 if (CUSize != sizeof(int8_t)) {
646 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
647 ExecModeName, CUSize, sizeof(int8_t));
648 return nullptr;
649 }
650
651 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
652 if (Err != CUDA_SUCCESS) {
653 DP("Error when copying data from device to host. Pointers: "
654 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
655 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
656 CUDA_ERR_STRING(Err);
657 return nullptr;
658 }
659
660 if (ExecModeVal < 0 || ExecModeVal > 1) {
661 DP("Error wrong exec_mode value specified in cubin file: %d\n",
662 ExecModeVal);
663 return nullptr;
664 }
665 } else {
666 DP("Loading global exec_mode '%s' - symbol missing, using default "
667 "value GENERIC (1)\n",
668 ExecModeName);
669 CUDA_ERR_STRING(Err);
670 }
671
672 KernelsList.emplace_back(Func, ExecModeVal);
673
674 __tgt_offload_entry Entry = *E;
675 Entry.addr = &KernelsList.back();
676 addOffloadEntry(DeviceId, Entry);
677 }
678
679 // send device environment data to the device
680 {
681 omptarget_device_environmentTy DeviceEnv{0};
682
683 #ifdef OMPTARGET_DEBUG
684 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
685 DeviceEnv.debug_level = std::stoi(EnvStr);
686 #endif
687
688 const char *DeviceEnvName = "omptarget_device_environment";
689 CUdeviceptr DeviceEnvPtr;
690 size_t CUSize;
691
692 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
693 if (Err == CUDA_SUCCESS) {
694 if (CUSize != sizeof(DeviceEnv)) {
695 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
696 DeviceEnvName, CUSize, sizeof(DeviceEnv));
697 CUDA_ERR_STRING(Err);
698 return nullptr;
699 }
700
701 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
702 if (Err != CUDA_SUCCESS) {
703 DP("Error when copying data from host to device. Pointers: "
704 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
705 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
706 CUDA_ERR_STRING(Err);
707 return nullptr;
708 }
709
710 DP("Sending global device environment data %zu bytes\n", CUSize);
711 } else {
712 DP("Finding global device environment '%s' - symbol missing.\n",
713 DeviceEnvName);
714 DP("Continuing, assuming this is a device RTL which does not accept an "
715 "environment setting.\n");
716 }
717 }
718
719 return getOffloadEntriesTable(DeviceId);
720 }
721
722 void *dataAlloc(const int DeviceId, const int64_t Size) const {
723 if (Size == 0)
724 return nullptr;
725
726 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
727 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
728 return nullptr;
729
730 CUdeviceptr DevicePtr;
731 Err = cuMemAlloc(&DevicePtr, Size);
732 if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
733 return nullptr;
734
735 return (void *)DevicePtr;
736 }
737
738 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
739 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
740 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
741
742 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
743 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
744 return OFFLOAD_FAIL;
745
746 CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
747
748 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
749 if (Err != CUDA_SUCCESS) {
750 DP("Error when copying data from host to device. Pointers: host = " DPxMOD
751 ", device = " DPxMOD ", size = %" PRId64 "\n",
752 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
753 CUDA_ERR_STRING(Err);
754 return OFFLOAD_FAIL;
755 }
756
757 return OFFLOAD_SUCCESS;
758 }
759
760 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
761 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
762 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
763
764 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
765 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
766 return OFFLOAD_FAIL;
767
768 CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
769
770 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
771 if (Err != CUDA_SUCCESS) {
772 DP("Error when copying data from device to host. Pointers: host = " DPxMOD
773 ", device = " DPxMOD ", size = %" PRId64 "\n",
774 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
775 CUDA_ERR_STRING(Err);
776 return OFFLOAD_FAIL;
777 }
778
779 return OFFLOAD_SUCCESS;
780 }
781
782 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
783 int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
784 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
785
786 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
787 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
788 return OFFLOAD_FAIL;
789
790 CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);
791
792 // If these are two different devices, we try a peer-to-peer copy first
793 if (SrcDevId != DstDevId) {
794 int CanAccessPeer = 0;
795 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
796 if (Err != CUDA_SUCCESS) {
797 DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
798 ", dst = %" PRId32 "\n",
799 SrcDevId, DstDevId);
800 CUDA_ERR_STRING(Err);
801 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
802 }
803
804 if (!CanAccessPeer) {
805 DP("P2P memcpy not supported so falling back to D2D memcpy\n");
806 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
807 }
808
809 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
810 if (Err != CUDA_SUCCESS) {
811 DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
812 ", dst = %" PRId32 "\n",
813 SrcDevId, DstDevId);
814 CUDA_ERR_STRING(Err);
815 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
816 }
817
818 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
819 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
820 Size, Stream);
821 if (Err == CUDA_SUCCESS)
822 return OFFLOAD_SUCCESS;
823
824 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
825 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
826 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
827 CUDA_ERR_STRING(Err);
828 }
829
830 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
831 }
832
833 int dataDelete(const int DeviceId, void *TgtPtr) const {
834 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
835 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
836 return OFFLOAD_FAIL;
837
838 Err = cuMemFree((CUdeviceptr)TgtPtr);
839 if (!checkResult(Err, "Error returned from cuMemFree\n"))
840 return OFFLOAD_FAIL;
841
842 return OFFLOAD_SUCCESS;
843 }
844
845 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
846 ptrdiff_t *TgtOffsets, const int ArgNum,
847 const int TeamNum, const int ThreadLimit,
848 const unsigned int LoopTripCount,
849 __tgt_async_info *AsyncInfo) const {
850 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
851 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
852 return OFFLOAD_FAIL;
853
854 // All args are references.
855 std::vector<void *> Args(ArgNum);
856 std::vector<void *> Ptrs(ArgNum);
857
858 for (int I = 0; I < ArgNum; ++I) {
859 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
860 Args[I] = &Ptrs[I];
861 }
862
863 KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
864
865 int CudaThreadsPerBlock;
866 if (ThreadLimit > 0) {
867 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
868 CudaThreadsPerBlock = ThreadLimit;
869 // Add master warp if necessary
870 if (KernelInfo->ExecutionMode == GENERIC) {
871 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
872 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
873 }
874 } else {
875 DP("Setting CUDA threads per block to default %d\n",
876 DeviceData[DeviceId].NumThreads);
877 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
878 }
879
880 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
881 DP("Threads per block capped at device limit %d\n",
882 DeviceData[DeviceId].ThreadsPerBlock);
883 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
884 }
885
886 if (!KernelInfo->MaxThreadsPerBlock) {
887 Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
888 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
889 KernelInfo->Func);
890 if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
891 return OFFLOAD_FAIL;
892 }
893
894 if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
895 DP("Threads per block capped at kernel limit %d\n",
896 KernelInfo->MaxThreadsPerBlock);
897 CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
898 }
899
900 unsigned int CudaBlocksPerGrid;
901 if (TeamNum <= 0) {
902 if (LoopTripCount > 0 && EnvNumTeams < 0) {
903 if (KernelInfo->ExecutionMode == SPMD) {
904 // We have a combined construct, i.e. `target teams distribute
905 // parallel for [simd]`. We launch enough teams so that each thread
906 // will execute one iteration of the loop. Round up to the nearest
907 // integer.
908 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
909 } else {
910 // If we reach this point, then we have a non-combined construct, i.e.
911 // `teams distribute` with a nested `parallel for` and each team is
912 // assigned one iteration of the `distribute` loop. E.g.:
913 //
914 // #pragma omp target teams distribute
915 // for(...loop_tripcount...) {
916 // #pragma omp parallel for
917 // for(...) {}
918 // }
919 //
920 // Threads within a team will execute the iterations of the `parallel`
921 // loop.
922 CudaBlocksPerGrid = LoopTripCount;
923 }
924 DP("Using %d teams due to loop trip count %" PRIu32
925 " and number of threads per block %d\n",
926 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
927 } else {
928 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
929 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
930 }
931 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
932 DP("Capping number of teams to team limit %d\n",
933 DeviceData[DeviceId].BlocksPerGrid);
934 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
935 } else {
936 DP("Using requested number of teams %d\n", TeamNum);
937 CudaBlocksPerGrid = TeamNum;
938 }
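// Worked example (illustrative numbers): in SPMD mode with
// LoopTripCount = 10000 and CudaThreadsPerBlock = 128, the launch uses
// ((10000 - 1) / 128) + 1 = 79 blocks, so each loop iteration maps to exactly
// one CUDA thread and the last block is partially idle.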
939
940 INFO(DeviceId,
941 "Launching kernel %s with %d blocks and %d threads in %s "
942 "mode\n",
943 (getOffloadEntry(DeviceId, TgtEntryPtr))
944 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
945 : "(null)",
946 CudaBlocksPerGrid, CudaThreadsPerBlock,
947 (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
948
949 CUstream Stream = getStream(DeviceId, AsyncInfo);
950 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
951 /* gridDimZ */ 1, CudaThreadsPerBlock,
952 /* blockDimY */ 1, /* blockDimZ */ 1,
953 /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
954 if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
955 return OFFLOAD_FAIL;
956
957 DP("Launch of entry point at " DPxMOD " successful!\n",
958 DPxPTR(TgtEntryPtr));
959
960 return OFFLOAD_SUCCESS;
961 }
962
963 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
964 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
965 CUresult Err = cuStreamSynchronize(Stream);
966 if (Err != CUDA_SUCCESS) {
967 DP("Error when synchronizing stream. stream = " DPxMOD
968 ", async info ptr = " DPxMOD "\n",
969 DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
970 CUDA_ERR_STRING(Err);
971 return OFFLOAD_FAIL;
972 }
973
974 // Once the stream is synchronized, return it to the stream pool and reset
975 // async_info. This makes sure the synchronization only covers its own
976 // tasks.
977 StreamManager->returnStream(
978 DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
979 AsyncInfoPtr->Queue = nullptr;
980
981 return OFFLOAD_SUCCESS;
982 }
983 };
984
985 DeviceRTLTy DeviceRTL;
986 } // namespace
987
988 // Exposed library API functions
989 #ifdef __cplusplus
990 extern "C" {
991 #endif
992
993 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
994 return elf_check_machine(image, /* EM_CUDA */ 190);
995 }
996
997 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
998
999 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1000 DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1001 DeviceRTL.setRequiresFlag(RequiresFlags);
1002 return RequiresFlags;
1003 }
1004
1005 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1006 if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1007 DeviceRTL.isValidDeviceId(dst_dev_id))
1008 return 1;
1009
1010 return 0;
1011 }
1012
1013 int32_t __tgt_rtl_init_device(int32_t device_id) {
1014 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1015
1016 return DeviceRTL.initDevice(device_id);
1017 }
1018
1019 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1020 __tgt_device_image *image) {
1021 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1022
1023 return DeviceRTL.loadBinary(device_id, image);
1024 }
1025
1026 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
1027 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1028
1029 return DeviceRTL.dataAlloc(device_id, size);
1030 }
1031
1032 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1033 int64_t size) {
1034 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1035
1036 __tgt_async_info async_info;
1037 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1038 size, &async_info);
1039 if (rc != OFFLOAD_SUCCESS)
1040 return OFFLOAD_FAIL;
1041
1042 return __tgt_rtl_synchronize(device_id, &async_info);
1043 }
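// The remaining synchronous entry points below follow the same pattern as
// __tgt_rtl_data_submit above: build a local __tgt_async_info, forward to the
// corresponding *_async variant, then call __tgt_rtl_synchronize to wait for
// completion and recycle the stream.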
1044
1045 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1046 void *hst_ptr, int64_t size,
1047 __tgt_async_info *async_info_ptr) {
1048 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1049 assert(async_info_ptr && "async_info_ptr is nullptr");
1050
1051 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1052 async_info_ptr);
1053 }
1054
1055 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1056 int64_t size) {
1057 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1058
1059 __tgt_async_info async_info;
1060 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1061 size, &async_info);
1062 if (rc != OFFLOAD_SUCCESS)
1063 return OFFLOAD_FAIL;
1064
1065 return __tgt_rtl_synchronize(device_id, &async_info);
1066 }
1067
1068 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1069 void *tgt_ptr, int64_t size,
1070 __tgt_async_info *async_info_ptr) {
1071 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1072 assert(async_info_ptr && "async_info_ptr is nullptr");
1073
1074 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1075 async_info_ptr);
1076 }
1077
1078 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1079 int dst_dev_id, void *dst_ptr,
1080 int64_t size,
1081 __tgt_async_info *async_info_ptr) {
1082 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1083 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1084 assert(async_info_ptr && "async_info_ptr is nullptr");
1085
1086 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1087 async_info_ptr);
1088 }
1089
1090 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1091 int32_t dst_dev_id, void *dst_ptr,
1092 int64_t size) {
1093 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1094 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1095
1096 __tgt_async_info async_info;
1097 const int32_t rc = __tgt_rtl_data_exchange_async(
1098 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
1099 if (rc != OFFLOAD_SUCCESS)
1100 return OFFLOAD_FAIL;
1101
1102 return __tgt_rtl_synchronize(src_dev_id, &async_info);
1103 }
1104
1105 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1106 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1107
1108 return DeviceRTL.dataDelete(device_id, tgt_ptr);
1109 }
1110
1111 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1112 void **tgt_args,
1113 ptrdiff_t *tgt_offsets,
1114 int32_t arg_num, int32_t team_num,
1115 int32_t thread_limit,
1116 uint64_t loop_tripcount) {
1117 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1118
1119 __tgt_async_info async_info;
1120 const int32_t rc = __tgt_rtl_run_target_team_region_async(
1121 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1122 thread_limit, loop_tripcount, &async_info);
1123 if (rc != OFFLOAD_SUCCESS)
1124 return OFFLOAD_FAIL;
1125
1126 return __tgt_rtl_synchronize(device_id, &async_info);
1127 }
1128
1129 int32_t __tgt_rtl_run_target_team_region_async(
1130 int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1131 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1132 int32_t thread_limit, uint64_t loop_tripcount,
1133 __tgt_async_info *async_info_ptr) {
1134 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1135
1136 return DeviceRTL.runTargetTeamRegion(
1137 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1138 thread_limit, loop_tripcount, async_info_ptr);
1139 }
1140
1141 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1142 void **tgt_args, ptrdiff_t *tgt_offsets,
1143 int32_t arg_num) {
1144 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1145
1146 __tgt_async_info async_info;
1147 const int32_t rc = __tgt_rtl_run_target_region_async(
1148 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
1149 if (rc != OFFLOAD_SUCCESS)
1150 return OFFLOAD_FAIL;
1151
1152 return __tgt_rtl_synchronize(device_id, &async_info);
1153 }
1154
1155 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1156 void *tgt_entry_ptr, void **tgt_args,
1157 ptrdiff_t *tgt_offsets,
1158 int32_t arg_num,
1159 __tgt_async_info *async_info_ptr) {
1160 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1161
1162 return __tgt_rtl_run_target_team_region_async(
1163 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1164 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1165 async_info_ptr);
1166 }
1167
1168 int32_t __tgt_rtl_synchronize(int32_t device_id,
1169 __tgt_async_info *async_info_ptr) {
1170 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1171 assert(async_info_ptr && "async_info_ptr is nullptr");
1172 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1173
1174 return DeviceRTL.synchronize(device_id, async_info_ptr);
1175 }
1176
1177 #ifdef __cplusplus
1178 }
1179 #endif
1180