1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <vector>
21 
22 #include "Debug.h"
23 #include "omptargetplugin.h"
24 
25 #define TARGET_NAME CUDA
26 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
27 
28 // Utility for retrieving and printing CUDA error string.
29 #ifdef OMPTARGET_DEBUG
30 #define CUDA_ERR_STRING(err)                                                   \
31   do {                                                                         \
32     if (getDebugLevel() > 0) {                                                 \
33       const char *errStr = nullptr;                                            \
34       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
35       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
36         DP("Unrecognized CUDA error code: %d\n", err);                         \
37       else if (errStr_status == CUDA_SUCCESS)                                  \
38         DP("CUDA error is: %s\n", errStr);                                     \
39       else {                                                                   \
40         DP("Unresolved CUDA error code: %d\n", err);                           \
41         DP("Unsuccessful cuGetErrorString return status: %d\n",                \
42            errStr_status);                                                     \
43       }                                                                        \
44     }                                                                          \
45   } while (false)
46 #else // OMPTARGET_DEBUG
47 #define CUDA_ERR_STRING(err) {}
48 #endif // OMPTARGET_DEBUG
49 
50 #include "../../common/elf_common.c"
51 
52 /// Keep entries table per device.
53 struct FuncOrGblEntryTy {
54   __tgt_target_table Table;
55   std::vector<__tgt_offload_entry> Entries;
56 };
57 
58 enum ExecutionModeType {
59   SPMD, // constructors, destructors,
60   // combined constructs (`teams distribute parallel for [simd]`)
61   GENERIC, // everything else
62   NONE
63 };
64 
65 /// Use a single entity to encode a kernel and a set of flags.
66 struct KernelTy {
67   CUfunction Func;
68 
69   // execution mode of kernel
70   // 0 - SPMD mode (without master warp)
71   // 1 - Generic mode (with master warp)
72   int8_t ExecutionMode;
73 
74   /// Maximal number of threads per block for this kernel.
75   int MaxThreadsPerBlock = 0;
76 
77   KernelTy(CUfunction _Func, int8_t _ExecutionMode)
78       : Func(_Func), ExecutionMode(_ExecutionMode) {}
79 };
80 
81 /// Device environment data
82 /// Manually sync with the deviceRTL side for now, move to a dedicated header
83 /// file later.
84 struct omptarget_device_environmentTy {
85   int32_t debug_level;
86 };
87 
88 namespace {
89 bool checkResult(CUresult Err, const char *ErrMsg) {
90   if (Err == CUDA_SUCCESS)
91     return true;
92 
93   DP("%s", ErrMsg);
94   CUDA_ERR_STRING(Err);
95   return false;
96 }
97 
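// Asynchronous device-to-device copy of Size bytes on the given stream;
// returns OFFLOAD_SUCCESS or OFFLOAD_FAIL and prints the CUDA error on failure.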
98 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
99                CUstream Stream) {
100   CUresult Err =
101       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
102 
103   if (Err != CUDA_SUCCESS) {
104     DP("Error when copying data from device to device. Pointers: src "
105        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
106        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
107     CUDA_ERR_STRING(Err);
108     return OFFLOAD_FAIL;
109   }
110 
111   return OFFLOAD_SUCCESS;
112 }
113 
114 // Structure containing per-device data
115 struct DeviceDataTy {
116   /// List that contains all the kernels.
117   std::list<KernelTy> KernelsList;
118 
119   std::list<FuncOrGblEntryTy> FuncGblEntries;
120 
121   CUcontext Context = nullptr;
122   // Device properties
123   int ThreadsPerBlock = 0;
124   int BlocksPerGrid = 0;
125   int WarpSize = 0;
126   // OpenMP properties
127   int NumTeams = 0;
128   int NumThreads = 0;
129 };
130 
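// Pool-based manager of CUDA streams, one pool per device. Streams are
// created at device initialization, grown on demand, handed out for
// asynchronous operations, and returned to the pool after synchronization so
// later target regions can reuse them.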
131 class StreamManagerTy {
132   int NumberOfDevices;
133   // The initial size of stream pool
134   int EnvNumInitialStreams;
135   // Per-device stream mutex
136   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
137   // Per-device stream Id indicates the next available stream in the pool
138   std::vector<int> NextStreamId;
139   // Per-device stream pool
140   std::vector<std::vector<CUstream>> StreamPool;
141   // Reference to per-device data
142   std::vector<DeviceDataTy> &DeviceData;
143 
144   // If there is no CUstream left in the pool, we resize the pool to allocate
145   // more CUstreams. This function must be called with the device mutex held,
146   // and the pool is never shrunk.
147   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
148     std::vector<CUstream> &Pool = StreamPool[DeviceId];
149     const size_t CurrentSize = Pool.size();
150     assert(NewSize > CurrentSize && "new size is not larger than current size");
151 
152     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
153     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
154       // Return if we cannot switch to the right context, to avoid creating
155       // a bunch of streams that do not belong to the right device. The
156       // offloading would fail later because the selected CUstream is
157       // nullptr.
158       return;
159     }
160 
161     Pool.resize(NewSize, nullptr);
162 
163     for (size_t I = CurrentSize; I < NewSize; ++I) {
164       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
165                   "Error returned from cuStreamCreate\n");
166     }
167   }
168 
169 public:
170   StreamManagerTy(const int NumberOfDevices,
171                   std::vector<DeviceDataTy> &DeviceData)
172       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
173         DeviceData(DeviceData) {
174     StreamPool.resize(NumberOfDevices);
175     NextStreamId.resize(NumberOfDevices);
176     StreamMtx.resize(NumberOfDevices);
177 
178     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
179       EnvNumInitialStreams = std::stoi(EnvStr);
180 
181     // Initialize the next stream id
182     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
183 
184     // Initialize stream mutex
185     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
186       Ptr = std::make_unique<std::mutex>();
187   }
188 
189   ~StreamManagerTy() {
190     // Destroy streams
191     for (int I = 0; I < NumberOfDevices; ++I) {
192       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
193                   "Error returned from cuCtxSetCurrent\n");
194 
195       for (CUstream &S : StreamPool[I]) {
196         if (S)
197           checkResult(cuStreamDestroy(S),
198                       "Error returned from cuStreamDestroy\n");
199       }
200     }
201   }
202 
203   // Get a CUstream from the pool. The per-device next stream id always
204   // points to the next available CUstream; that is, CUstreams [0, id-1] have
205   // been assigned and [id, ...) are still available. If no CUstream is left,
206   // we request more CUstreams from the CUDA RT. Each time a CUstream is
207   // assigned, the id is incremented by one.
208   // xxxxxs+++++++++
209   //      ^
210   //      id
211   // After the assignment, s has been handed out and the pool becomes:
212   // xxxxxs+++++++++
213   //       ^
214   //       id
215   CUstream getStream(const int DeviceId) {
216     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
217     int &Id = NextStreamId[DeviceId];
218     // No CUstream left in the pool, we need to request from CUDA RT
219     if (Id == StreamPool[DeviceId].size()) {
220       // By default we double the stream pool every time
221       resizeStreamPool(DeviceId, Id * 2);
222     }
223     return StreamPool[DeviceId][Id++];
224   }
225 
226   // Return a CUstream back to the pool. As mentioned above, the per-device
227   // next stream id always points to the next available CUstream, so when we
228   // return a CUstream we first decrement the id and then copy the CUstream
229   // back into that slot.
230   // Note that streams may be returned in a different order than they were
231   // assigned, so at some point the pool may briefly contain two identical
232   // CUstreams.
233   // xxax+a+++++
234   //     ^
235   //     id
236   // However, this does not matter, because the duplicates are always on the
237   // two sides of id, and the one on the left will eventually be overwritten
238   // by another returned CUstream. Therefore, after several executions the
239   // order of the pool may differ from its initial state.
240   void returnStream(const int DeviceId, CUstream Stream) {
241     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
242     int &Id = NextStreamId[DeviceId];
243     assert(Id > 0 && "Wrong stream ID");
244     StreamPool[DeviceId][--Id] = Stream;
245   }
246 
247   bool initializeDeviceStreamPool(const int DeviceId) {
248     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
249 
250     resizeStreamPool(DeviceId, EnvNumInitialStreams);
251 
252     // Check the size of stream pool
253     if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
254       return false;
255 
256     // Check whether each stream is valid
257     for (CUstream &S : StreamPool[DeviceId])
258       if (!S)
259         return false;
260 
261     return true;
262   }
263 };
264 
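// The per-plugin device RTL object: owns the stream manager, the per-device
// data, and the loaded CUmodules, and implements the operations behind the
// exposed __tgt_rtl_* entry points.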
265 class DeviceRTLTy {
266   int NumberOfDevices;
267   // OpenMP environment properties
268   int EnvNumTeams;
269   int EnvTeamLimit;
270   // OpenMP requires flags
271   int64_t RequiresFlags;
272 
273   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
274   static constexpr const int HardThreadLimit = 1024;
275   static constexpr const int DefaultNumTeams = 128;
276   static constexpr const int DefaultNumThreads = 128;
277 
278   std::unique_ptr<StreamManagerTy> StreamManager;
279   std::vector<DeviceDataTy> DeviceData;
280   std::vector<CUmodule> Modules;
281 
282   // Record entry point associated with device
283   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
284     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
285     E.Entries.push_back(entry);
286   }
287 
288   // Return a pointer to the entry associated with the pointer
289   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
290                                              const void *Addr) const {
291     for (const __tgt_offload_entry &Itr :
292          DeviceData[DeviceId].FuncGblEntries.back().Entries)
293       if (Itr.addr == Addr)
294         return &Itr;
295 
296     return nullptr;
297   }
298 
299   // Return the pointer to the target entries table
300   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
301     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
302 
303     if (E.Entries.empty())
304       return nullptr;
305 
306     // Update table info according to the entries and return the pointer
307     E.Table.EntriesBegin = E.Entries.data();
308     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
309 
310     return &E.Table;
311   }
312 
313   // Clear entries table for a device
314   void clearOffloadEntriesTable(const int DeviceId) {
315     DeviceData[DeviceId].FuncGblEntries.emplace_back();
316     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
317     E.Entries.clear();
318     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
319   }
320 
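  // Return the CUstream bound to AsyncInfoPtr, taking one from the per-device
  // pool on first use so that all operations sharing the same async info
  // object run on the same stream.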
321   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
322     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
323 
324     if (!AsyncInfoPtr->Queue)
325       AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);
326 
327     return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
328   }
329 
330 public:
331   // This class should not be copied
332   DeviceRTLTy(const DeviceRTLTy &) = delete;
333   DeviceRTLTy(DeviceRTLTy &&) = delete;
334 
335   DeviceRTLTy()
336       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
337         RequiresFlags(OMP_REQ_UNDEFINED) {
338 
339     DP("Start initializing CUDA\n");
340 
341     CUresult Err = cuInit(0);
342     if (!checkResult(Err, "Error returned from cuInit\n")) {
343       return;
344     }
345 
346     Err = cuDeviceGetCount(&NumberOfDevices);
347     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
348       return;
349 
350     if (NumberOfDevices == 0) {
351       DP("There are no devices supporting CUDA.\n");
352       return;
353     }
354 
355     DeviceData.resize(NumberOfDevices);
356 
357     // Get environment variables regarding teams
358     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
359       // OMP_TEAM_LIMIT has been set
360       EnvTeamLimit = std::stoi(EnvStr);
361       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
362     }
363     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
364       // OMP_NUM_TEAMS has been set
365       EnvNumTeams = std::stoi(EnvStr);
366       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
367     }
368 
369     StreamManager =
370         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
371   }
372 
373   ~DeviceRTLTy() {
374     // Destruct the stream manager first, before the contexts are destroyed.
375     StreamManager = nullptr;
376 
377     for (CUmodule &M : Modules)
378       // Close module
379       if (M)
380         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
381 
382     for (DeviceDataTy &D : DeviceData) {
383       // Destroy context
384       if (D.Context) {
385         checkResult(cuCtxSetCurrent(D.Context),
386                     "Error returned from cuCtxSetCurrent\n");
387         CUdevice Device;
388         checkResult(cuCtxGetDevice(&Device),
389                     "Error returned from cuCtxGetDevice\n");
390         checkResult(cuDevicePrimaryCtxRelease(Device),
391                     "Error returned from cuDevicePrimaryCtxRelease\n");
392       }
393     }
394   }
395 
396   // Check whether a given DeviceId is valid
397   bool isValidDeviceId(const int DeviceId) const {
398     return DeviceId >= 0 && DeviceId < NumberOfDevices;
399   }
400 
401   int getNumOfDevices() const { return NumberOfDevices; }
402 
403   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
404 
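  // Initialize the device: retain its primary context, set up the stream
  // pool, and derive launch limits (blocks per grid, threads per block, warp
  // size) and the default number of teams and threads from device attributes
  // and environment variables.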
405   int initDevice(const int DeviceId) {
406     CUdevice Device;
407 
408     DP("Getting device %d\n", DeviceId);
409     CUresult Err = cuDeviceGet(&Device, DeviceId);
410     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
411       return OFFLOAD_FAIL;
412 
413     // Query the current flags of the primary context and set its flags if
414     // it is inactive
415     unsigned int FormerPrimaryCtxFlags = 0;
416     int FormerPrimaryCtxIsActive = 0;
417     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
418                                      &FormerPrimaryCtxIsActive);
419     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
420       return OFFLOAD_FAIL;
421 
422     if (FormerPrimaryCtxIsActive) {
423       DP("The primary context is active, no change to its flags\n");
424       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
425           CU_CTX_SCHED_BLOCKING_SYNC)
426         DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
427     } else {
428       DP("The primary context is inactive, set its flags to "
429          "CU_CTX_SCHED_BLOCKING_SYNC\n");
430       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
431       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
432         return OFFLOAD_FAIL;
433     }
434 
435     // Retain the per device primary context and save it to use whenever this
436     // device is selected.
437     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
438     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
439       return OFFLOAD_FAIL;
440 
441     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
442     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
443       return OFFLOAD_FAIL;
444 
445     // Initialize stream pool
446     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
447       return OFFLOAD_FAIL;
448 
449     // Query attributes to determine number of threads/block and blocks/grid.
450     int MaxGridDimX;
451     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
452                                Device);
453     if (Err != CUDA_SUCCESS) {
454       DP("Error getting max grid dimension, using default value %d\n",
455          DeviceRTLTy::DefaultNumTeams);
456       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
457     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
458       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
459       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
460     } else {
461       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
462          "at the hard limit\n",
463          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
464       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
465     }
466 
467     // We are only exploiting threads along the x axis.
468     int MaxBlockDimX;
469     Err = cuDeviceGetAttribute(&MaxBlockDimX,
470                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
471     if (Err != CUDA_SUCCESS) {
472       DP("Error getting max block dimension, using default value %d\n",
473          DeviceRTLTy::DefaultNumThreads);
474       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
475     } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
476       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
477       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
478     } else {
479       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
480          "capping at the hard limit\n",
481          MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
482       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
483     }
484 
485     // Get and set warp size
486     int WarpSize;
487     Err =
488         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
489     if (Err != CUDA_SUCCESS) {
490       DP("Error getting warp size, assuming default value 32\n");
491       DeviceData[DeviceId].WarpSize = 32;
492     } else {
493       DP("Using warp size %d\n", WarpSize);
494       DeviceData[DeviceId].WarpSize = WarpSize;
495     }
496 
497     // Adjust teams to the env variables
498     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
499       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
500          EnvTeamLimit);
501       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
502     }
503 
504     INFO(DeviceId,
505          "Device supports up to %d CUDA blocks and %d threads with a "
506          "warp size of %d\n",
507          DeviceData[DeviceId].BlocksPerGrid,
508          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
509 
510     // Set default number of teams
511     if (EnvNumTeams > 0) {
512       DP("Default number of teams set according to environment %d\n",
513          EnvNumTeams);
514       DeviceData[DeviceId].NumTeams = EnvNumTeams;
515     } else {
516       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
517       DP("Default number of teams set according to library's default %d\n",
518          DeviceRTLTy::DefaultNumTeams);
519     }
520 
521     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
522       DP("Default number of teams exceeds device limit, capping at %d\n",
523          DeviceData[DeviceId].BlocksPerGrid);
524       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
525     }
526 
527     // Set default number of threads
528     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
529     DP("Default number of threads set according to library's default %d\n",
530        DeviceRTLTy::DefaultNumThreads);
531     if (DeviceData[DeviceId].NumThreads >
532         DeviceData[DeviceId].ThreadsPerBlock) {
533       DP("Default number of threads exceeds device limit, capping at %d\n",
534          DeviceData[DeviceId].ThreadsPerBlock);
535       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
536     }
537 
538     return OFFLOAD_SUCCESS;
539   }
540 
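  // Load a device image into a CUmodule and build the offload entries table:
  // globals are resolved with cuModuleGetGlobal, kernels with
  // cuModuleGetFunction (plus their <name>_exec_mode flag), and the device
  // environment struct is written to the module if the symbol exists.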
541   __tgt_target_table *loadBinary(const int DeviceId,
542                                  const __tgt_device_image *Image) {
543     // Set the context we are using
544     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
545     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
546       return nullptr;
547 
548     // Clear the offload table as we are going to create a new one.
549     clearOffloadEntriesTable(DeviceId);
550 
551     // Create the module and extract the function pointers.
552     CUmodule Module;
553     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
554     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
555     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
556       return nullptr;
557 
558     DP("CUDA module successfully loaded!\n");
559 
560     Modules.push_back(Module);
561 
562     // Find the symbols in the module by name.
563     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
564     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
565 
566     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
567     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
568       if (!E->addr) {
569         // We return nullptr when something like this happens; the host
570         // should always have something in the address field to uniquely
571         // identify the target region.
572         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
573         return nullptr;
574       }
575 
576       if (E->size) {
577         __tgt_offload_entry Entry = *E;
578         CUdeviceptr CUPtr;
579         size_t CUSize;
580         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
581         // We keep this style here because we need the name
582         if (Err != CUDA_SUCCESS) {
583           DP("Loading global '%s' (Failed)\n", E->name);
584           CUDA_ERR_STRING(Err);
585           return nullptr;
586         }
587 
588         if (CUSize != E->size) {
589           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
590              CUSize, E->size);
591           return nullptr;
592         }
593 
594         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
595            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
596 
597         Entry.addr = (void *)(CUPtr);
598 
599         // Note: In the current implementation declare target variables
600         // can either be link or to. This means that once unified
601         // memory is activated via the requires directive, the variable
602         // can be used directly from the host in both cases.
603         // TODO: when variable types other than to or link are added,
604         // the below condition should be changed to explicitly
605         // check for to and link variable types:
606         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
607         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
608         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
609           // If unified memory is present any target link or to variables
610           // can access host addresses directly. There is no longer a
611           // need for device copies.
612           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
613           DP("Copy linked variable host address (" DPxMOD
614              ") to device address (" DPxMOD ")\n",
615              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
616         }
617 
618         addOffloadEntry(DeviceId, Entry);
619 
620         continue;
621       }
622 
623       CUfunction Func;
624       Err = cuModuleGetFunction(&Func, Module, E->name);
625       // We keep this style here because we need the name
626       if (Err != CUDA_SUCCESS) {
627         DP("Loading '%s' (Failed)\n", E->name);
628         CUDA_ERR_STRING(Err);
629         return nullptr;
630       }
631 
632       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
633          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
634 
635       // default value GENERIC (in case symbol is missing from cubin file)
636       int8_t ExecModeVal = ExecutionModeType::GENERIC;
637       std::string ExecModeNameStr(E->name);
638       ExecModeNameStr += "_exec_mode";
639       const char *ExecModeName = ExecModeNameStr.c_str();
640 
641       CUdeviceptr ExecModePtr;
642       size_t CUSize;
643       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
644       if (Err == CUDA_SUCCESS) {
645         if (CUSize != sizeof(int8_t)) {
646           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
647              ExecModeName, CUSize, sizeof(int8_t));
648           return nullptr;
649         }
650 
651         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
652         if (Err != CUDA_SUCCESS) {
653           DP("Error when copying data from device to host. Pointers: "
654              "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
655              DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
656           CUDA_ERR_STRING(Err);
657           return nullptr;
658         }
659 
660         if (ExecModeVal < 0 || ExecModeVal > 1) {
661           DP("Error: wrong exec_mode value specified in cubin file: %d\n",
662              ExecModeVal);
663           return nullptr;
664         }
665       } else {
666         DP("Loading global exec_mode '%s' - symbol missing, using default "
667            "value GENERIC (1)\n",
668            ExecModeName);
669         CUDA_ERR_STRING(Err);
670       }
671 
672       KernelsList.emplace_back(Func, ExecModeVal);
673 
674       __tgt_offload_entry Entry = *E;
675       Entry.addr = &KernelsList.back();
676       addOffloadEntry(DeviceId, Entry);
677     }
678 
679     // send device environment data to the device
680     {
681       omptarget_device_environmentTy DeviceEnv{0};
682 
683 #ifdef OMPTARGET_DEBUG
684       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
685         DeviceEnv.debug_level = std::stoi(EnvStr);
686 #endif
687 
688       const char *DeviceEnvName = "omptarget_device_environment";
689       CUdeviceptr DeviceEnvPtr;
690       size_t CUSize;
691 
692       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
693       if (Err == CUDA_SUCCESS) {
694         if (CUSize != sizeof(DeviceEnv)) {
695           DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
696              DeviceEnvName, CUSize, sizeof(DeviceEnv));
697           CUDA_ERR_STRING(Err);
698           return nullptr;
699         }
700 
701         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
702         if (Err != CUDA_SUCCESS) {
703           DP("Error when copying data from host to device. Pointers: "
704              "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
705              DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
706           CUDA_ERR_STRING(Err);
707           return nullptr;
708         }
709 
710         DP("Sending global device environment data %zu bytes\n", CUSize);
711       } else {
712         DP("Finding global device environment '%s' - symbol missing.\n",
713            DeviceEnvName);
714         DP("Continuing, assuming this is a device RTL that does not accept "
715            "environment settings.\n");
716       }
717     }
718 
719     return getOffloadEntriesTable(DeviceId);
720   }
721 
722   void *dataAlloc(const int DeviceId, const int64_t Size) const {
723     if (Size == 0)
724       return nullptr;
725 
726     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
727     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
728       return nullptr;
729 
730     CUdeviceptr DevicePtr;
731     Err = cuMemAlloc(&DevicePtr, Size);
732     if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
733       return nullptr;
734 
735     return (void *)DevicePtr;
736   }
737 
738   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
739                  const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
740     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
741 
742     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
743     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
744       return OFFLOAD_FAIL;
745 
746     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
747 
748     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
749     if (Err != CUDA_SUCCESS) {
750       DP("Error when copying data from host to device. Pointers: host = " DPxMOD
751          ", device = " DPxMOD ", size = %" PRId64 "\n",
752          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
753       CUDA_ERR_STRING(Err);
754       return OFFLOAD_FAIL;
755     }
756 
757     return OFFLOAD_SUCCESS;
758   }
759 
760   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
761                    const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
762     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
763 
764     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
765     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
766       return OFFLOAD_FAIL;
767 
768     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
769 
770     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
771     if (Err != CUDA_SUCCESS) {
772       DP("Error when copying data from device to host. Pointers: host = " DPxMOD
773          ", device = " DPxMOD ", size = %" PRId64 "\n",
774          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
775       CUDA_ERR_STRING(Err);
776       return OFFLOAD_FAIL;
777     }
778 
779     return OFFLOAD_SUCCESS;
780   }
781 
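  // Copy data between two devices: try a peer-to-peer copy first and fall
  // back to an ordinary device-to-device copy if peer access is unavailable
  // or fails.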
782   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
783                    int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
784     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
785 
786     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
787     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
788       return OFFLOAD_FAIL;
789 
790     CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);
791 
792     // If these are two different devices, we try a peer-to-peer copy first
793     if (SrcDevId != DstDevId) {
794       int CanAccessPeer = 0;
795       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
796       if (Err != CUDA_SUCCESS) {
797         DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
798            ", dst = %" PRId32 "\n",
799            SrcDevId, DstDevId);
800         CUDA_ERR_STRING(Err);
801         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
802       }
803 
804       if (!CanAccessPeer) {
805         DP("P2P memcpy not supported so falling back to D2D memcpy\n");
806         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
807       }
808 
809       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
810       if (Err != CUDA_SUCCESS) {
811         DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
812            ", dst = %" PRId32 "\n",
813            SrcDevId, DstDevId);
814         CUDA_ERR_STRING(Err);
815         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
816       }
817 
818       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
819                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
820                               Size, Stream);
821       if (Err == CUDA_SUCCESS)
822         return OFFLOAD_SUCCESS;
823 
824       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
825          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
826          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
827       CUDA_ERR_STRING(Err);
828     }
829 
830     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
831   }
832 
833   int dataDelete(const int DeviceId, void *TgtPtr) const {
834     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
835     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
836       return OFFLOAD_FAIL;
837 
838     Err = cuMemFree((CUdeviceptr)TgtPtr);
839     if (!checkResult(Err, "Error returned from cuMemFree\n"))
840       return OFFLOAD_FAIL;
841 
842     return OFFLOAD_SUCCESS;
843   }
844 
845   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
846                           ptrdiff_t *TgtOffsets, const int ArgNum,
847                           const int TeamNum, const int ThreadLimit,
848                           const unsigned int LoopTripCount,
849                           __tgt_async_info *AsyncInfo) const {
850     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
851     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
852       return OFFLOAD_FAIL;
853 
854     // All args are passed by reference; cuLaunchKernel expects an array of pointers to the argument values.
855     std::vector<void *> Args(ArgNum);
856     std::vector<void *> Ptrs(ArgNum);
857 
858     for (int I = 0; I < ArgNum; ++I) {
859       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
860       Args[I] = &Ptrs[I];
861     }
862 
863     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
864 
865     int CudaThreadsPerBlock;
866     if (ThreadLimit > 0) {
867       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
868       CudaThreadsPerBlock = ThreadLimit;
869       // Add master warp if necessary
870       if (KernelInfo->ExecutionMode == GENERIC) {
871         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
872         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
873       }
874     } else {
875       DP("Setting CUDA threads per block to default %d\n",
876          DeviceData[DeviceId].NumThreads);
877       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
878     }
879 
880     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
881       DP("Threads per block capped at device limit %d\n",
882          DeviceData[DeviceId].ThreadsPerBlock);
883       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
884     }
885 
886     if (!KernelInfo->MaxThreadsPerBlock) {
887       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
888                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
889                                KernelInfo->Func);
890       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
891         return OFFLOAD_FAIL;
892     }
893 
894     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
895       DP("Threads per block capped at kernel limit %d\n",
896          KernelInfo->MaxThreadsPerBlock);
897       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
898     }
899 
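    // Determine the number of blocks (teams): honor an explicit team count if
    // given (capped at the device's blocks-per-grid limit), otherwise derive
    // it from the loop trip count or fall back to the default number of teams.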
900     unsigned int CudaBlocksPerGrid;
901     if (TeamNum <= 0) {
902       if (LoopTripCount > 0 && EnvNumTeams < 0) {
903         if (KernelInfo->ExecutionMode == SPMD) {
904           // We have a combined construct, i.e. `target teams distribute
905           // parallel for [simd]`. We launch enough teams so that each thread
906           // will execute one iteration of the loop, rounding up to the
907           // nearest integer.
908           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
909         } else {
910           // If we reach this point, then we have a non-combined construct, i.e.
911           // `teams distribute` with a nested `parallel for` and each team is
912           // assigned one iteration of the `distribute` loop. E.g.:
913           //
914           // #pragma omp target teams distribute
915           // for(...loop_tripcount...) {
916           //   #pragma omp parallel for
917           //   for(...) {}
918           // }
919           //
920           // Threads within a team will execute the iterations of the `parallel`
921           // loop.
922           CudaBlocksPerGrid = LoopTripCount;
923         }
924         DP("Using %d teams due to loop trip count %" PRIu32
925            " and number of threads per block %d\n",
926            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
927       } else {
928         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
929         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
930       }
931     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
932       DP("Capping number of teams to team limit %d\n",
933          DeviceData[DeviceId].BlocksPerGrid);
934       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
935     } else {
936       DP("Using requested number of teams %d\n", TeamNum);
937       CudaBlocksPerGrid = TeamNum;
938     }
939 
940     INFO(DeviceId,
941          "Launching kernel %s with %d blocks and %d threads in %s "
942          "mode\n",
943          (getOffloadEntry(DeviceId, TgtEntryPtr))
944              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
945              : "(null)",
946          CudaBlocksPerGrid, CudaThreadsPerBlock,
947          (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
948 
949     CUstream Stream = getStream(DeviceId, AsyncInfo);
950     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
951                          /* gridDimZ */ 1, CudaThreadsPerBlock,
952                          /* blockDimY */ 1, /* blockDimZ */ 1,
953                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
954     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
955       return OFFLOAD_FAIL;
956 
957     DP("Launch of entry point at " DPxMOD " successful!\n",
958        DPxPTR(TgtEntryPtr));
959 
960     return OFFLOAD_SUCCESS;
961   }
962 
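  // Block until all work queued on the stream bound to AsyncInfoPtr has
  // completed, then return the stream to the pool and clear the queue so the
  // async info object can be reused.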
963   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
964     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
965     CUresult Err = cuStreamSynchronize(Stream);
966     if (Err != CUDA_SUCCESS) {
967       DP("Error when synchronizing stream. stream = " DPxMOD
968          ", async info ptr = " DPxMOD "\n",
969          DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
970       CUDA_ERR_STRING(Err);
971       return OFFLOAD_FAIL;
972     }
973 
974     // Once the stream is synchronized, return it to the stream pool and
975     // reset the async info. This ensures the synchronization only applies to
976     // its own tasks.
977     StreamManager->returnStream(
978         DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
979     AsyncInfoPtr->Queue = nullptr;
980 
981     return OFFLOAD_SUCCESS;
982   }
983 };
984 
985 DeviceRTLTy DeviceRTL;
986 } // namespace
987 
988 // Exposed library API functions
989 #ifdef __cplusplus
990 extern "C" {
991 #endif
992 
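// The C entry points below are thin wrappers around the DeviceRTLTy
// singleton. Synchronous variants create a local __tgt_async_info, call the
// corresponding *_async function, and then synchronize on it.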
993 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
994   return elf_check_machine(image, /* EM_CUDA */ 190);
995 }
996 
997 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
998 
999 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1000   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1001   DeviceRTL.setRequiresFlag(RequiresFlags);
1002   return RequiresFlags;
1003 }
1004 
1005 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1006   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1007       DeviceRTL.isValidDeviceId(dst_dev_id))
1008     return 1;
1009 
1010   return 0;
1011 }
1012 
1013 int32_t __tgt_rtl_init_device(int32_t device_id) {
1014   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1015 
1016   return DeviceRTL.initDevice(device_id);
1017 }
1018 
1019 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1020                                           __tgt_device_image *image) {
1021   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1022 
1023   return DeviceRTL.loadBinary(device_id, image);
1024 }
1025 
1026 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
1027   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1028 
1029   return DeviceRTL.dataAlloc(device_id, size);
1030 }
1031 
1032 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1033                               int64_t size) {
1034   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1035 
1036   __tgt_async_info async_info;
1037   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1038                                                  size, &async_info);
1039   if (rc != OFFLOAD_SUCCESS)
1040     return OFFLOAD_FAIL;
1041 
1042   return __tgt_rtl_synchronize(device_id, &async_info);
1043 }
1044 
1045 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1046                                     void *hst_ptr, int64_t size,
1047                                     __tgt_async_info *async_info_ptr) {
1048   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1049   assert(async_info_ptr && "async_info_ptr is nullptr");
1050 
1051   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1052                               async_info_ptr);
1053 }
1054 
1055 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1056                                 int64_t size) {
1057   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1058 
1059   __tgt_async_info async_info;
1060   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1061                                                    size, &async_info);
1062   if (rc != OFFLOAD_SUCCESS)
1063     return OFFLOAD_FAIL;
1064 
1065   return __tgt_rtl_synchronize(device_id, &async_info);
1066 }
1067 
1068 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1069                                       void *tgt_ptr, int64_t size,
1070                                       __tgt_async_info *async_info_ptr) {
1071   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1072   assert(async_info_ptr && "async_info_ptr is nullptr");
1073 
1074   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1075                                 async_info_ptr);
1076 }
1077 
1078 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1079                                       int dst_dev_id, void *dst_ptr,
1080                                       int64_t size,
1081                                       __tgt_async_info *async_info_ptr) {
1082   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1083   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1084   assert(async_info_ptr && "async_info_ptr is nullptr");
1085 
1086   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1087                                 async_info_ptr);
1088 }
1089 
1090 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1091                                 int32_t dst_dev_id, void *dst_ptr,
1092                                 int64_t size) {
1093   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1094   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1095 
1096   __tgt_async_info async_info;
1097   const int32_t rc = __tgt_rtl_data_exchange_async(
1098       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
1099   if (rc != OFFLOAD_SUCCESS)
1100     return OFFLOAD_FAIL;
1101 
1102   return __tgt_rtl_synchronize(src_dev_id, &async_info);
1103 }
1104 
1105 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1106   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1107 
1108   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1109 }
1110 
1111 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1112                                          void **tgt_args,
1113                                          ptrdiff_t *tgt_offsets,
1114                                          int32_t arg_num, int32_t team_num,
1115                                          int32_t thread_limit,
1116                                          uint64_t loop_tripcount) {
1117   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1118 
1119   __tgt_async_info async_info;
1120   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1121       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1122       thread_limit, loop_tripcount, &async_info);
1123   if (rc != OFFLOAD_SUCCESS)
1124     return OFFLOAD_FAIL;
1125 
1126   return __tgt_rtl_synchronize(device_id, &async_info);
1127 }
1128 
1129 int32_t __tgt_rtl_run_target_team_region_async(
1130     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1131     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1132     int32_t thread_limit, uint64_t loop_tripcount,
1133     __tgt_async_info *async_info_ptr) {
1134   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1135 
1136   return DeviceRTL.runTargetTeamRegion(
1137       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1138       thread_limit, loop_tripcount, async_info_ptr);
1139 }
1140 
1141 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1142                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1143                                     int32_t arg_num) {
1144   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1145 
1146   __tgt_async_info async_info;
1147   const int32_t rc = __tgt_rtl_run_target_region_async(
1148       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
1149   if (rc != OFFLOAD_SUCCESS)
1150     return OFFLOAD_FAIL;
1151 
1152   return __tgt_rtl_synchronize(device_id, &async_info);
1153 }
1154 
1155 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1156                                           void *tgt_entry_ptr, void **tgt_args,
1157                                           ptrdiff_t *tgt_offsets,
1158                                           int32_t arg_num,
1159                                           __tgt_async_info *async_info_ptr) {
1160   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1161 
1162   return __tgt_rtl_run_target_team_region_async(
1163       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1164       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1165       async_info_ptr);
1166 }
1167 
1168 int32_t __tgt_rtl_synchronize(int32_t device_id,
1169                               __tgt_async_info *async_info_ptr) {
1170   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1171   assert(async_info_ptr && "async_info_ptr is nullptr");
1172   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1173 
1174   return DeviceRTL.synchronize(device_id, async_info_ptr);
1175 }
1176 
1177 #ifdef __cplusplus
1178 }
1179 #endif
1180