//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_H
#define OMPTARGET_H

#include "target_impl.h"
#include "common/debug.h" // debug
#include "interface.h"    // interfaces with omp, compiler, and user
#include "common/state-queue.h"
#include "common/support.h"

#define OMPTARGET_NVPTX_VERSION 1.1

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1

// arguments needed for L0 parallelism only.
class omptarget_nvptx_SharedArgs {
public:
  // All these methods must be called by the master thread only.
  INLINE void Init() {
    args = buffer;
    nArgs = MAX_SHARED_ARGS;
  }
  INLINE void DeInit() {
    // Free any memory allocated for outlined parallel function with a large
    // number of arguments.
    if (nArgs > MAX_SHARED_ARGS) {
      SafeFree(args, "new extended args");
      Init();
    }
  }
  INLINE void EnsureSize(size_t size) {
    if (size > nArgs) {
      if (nArgs > MAX_SHARED_ARGS) {
        SafeFree(args, "new extended args");
      }
      args = (void **)SafeMalloc(size * sizeof(void *), "new extended args");
      nArgs = size;
    }
  }
  // Called by all threads.
  INLINE void **GetArgs() const { return args; };

private:
  // buffer of pre-allocated arguments.
  void *buffer[MAX_SHARED_ARGS];
  // pointer to arguments buffer.
  // starts off as a pointer to 'buffer' but can be dynamically allocated.
  void **args;
  // starts off as MAX_SHARED_ARGS but can increase in size.
  uint32_t nArgs;
};

extern DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
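
// Illustrative usage sketch (not part of this header; the master-thread check
// and 'numArgs' are assumptions). The master thread sizes and fills the shared
// argument buffer, and every thread then reads the same pointer via GetArgs():
//
//   if (threadIsTeamMaster) {
//     omptarget_nvptx_globalArgs.EnsureSize(numArgs);
//     void **args = omptarget_nvptx_globalArgs.GetArgs();
//     for (int i = 0; i < numArgs; ++i)
//       args[i] = /* address of the i-th outlined-function argument */;
//   }
//   // ... once the workers are released:
//   void **args = omptarget_nvptx_globalArgs.GetArgs();
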
// Data structure to keep in shared memory that traces the current slot, stack,
// and frame pointer as well as the active threads that didn't exit the current
// environment.
struct DataSharingStateTy {
  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
  void *StackPtr[DS_Max_Warp_Number];
  void * volatile FramePtr[DS_Max_Warp_Number];
  __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker
// slot size of 4*32 bytes.
struct __kmpc_data_sharing_worker_slot_static {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Worker_Warp_Slot_Size];
};

extern DEVICE SHARED DataSharingStateTy DataSharingState;

////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state

class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  INLINE omp_sched_t GetRuntimeSched() const;
  INLINE void SetRuntimeSched(omp_sched_t sched);
  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
  }
  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
  }
  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
  // methods for other fields
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  // init & copy
  INLINE void InitLevelZeroTaskDescr();
  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;

private:
  // bits for flags: (6 used, 2 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (InPar) if this thread has encountered one or more parallel regions
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2+) if this thread has encountered an L2 or higher parallel
  //         region
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;

  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    int64_t chunk;
    int64_t stride;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 6 bits used (see flags above)
    uint8_t unused;
    uint16_t threadId;         // thread id
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;
  omptarget_nvptx_TaskDescr *prev;
};

// build on kmp
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr; // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
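
// Illustrative sketch (an assumption about the intended use of this layout,
// not quoted from the runtime sources): because 'taskDescr' must be the first
// member, a pointer to the whole struct can be treated as an
// omptarget_nvptx_TaskDescr*, and the enclosing descriptor can be recovered
// from a pointer to the embedded kmp_TaskDescr by stepping back over the
// leading member:
//
//   omptarget_nvptx_ExplicitTaskDescr *expl =
//       (omptarget_nvptx_ExplicitTaskDescr *)(
//           (char *)kmpDescrPtr - sizeof(omptarget_nvptx_TaskDescr));
//
// where 'kmpDescrPtr' is assumed to point at the 'kmpTaskDescr' member.
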
////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)

class omptarget_nvptx_WorkDescr {

public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }

private:
  omptarget_nvptx_TaskDescr masterTaskICV;
};

////////////////////////////////////////////////////////////////////////////////

class omptarget_nvptx_TeamDescr {
public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
    return &levelZeroTaskDescr;
  }
  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
    return workDescrForActiveParallel;
  }

  // init
  INLINE void InitTeamDescr();

  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

private:
  omptarget_nvptx_TaskDescr
      levelZeroTaskDescr; // icv for team master initial thread
  omptarget_nvptx_WorkDescr
      workDescrForActiveParallel; // one, ONLY for the active parallel region

  ALIGN(16)
  __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
};

////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// do not support multiple concurrent kernels at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  // task
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
  // parallel
  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
    return nextRegion.tnum[tid];
  }
  // schedule (for dispatch)
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }

  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
  INLINE uint64_t &Cnt() { return cnt; }

private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer where to find the current task ICV (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  union {
    // Only one of the two is live at the same time.
    // parallel
    uint16_t tnum[MAX_THREADS_PER_TEAM];
  } nextRegion;
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  uint64_t cnt;
};
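
// Illustrative sketch (assumed caller pattern, not part of this header): the
// accessors above return references indexed by the global thread id, so
// dispatch code can read and update per-thread loop state in place.  Here
// 'gtid', 'sched', 'chunkSize', 'lb', 'ub' and 'st' are placeholders, and
// 'omptarget_nvptx_threadPrivateContext' is the global pointer declared in
// the global data tables section below:
//
//   omptarget_nvptx_ThreadPrivateContext *ctx =
//       omptarget_nvptx_threadPrivateContext;
//   ctx->ScheduleType(gtid) = sched;
//   ctx->Chunk(gtid) = chunkSize;
//   ctx->NextLowerBound(gtid) = lb;
//   ctx->LoopUpperBound(gtid) = ub;
//   ctx->Stride(gtid) = st;
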
/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
  ALIGN(128) struct MemDataTy {
    volatile unsigned keys[OMP_STATE_COUNT];
  } MemData[MAX_SM];

  // Map a key to a slot index; masking with OMP_STATE_COUNT - 1 assumes that
  // OMP_STATE_COUNT is a power of two.
  INLINE static uint32_t hash(unsigned key) {
    return key & (OMP_STATE_COUNT - 1);
  }

public:
  INLINE void Release();
  INLINE const void *Acquire(const void *buf, size_t size);
};

////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////

extern DEVICE omptarget_nvptx_SimpleMemoryManager
    omptarget_nvptx_simpleMemoryManager;
extern DEVICE SHARED uint32_t usedMemIdx;
extern DEVICE SHARED uint32_t usedSlotIdx;
extern DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern DEVICE SHARED uint16_t threadLimit;
extern DEVICE SHARED uint16_t threadsInTeam;
extern DEVICE SHARED uint16_t nThreads;
extern DEVICE SHARED
    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

extern DEVICE SHARED uint32_t execution_param;
extern DEVICE SHARED void *ReductionScratchpadPtr;

////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////

typedef void *omptarget_nvptx_WorkFn;
extern volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;

////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);

////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////

#include "common/omptargeti.h"

#endif