1 //===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Parallel implementation in the GPU. Here is the pattern:
10 //
11 // while (not finished) {
12 //
13 // if (master) {
14 // sequential code, decide which par loop to do, or if finished
15 // __kmpc_kernel_prepare_parallel() // exec by master only
16 // }
17 // syncthreads // A
18 // __kmpc_kernel_parallel() // exec by all
19 // if (this thread is included in the parallel) {
20 // switch () for all parallel loops
21 // __kmpc_kernel_end_parallel() // exec only by threads in parallel
22 // }
23 //
24 //
25 // The reason we don't exec end_parallel for the threads not included
26 // in the parallel loop is that for each barrier in the parallel
27 // region, these non-included threads will cycle through the
28 // syncthread A. Thus they must preserve their current threadId that
29 // is larger than thread in team.
30 //
31 // To make a long story short...
32 //
33 //===----------------------------------------------------------------------===//
34
35 #include "common/omptarget.h"
36 #include "target_impl.h"
37
38 ////////////////////////////////////////////////////////////////////////////////
39 // support for parallel that goes parallel (1 static level only)
40 ////////////////////////////////////////////////////////////////////////////////
41
// Compute how many threads the next parallel region will use.
//
// NumThreadsClause - value of the num_threads() clause (0 if absent).
// NThreadsICV      - the nthreads-var ICV.
// ThreadLimit      - the thread-limit-var ICV (0 if unset).
//
// The clause, when present, overrides the ICV; the result is then clamped
// to the number of workers the team actually has (further limited by
// thread-limit). On Volta+ the result is rounded down to a warp multiple.
INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  // num_threads() takes precedence over the nthreads-var ICV.
  uint16_t ThreadsRequested =
      (NumThreadsClause != 0) ? NumThreadsClause : NThreadsICV;

  // Start from the workers this team provides, clamped by thread-limit
  // when one is set.
  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable)
    ThreadsAvailable = ThreadLimit;

  // Honor the request only when it asks for fewer threads than available.
  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads)
    NumThreads = ThreadsRequested;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region. Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  NumThreads = (NumThreads < WARPSIZE)
                   ? (uint16_t)1
                   : (uint16_t)(NumThreads & ~((uint16_t)WARPSIZE - 1));
#endif

  return NumThreads;
}
73
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");

  // Publish the outlined work function so the workers can pick it up in
  // __kmpc_kernel_parallel() after the next barrier.
  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master. The team master is
  // the first thread of the last warp. It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  // Defensive re-check for builds where the assert above is compiled out:
  // a nested parallel is executed sequentially, so simply bail out.
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  // Threads requested by a num_threads() clause for the upcoming region
  // (0 when no clause was given). Taken by reference so it can be reset
  // below after being consumed.
  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);

  if (NumThreadsClause != 0) {
    // Reset request to avoid propagating to successive #parallel
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set number of threads on work descriptor, and record the team size
  // that __kmpc_kernel_parallel() uses to decide which workers activate.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}
115
// All workers call this function. Deactivate those not needed.
// Fn - the outlined work function to execute.
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Work function and arguments for L1 parallel region, as published by
  // the master in __kmpc_kernel_prepare_parallel().
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads. Workers with an id at or beyond
  // threadsInTeam stay inactive and keep their current state.
  if (threadId < threadsInTeam) {
    // init this thread's level-1 task descriptor from the work descriptor
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // init private from int value
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)nThreads);

    isActive = true;
    // Reconverge the threads at the end of the parallel region to correctly
    // handle parallel levels.
    // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
    // warp. If only 1 thread is active, no need to reconverge the threads.
    // If we have the whole warp, reconverge all the threads in the warp before
    // actually trying to change the parallel level. Otherwise, parallel level
    // can be changed incorrectly because of threads divergence.
    bool IsActiveParallelRegion = threadsInTeam != 1;
    IncParallelLevel(IsActiveParallelRegion,
                     IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);
  }

  return isActive;
}
170
__kmpc_kernel_end_parallel()171 EXTERN void __kmpc_kernel_end_parallel() {
172 // pop stack
173 PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
174 ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
175
176 // Only the worker threads call this routine and the master warp
177 // never arrives here. Therefore, use the nvptx thread id.
178 int threadId = GetThreadIdInBlock();
179 omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
180 omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
181 threadId, currTaskDescr->GetPrevTaskDescr());
182
183 // Reconverge the threads at the end of the parallel region to correctly
184 // handle parallel levels.
185 // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
186 // warp. If only 1 thread is active, not need to reconverge the threads.
187 // If we have the whole warp, reconverge all the threads in the warp before
188 // actually trying to change the parallel level. Otherwise, parallel level can
189 // be changed incorrectly because of threads divergence.
190 bool IsActiveParallelRegion = threadsInTeam != 1;
191 DecParallelLevel(IsActiveParallelRegion,
192 IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);
193 }
194
195 ////////////////////////////////////////////////////////////////////////////////
196 // support for parallel that goes sequential
197 ////////////////////////////////////////////////////////////////////////////////
198
// Enter a serialized (sequentially executed) parallel region: bump the
// nesting level and push a fresh task descriptor in which this thread
// becomes thread 0 of a team of one.
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  // A serialized region is never "active"; it only increases the level.
  IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  // With an uninitialized runtime (lightweight SPMD mode) the level bump
  // above is all the bookkeeping there is.
  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // assume this is only called for nested parallel
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // unlike actual parallel, threads in the same team do not share
  // the workTaskDescr in this case and num threads is fixed to 1

  // get current task
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // Save loop state so the matching end_serialized_parallel can restore it.
  currTaskDescr->SaveLoopData();

  // allocate new task descriptor and copy value from current one, set prev to
  // it
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              "new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // tweak values for serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team
  newTaskDescr->ThreadId() = 0;

  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}
236
// Leave a serialized parallel region: drop the nesting level, pop and free
// the descriptor pushed by __kmpc_serialized_parallel, and restore the
// enclosing task's loop state.
EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  // With an uninitialized runtime there is no descriptor stack to unwind.
  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // pop stack
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // set new top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // free the descriptor allocated in __kmpc_serialized_parallel
  SafeFree(currTaskDescr, "new seq parallel task");
  // Restore the loop state saved when the serialized region was entered.
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}
260
// Report the current parallel nesting level for this warp. The per-warp
// counter packs an "active" flag into its high bits; mask it off so only
// the nesting depth is returned.
EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

  uint16_t Level = parallelLevel[GetWarpId()];
  return Level & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}
266
// This kmpc call returns the thread id across all teams. It's value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  // Query the execution mode once and reuse it for both lookups.
  bool IsSPMD = checkSPMDMode(loc);
  return GetOmpThreadId(GetLogicalThreadIdInBlock(IsSPMD), IsSPMD);
}
275
276 ////////////////////////////////////////////////////////////////////////////////
277 // push params
278 ////////////////////////////////////////////////////////////////////////////////
279
__kmpc_push_num_threads(kmp_Ident * loc,int32_t tid,int32_t num_threads)280 EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
281 int32_t num_threads) {
282 PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
283 ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
284 tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
285 omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
286 num_threads;
287 }
288
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.

EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  // Teams are fixed at kernel launch; this request must never reach device.
  ASSERT0(LT_FUSSY, 0,
          "should never have anything with new teams on device");
}
298
// proc_bind has no effect on the GPU: thread placement is fixed by the
// hardware, so the request is only logged and otherwise ignored.
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
                                  int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}
303