1 //===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of data sharing environments
10 //
11 //===----------------------------------------------------------------------===//
12 #include "common/omptarget.h"
13 #include "target_impl.h"
14
// Tell whether the calling thread is the team master.
//
// In SPMD execution mode the answer is always false. Otherwise the caller is
// the master exactly when its in-block thread id equals GetMasterThreadID().
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
  if (isSPMDExecutionMode)
    return false;
  return GetThreadIdInBlock() == GetMasterThreadID();
}
19
20 ////////////////////////////////////////////////////////////////////////////////
21 // Runtime functions for trunk data sharing scheme.
22 ////////////////////////////////////////////////////////////////////////////////
23
// Reset the per-warp data sharing stacks of the current team.
//
// For every warp index below DS_Max_Warp_Number, the warp's current slot is
// set to the team's preallocated slot and its stack pointer to the first byte
// of that slot's data area.
INLINE static void data_sharing_init_stack_common() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");

  omptarget_nvptx_TeamDescr &Team =
      omptarget_nvptx_threadPrivateContext->TeamContext();

  for (int Warp = 0; Warp < DS_Max_Warp_Number; ++Warp) {
    __kmpc_data_sharing_slot *Root = Team.GetPreallocatedSlotAddr(Warp);
    // An empty stack starts at the beginning of the root slot's payload.
    DataSharingState.StackPtr[Warp] = (void *)&Root->Data[0];
    DataSharingState.SlotPtr[Warp] = Root;
  }
}
35
// Set up the data sharing state at the start of a data sharing context.
//
// Intended to be called once per team, by the team master thread in non-SPMD
// mode, as part of kernel initialization. The function does two things:
//   1. points every warp's stack at its preallocated slot, and
//   2. initializes the shared argument list (omptarget_nvptx_globalArgs).
EXTERN void __kmpc_data_sharing_init_stack() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");

  // Step 1: per-warp stack pointers -> preallocated slots.
  data_sharing_init_stack_common();

  // Step 2: argument list used to hand shared variables to other threads.
  omptarget_nvptx_globalArgs.Init();
}
48
// SPMD-mode counterpart of __kmpc_data_sharing_init_stack.
//
// Intended to be called once at the start of a data sharing context (kernel
// initialization) in SPMD mode. Only the thread whose in-block id is 0 resets
// the per-warp stacks; every calling thread then executes
// __kmpc_impl_threadfence_block(). The shared argument list is not touched
// here.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");

  const bool IsFirstThread = (GetThreadIdInBlock() == 0);
  if (IsFirstThread) {
    // Point every warp's stack at its preallocated slot.
    data_sharing_init_stack_common();
  }

  __kmpc_impl_threadfence_block();
}
62
// Push a frame of PushSize bytes (rounded up to a multiple of 8) onto the
// data sharing stack of the calling warp and return the frame's start address.
//
// Only the thread whose in-block id is a multiple of WARPSIZE updates the
// stack bookkeeping. Afterwards the frame address is read from lane 0 with
// __kmpc_impl_shfl_sync, using the lane mask sampled on entry, so every
// participating lane returns the same pointer.
//
// If the rounded request does not fit between the current stack pointer and
// the end of the current slot, a new slot is obtained from SafeMalloc, linked
// after the current slot, and the frame is placed at the start of its data
// area. The new slot's data area is max(PushSize, DS_Worker_Warp_Slot_Size)
// bytes.
INLINE static void *data_sharing_push_stack_common(size_t PushSize) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Round the request up to the alignment so that later frames pushed after
  // this one start on an aligned address. Alignment is a power of two, so a
  // mask yields the same value as divide-then-multiply.
  const size_t Alignment = 8;
  PushSize = (PushSize + (Alignment - 1)) & ~(Alignment - 1);

  const unsigned Warp = GetWarpId();

  // Sample the active lanes before the lane-0-only section below; the same
  // mask is used for the broadcast at the end.
  const __kmpc_impl_lanemask_t ActiveLanes = __kmpc_impl_activemask();

  // Stays null on every lane except the one that manages the stack; the
  // broadcast below overwrites it.
  void *Frame = 0;

  if ((GetThreadIdInBlock() % WARPSIZE) == 0) {
    // References: updates below write straight into the shared state.
    __kmpc_data_sharing_slot *&CurSlot = DataSharingState.SlotPtr[Warp];
    void *&Top = DataSharingState.StackPtr[Warp];

    const uintptr_t FrameBegin = (uintptr_t)Top;
    const uintptr_t FrameEnd = FrameBegin + (uintptr_t)PushSize;
    const uintptr_t SlotEnd = (uintptr_t)CurSlot->DataEnd;

    if (FrameEnd <= SlotEnd) {
      // The frame fits in the current slot: it starts at the old stack
      // pointer, and the stack pointer moves past it.
      Frame = DataSharingState.FramePtr[Warp] = Top;
      Top = (void *)FrameEnd;
    } else {
      // Not enough room left: allocate a fresh slot that holds at least the
      // default worker-warp slot size.
      const size_t MinSlotBytes = DS_Worker_Warp_Slot_Size;
      const size_t SlotBytes = (MinSlotBytes > PushSize) ? MinSlotBytes
                                                         : PushSize;

      __kmpc_data_sharing_slot *Fresh =
          (__kmpc_data_sharing_slot *)SafeMalloc(
              sizeof(__kmpc_data_sharing_slot) + SlotBytes,
              "Global memory slot allocation.");

      // Remember where the previous slot's stack stood so a later pop can
      // restore it; this must happen before Top/CurSlot are redirected.
      Fresh->PrevSlotStackPtr = Top;
      Fresh->Prev = CurSlot;
      Fresh->Next = 0;
      Fresh->DataEnd = &Fresh->Data[0] + SlotBytes;

      // Append the new slot to the chain and make it current.
      CurSlot->Next = Fresh;
      CurSlot = Fresh;

      // The frame occupies the start of the new slot; the stack pointer is
      // the first free byte after it.
      Top = &Fresh->Data[0] + PushSize;
      Frame = DataSharingState.FramePtr[Warp] = &Fresh->Data[0];
    }
  }

  // Broadcast the frame address from lane 0, one 32-bit word at a time. The
  // second word is only exchanged when pointers are 8 bytes wide.
  int *Words = (int *)&Frame;
  Words[0] = __kmpc_impl_shfl_sync(ActiveLanes, Words[0], 0);
  if (sizeof(Frame) == 8)
    Words[1] = __kmpc_impl_shfl_sync(ActiveLanes, Words[1], 0);

  return Frame;
}
137
// Push one frame of DataSize bytes for the calling warp and return its start
// address. Unlike __kmpc_data_sharing_push_stack, the size is used as given
// (no per-lane multiplication) and no lane offset is added to the result.
//
// UseSharedMemory is accepted for interface compatibility but is not read.
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
                                                      int16_t UseSharedMemory) {
  void *Frame = data_sharing_push_stack_common(DataSize);
  return Frame;
}
142
// Push storage for globalized variables and return the calling thread's own
// DataSize-byte chunk inside the pushed frame.
//
// Frame size:
//   - DataSize when the runtime is uninitialized or the caller is the master
//     thread (non-SPMD mode only);
//   - WARPSIZE * DataSize otherwise, i.e. one chunk per warp lane.
//
// The returned address is the frame start plus GetLaneId() * DataSize.
//
// UseSharedMemory is accepted for interface compatibility but is not read by
// this implementation.
//
// TODO: change WARPSIZE to the number of active threads in the warp.
EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
                                            int16_t UseSharedMemory) {
  // Default to one chunk per lane; shrink to a single chunk for the cases
  // listed above. The `||` keeps isSPMDMode() from being queried while the
  // runtime is uninitialized.
  size_t PushSize = WARPSIZE * DataSize;
  if (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
    PushSize = DataSize;

  // Start of the frame shared by the warp.
  const uintptr_t WarpFrame =
      (uintptr_t)data_sharing_push_stack_common(PushSize);

  // Each lane gets its own chunk inside the frame.
  const uintptr_t LaneOffset = (uintptr_t)(GetLaneId() * DataSize);

  return (void *)(WarpFrame + LaneOffset);
}
168
// Pop the frame that starts at FrameStart from the calling warp's data
// sharing stack.
//
// Every caller first executes __kmpc_impl_threadfence_block(). After that only
// the thread whose in-block id is a multiple of WARPSIZE touches the stack
// state: it rewinds the stack pointer to FrameStart and, if that leaves the
// current slot empty and the slot has a predecessor, frees the slot and
// resumes the predecessor at the stack position recorded when the slot was
// created. A slot without a predecessor is never freed.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  __kmpc_impl_threadfence_block();

  // Only one thread per warp manages the stack.
  if (GetThreadIdInBlock() % WARPSIZE != 0)
    return;

  const unsigned Warp = GetWarpId();
  __kmpc_data_sharing_slot *&CurSlot = DataSharingState.SlotPtr[Warp];
  void *&Top = DataSharingState.StackPtr[Warp];

  // Drop the frame: the next push starts where this frame began.
  Top = FrameStart;

  // Nothing more to do while the current slot still holds older frames.
  if (Top != &CurSlot->Data[0])
    return;

  // The slot is empty, but a slot with no predecessor stays in place.
  if (!CurSlot->Prev)
    return;

  // Restore the predecessor's stack position before unlinking the slot.
  Top = CurSlot->PrevSlotStackPtr;

  // Step back to the predecessor, then release the slot hanging off it.
  CurSlot = CurSlot->Prev;
  SafeFree(CurSlot->Next, "Free slot.");
  CurSlot->Next = 0;
}
206
// Open a data sharing context that passes nArgs references to shared
// variables to one or more other threads.
//
// The shared argument list is asked to hold at least nArgs entries, and its
// storage is returned through *GlobalArgs for the caller to fill in.
//
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  omptarget_nvptx_globalArgs.EnsureSize(nArgs);

  // Fetch the list only after EnsureSize has run.
  void **ArgList = omptarget_nvptx_globalArgs.GetArgs();
  *GlobalArgs = ArgList;
}
216
// Close a data sharing context by de-initializing the shared argument list.
//
// Only the list of references is affected; the per-warp data sharing stacks
// that hold the variables themselves are not touched here.
//
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
  omptarget_nvptx_globalArgs.DeInit();
}
226
// Hand the current shared argument list to the caller through *GlobalArgs.
//
// Workers use this to obtain the references to the globalized variables; the
// entries are forwarded, in order, to the outlined parallel function.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  void **ArgList = omptarget_nvptx_globalArgs.GetArgs();
  *GlobalArgs = ArgList;
}
235
// Obtain the team's static memory frame for globalized variables in target,
// teams and distribute regions. The buffer itself is provided by the caller
// (allocated by the compiler).
//
//   isSPMDExecutionMode  non-zero when the kernel runs in SPMD mode.
//   buf, size            the statically allocated buffer and its size.
//   is_shared            non-zero: buf is used directly as the frame and the
//                        memory manager is bypassed.
//   frame                out-parameter receiving the frame address.
//
// SPMD mode: only the thread with in-block id 0 acquires the frame and stores
// to *frame; every thread then calls __kmpc_impl_syncthreads().
// Non-SPMD mode: must be called by the target master thread only, which
// acquires the frame and then calls __kmpc_impl_threadfence().
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared,
                                          const void **frame) {
  // Shared buffer: no bookkeeping needed.
  if (is_shared) {
    *frame = buf;
    return;
  }

  if (!isSPMDExecutionMode) {
    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
            "Must be called only in the target master thread.");
    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    __kmpc_impl_threadfence();
    return;
  }

  // SPMD: a single thread acquires, all threads synchronize.
  if (GetThreadIdInBlock() == 0)
    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
  __kmpc_impl_syncthreads();
}
260
// Give back the team static memory obtained with
// __kmpc_get_team_static_memory.
//
//   isSPMDExecutionMode  non-zero when the kernel runs in SPMD mode.
//   is_shared            non-zero: nothing was acquired, so nothing is
//                        released.
//
// SPMD mode: every thread calls __kmpc_impl_syncthreads() first; afterwards
// the thread with in-block id 0 releases the memory.
// Non-SPMD mode: must be called by the target master thread only, which calls
// __kmpc_impl_threadfence() and then releases the memory.
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared) {
  if (is_shared)
    return;

  if (!isSPMDExecutionMode) {
    __kmpc_impl_threadfence();
    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
            "Must be called only in the target master thread.");
    omptarget_nvptx_simpleMemoryManager.Release();
    return;
  }

  // SPMD: synchronize first, then a single thread releases.
  __kmpc_impl_syncthreads();
  if (GetThreadIdInBlock() == 0)
    omptarget_nvptx_simpleMemoryManager.Release();
}
277
278