//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <vector>

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b).
 * If the device kernel tries to access s1.p, a misaligned address error
 * occurs (as reported by the CUDA plugin). By aligning the begin address down
 * to a multiple of 8 and extending the size of the allocated chunk
 * accordingly, the chunk on the device will start at 0x200 with the padding
 * (4 bytes), then &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy
 * the alignment requirements.
 */
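// A quick illustration of how the Alignment constant below is applied in
// targetDataBegin and targetDataEnd: for the example above, HstPtrBegin =
// &s1.b = 0x104, so
//   padding = (int64_t)0x104 % 8 == 4
// and the mapped region is widened to start at 0x100 with its size increased
// by 4 bytes, restoring 8-alignment of the transferred chunk.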
static const int64_t Alignment = 8;

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (HostEntriesBeginToTransTableTy::iterator entry_it =
           PM->HostEntriesBeginToTransTable.begin();
       entry_it != PM->HostEntriesBeginToTransTable.end(); ++entry_it) {
    TranslationTable *TransTable = &entry_it->second;
    if (TransTable->HostTable.EntriesBegin ==
        TransTable->HostTable.EntriesEnd) {
      // No host entry so no need to proceed
      continue;
    }
    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }
    // 2) load image into the target table.
    __tgt_target_table *TargetTable =
        TransTable->TargetsTable[device_id] = Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n", device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
           "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc = target(device_id, ctor, 0, nullptr, nullptr, nullptr,
                          nullptr, nullptr, nullptr, 1, 1, true /*team*/);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

// Check whether a device has been initialized, global ctors have been
// executed and global data has been mapped; do so if not already done.
int CheckDeviceAndCtors(int64_t device_id) {
  // Is device ready?
  if (!device_is_ready(device_id)) {
    REPORT("Device %" PRId64 " is not ready.\n", device_id);
    return OFFLOAD_FAIL;
  }

  // Get device info.
  DeviceTy &Device = PM->Devices[device_id];

  // Check whether global data has been mapped for this device
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", device_id);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

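/// Extract the parent index encoded in a map type's MEMBER_OF bits: the upper
/// 16 bits hold the 1-based position of the parent entry in the argument
/// list, so a return value of -1 means the entry has no parent.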
static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

/// Call the user-defined mapper function followed by the appropriate
/// target_data_* function (target_data_{begin,end,update}).
int targetDataMapper(DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type, void *arg_mapper,
                     TargetDataFuncPtrTy target_data_function) {
214 DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
215
216 // The mapper function fills up Components.
217 MapperComponentsTy MapperComponents;
218 MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
219 (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size,
220 arg_type);
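  // For reference, a sketch of the kind of user code that reaches this path
  // (illustrative only; any user-defined mapper behaves similarly):
  //
  //   struct S { int len; double *buf; };
  //   #pragma omp declare mapper(struct S s) map(s.len, s.buf[0:s.len])
  //   ...
  //   #pragma omp target enter data map(to: obj) // obj has type struct S
  //
  // Clang emits a mapper function for the declare mapper directive, and the
  // call above fills MapperComponents with one component per mapped piece.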

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // target_data_* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C =
        MapperComponents
            .Components[target_data_function == targetDataEnd ? I : E - I - 1];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
  }

  int rc = target_data_function(Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                /*arg_names*/ nullptr, /*arg_mappers*/ nullptr,
                                /*__tgt_async_info*/ nullptr);

  return rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base,
                    void **args, int64_t *arg_sizes, int64_t *arg_types,
                    map_var_info_t *arg_names, void **arg_mappers,
                    __tgt_async_info *async_info_ptr) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i],
                                arg_types[i], arg_mappers[i], targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n", padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    bool IsNew, Pointer_IsNew;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device-side copy of the data when a close map
    // modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack; we could revise the scheme in the future.
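    // (For example, "#pragma omp target data map(tofrom: a[0:N])" arrives
    // here through __tgt_target_data_begin with no TARGET_PARAM bit set on
    // any argument.)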
    bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
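      // A sketch of that "declare target link" case (illustrative only):
      //
      //   int *gp;
      //   #pragma omp declare target link(gp)
      //
      // A use of gp inside a target region yields a PTR_AND_OBJ entry whose
      // pointer may not have been allocated yet when it is handled here.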
      PointerTgtPtrBegin = Device.getOrAllocTgtPtr(
          HstPtrBase, HstPtrBase, sizeof(void *), HstPtrName, Pointer_IsNew,
          IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier,
          HasPresentModifier);
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_IsNew ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      UpdateRef = true; // subsequently update ref count of pointee
    }

    void *TgtPtrBegin = Device.getOrAllocTgtPtr(
        HstPtrBegin, HstPtrBase, data_size, HstPtrName, IsNew, IsHostPtr,
        IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier);
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAllocTgtPtr() returning NULL is not an error.
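    // (For example, "#pragma omp target data map(to: p[0:0])" produces a
    // zero-length array section with data_size == 0.)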
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
       (IsNew ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
      bool copy = false;
      if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
          HasCloseModifier) {
        if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
          copy = true;
        } else if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
                   !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
          // Copy data only if the "parent" struct has RefCount==1.
          // If this is a PTR_AND_OBJ entry, the OBJ is not part of the struct,
          // so exclude it from this check.
          int32_t parent_idx = getParentIndex(arg_types[i]);
          uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
          assert(parent_rc > 0 && "parent struct not found");
          if (parent_rc == 1) {
            copy = true;
          }
        }
      }

      if (copy && !IsHostPtr) {
        DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
           data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
        int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, data_size,
                                   async_info_ptr);
        if (rt != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
         DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
      int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                 sizeof(void *), async_info_ptr);
      if (rt != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        return OFFLOAD_FAIL;
      }
      // create shadow pointers for this entry
      Device.ShadowMtx.lock();
      Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
          HstPtrBase, PointerTgtPtrBegin, TgtPtrBase};
      Device.ShadowMtx.unlock();
    }
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains the information needed to deallocate a target
/// pointer, i.e., to call the function \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it is forced to be removed from the map table
  bool ForceDelete;
  /// Whether it has \p close modifier
  bool HasCloseModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool ForceDelete,
                    bool HasCloseModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete),
        HasCloseModifier(HasCloseModifier) {}
};
} // namespace

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases,
                  void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
                  map_var_info_t *ArgNames, void **ArgMappers,
                  __tgt_async_info *AsyncInfo) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr clause; it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      Ret = targetDataMapper(Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgMappers[I], targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                     (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    void *TgtPtrBegin = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, IsHostPtr, !IsImplicit);
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is
        // also guaranteed to be present at the end.
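        // (For example, an "omp target exit data map(present, from: p[0:N])"
        // where p was never mapped ends up on this error path.)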
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    bool DelEntry = IsLast || ForceDelete;

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        bool CopyMember = false;
        if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
            HasCloseModifier) {
          if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
              !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
            // Copy data only if the "parent" struct has RefCount==1.
            int32_t ParentIdx = getParentIndex(ArgTypes[I]);
            uint64_t ParentRC = Device.getMapEntryRefCnt(Args[ParentIdx]);
            assert(ParentRC > 0 && "parent struct not found");
            if (ParentRC == 1)
              CopyMember = true;
          }
        }

        if ((DelEntry || Always || CopyMember) &&
            !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
              TgtPtrBegin == HstPtrBegin)) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
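      // (For example, after "map(from: s)" where s has a pointer member p,
      // the bytes copied back for s.p hold a device address; the shadow entry
      // restores the original host value of p.)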
      uintptr_t LB = (uintptr_t)HstPtrBegin;
      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
           Itr != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void **)Itr->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t)ShadowHstPtrAddr < LB) {
          ++Itr;
          continue;
        }
        if ((uintptr_t)ShadowHstPtrAddr >= UB)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
      }
      Device.ShadowMtx.unlock();

      // Add pointer to the buffer for later deallocation
      if (DelEntry)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, ForceDelete,
                                    HasCloseModifier);
    }
  }

  // We need to synchronize before deallocating data.
  // If AsyncInfo is nullptr, the previous data transfer (if any) was
  // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is
  // nullptr, no data transfer has happened, because as soon as one does,
  // AsyncInfo->Queue becomes non-null; so again, we don't need to
  // synchronize.
  if (AsyncInfo && AsyncInfo->Queue) {
    Ret = Device.synchronize(AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to synchronize device.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Deallocate target pointer
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.ForceDelete, Info.HasCloseModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType) {
  bool IsLast, IsHostPtr;
  void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
                                            IsHostPtr, /*MustContain=*/true);
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      TgtPtrBegin == HstPtrBegin) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      *ShadowHstPtrAddr = IT->second.HstPtrVal;
    }
    Device.ShadowMtx.unlock();
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
                              sizeof(void *), nullptr);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        Device.ShadowMtx.unlock();
        return OFFLOAD_FAIL;
      }
    }
    Device.ShadowMtx.unlock();
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(DeviceTy &Device, void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize,
                                   uint64_t Offset) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
752 DP("Transfer of non-contiguous : host ptr %lx offset %ld len %ld\n",
753 (uint64_t)Ptr, Offset, Size);
    Ret = targetDataContiguous(Device, ArgsBase, Ptr, Size, ArgType);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
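  // A small worked example (assumed layout, strides in bytes): for a
  // DimSize == 2 descriptor such as
  //   NonContig[0] = {Offset=0, Count=4, Stride=8} // outer dimension
  //   NonContig[1] = {Offset=0, Count=8, Stride=1} // inner dimension
  // the inner dimension satisfies Count * Stride == 8 == NonContig[0].Stride,
  // i.e. consecutive outer iterations are adjacent in memory, so RemovedDim
  // becomes 1 and the caller can treat one fewer dimension as non-contiguous.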
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}

/// Internal function to pass data to/from the target.
// async_info_ptr is currently unused, added here so targetDataUpdate has the
// same signature as targetDataBegin and targetDataEnd.
int targetDataUpdate(DeviceTy &Device, int32_t ArgNum, void **ArgsBase,
                     void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
                     map_var_info_t *ArgNames, void **ArgMappers,
                     __tgt_async_info *AsyncInfoPtr) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      int Ret = targetDataMapper(Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgMappers[I], targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
    } else {
      Ret = targetDataContiguous(Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I]);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
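/// A mapping is treated as a lambda capture when it carries the PTR_AND_OBJ,
/// LITERAL and IMPLICIT map-type bits all at once (such entries are also
/// MEMBER_OF the parent lambda, as asserted in processDataBefore below).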
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount in one thread but doing offloading in another
/// thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
880 DP("loop trip count is %lu.\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };


  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p __tgt_async_info object
  __tgt_async_info *AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
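  // For example (illustrative sizes): three first-private arguments of 8, 64
  // and 512 bytes all fall under the threshold and travel in one packed
  // buffer with a single submitData call, whereas a 2048-byte first-private
  // array is allocated and transferred individually.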
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, __tgt_async_info *AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
962 DP("Copying data to device failed, failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet both of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e., it needs to be mapped to the
      //    target device).
      // We pack all such arguments and transfer them all at once to reduce
      // the number of data transfers. We do not take non-first-private
      // arguments, i.e., private arguments that don't need to be mapped to
      // the target device, into account because data allocation can be very
      // efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to target device, transferring (first-)private
/// variables.
int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
                      void **ArgBases, void **Args, int64_t *ArgSizes,
                      int64_t *ArgTypes, map_var_info_t *ArgNames,
                      void **ArgMappers, std::vector<void *> &TgtArgs,
                      std::vector<ptrdiff_t> &TgtOffsets,
                      PrivateArgumentManagerTy &PrivateArgumentManager,
                      __tgt_async_info *AsyncInfo) {
  DeviceTy &Device = PM->Devices[DeviceId];
  int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
                            ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *PointerTgtPtrBegin = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, false, IsHostPtr);
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
            TgtPtrBegin == HstPtrBegin) {
1105 DP("Unified memory is active, no need to map lambda captured"
1106 "variable (" DPxMOD ")\n",
1107 DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      // Can be marked for optimization if the next argument(s) do(es) not
      // depend on this one.
      const bool IsFirstPrivate =
          (I >= ArgNum - 1 || !(ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                          false, IsHostPtr);
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to host if needed and deallocating target memory of (first-)private
/// variables.
int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
                     void **ArgBases, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers,
                     PrivateArgumentManagerTy &PrivateArgumentManager,
                     __tgt_async_info *AsyncInfo) {
  DeviceTy &Device = PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
                          ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
} // namespace

/// Performs the same actions as data_begin in case arg_num is non-zero and
/// initiates the run of the offloaded region on the target platform; if
/// arg_num is non-zero, it also performs the same actions as data_update and
/// data_end above after the region execution is done. This function returns
/// 0 if it was able to transfer the execution to a target and an integer
/// different from zero otherwise.
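/// (This is the worker behind the __tgt_target* entry points that
/// Clang-generated code invokes for "#pragma omp target" regions; it is also
/// used by InitLibrary above to run pending ctors.)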
int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
           void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, int IsTeamConstruct) {
  DeviceTy &Device = PM->Devices[DeviceId];

  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  __tgt_async_info AsyncInfo;

  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, &AsyncInfo);

  // Process data, such as data mapping, before launching the kernel
  int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args,
                              ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                              TgtOffsets, PrivateArgumentManager, &AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to process data before launching the kernel.\n");
    return OFFLOAD_FAIL;
  }

  // Get loop trip count
  uint64_t LoopTripCount = getLoopTripCount(DeviceId);

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  if (IsTeamConstruct)
    Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                               TgtArgs.size(), TeamNum, ThreadLimit,
                               LoopTripCount, &AsyncInfo);
  else
    Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                           TgtArgs.size(), &AsyncInfo);

  if (Ret != OFFLOAD_SUCCESS) {
1268 REPORT("Executing target region abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Transfer data back and deallocate target memory for (first-)private
  // variables
  Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes,
                         ArgTypes, ArgNames, ArgMappers, PrivateArgumentManager,
                         &AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to process data after launching the kernel.\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}