//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target-specific functions
//
//===----------------------------------------------------------------------===//
#ifndef _TARGET_IMPL_H_
#define _TARGET_IMPL_H_

#include <assert.h>
#include <cuda.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#include "nvptx_interface.h"

#define DEVICE __device__
#define INLINE __forceinline__ DEVICE
#define NOINLINE __noinline__ DEVICE
#define SHARED __shared__
#define ALIGN(N) __align__(N)

////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// The following definition must match the absolute limit hardwired in the
// host RTL: the maximum number of threads per team.
#define MAX_THREADS_PER_TEAM 1024

#define WARPSIZE 32

// Maximum number of preallocated arguments to an outlined parallel/simd
// function. Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20

// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#else
#define OMP_STATE_COUNT 16
#endif

#if !defined(MAX_SM)
#if __CUDA_ARCH__ >= 900
#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
#elif __CUDA_ARCH__ >= 800
// The GA100 design has a maximum of 128 SMs, but the A100 product only has
// 108 SMs. The GA102 design has a maximum of 84 SMs.
#define MAX_SM 108
#elif __CUDA_ARCH__ >= 700
#define MAX_SM 84
#elif __CUDA_ARCH__ >= 600
#define MAX_SM 56
#else
#define MAX_SM 16
#endif
#endif
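
// Illustrative arithmetic (assuming one pool of OMP_STATE_COUNT objects per
// SM, as the comment above describes): an sm_80 build with the defaults above
// statically reserves OMP_STATE_COUNT * MAX_SM = 32 * 108 = 3456 state
// objects in global memory.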

#define OMP_ACTIVE_PARALLEL_LEVEL 128

// Data sharing related quantities; these need to match what is used in the
// compiler.
enum DATA_SHARING_SIZES {
  // The maximum number of workers in a kernel.
  DS_Max_Worker_Threads = 992,
  // The size reserved for data in a shared memory slot.
  DS_Slot_Size = 256,
  // The slot size that should be reserved for a working warp.
  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
  // The maximum number of warps in use.
  DS_Max_Warp_Number = 32,
  // The size of the preallocated shared memory buffer per team.
  DS_Shared_Memory_Size = 128,
};
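
// Worked example (illustrative only): with WARPSIZE == 32 and
// DS_Slot_Size == 256, DS_Worker_Warp_Slot_Size evaluates to
// 32 * 256 = 8192 bytes reserved per working warp.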

INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}

INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  uint64_t val;
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}
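
// Example (illustrative, not part of the interface): pack/unpack round-trip
// the two 32-bit halves of a 64-bit value, lo in the lower half:
//
//   uint32_t lo, hi;
//   __kmpc_impl_unpack(__kmpc_impl_pack(0x1234u, 0x5678u), lo, hi);
//   // Now lo == 0x1234u and hi == 0x5678u.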

static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes =
    UINT32_C(0xffffffff);

INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
  return res;
}

INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
  return res;
}
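
// Example (illustrative): for the lane with lane id 5,
// __kmpc_impl_lanemask_lt() returns 0x1f (lanes 0-4) and
// __kmpc_impl_lanemask_gt() returns 0xffffffc0 (lanes 6-31). A common use is
// computing a lane's rank among the currently active lanes:
//
//   uint32_t Rank = __kmpc_impl_popc(__kmpc_impl_activemask() &
//                                    __kmpc_impl_lanemask_lt());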

INLINE uint32_t __kmpc_impl_smid() {
  uint32_t id;
  asm("mov.u32 %0, %%smid;" : "=r"(id));
  return id;
}

INLINE double __kmpc_impl_get_wtick() {
  // Timer precision is 1ns.
  return ((double)1E-9);
}

INLINE double __kmpc_impl_get_wtime() {
  unsigned long long nsecs;
  asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
  return (double)nsecs * __kmpc_impl_get_wtick();
}
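
// Sketch (illustrative only): timing a region of device code with the
// wall-clock facilities above.
//
//   double Start = __kmpc_impl_get_wtime();
//   // ... work ...
//   double ElapsedSeconds = __kmpc_impl_get_wtime() - Start;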

INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }

INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }

template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
  return min(x, y);
}

#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, something is wrong with the CUDA installation.
#endif

// In CUDA 9.0, __ballot(1) from CUDA 8.0 was replaced with __activemask().

INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
#if CUDA_VERSION >= 9000
  return __activemask();
#else
  return __ballot(1);
#endif
}
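
// Note that __kmpc_impl_activemask() returns the mask of lanes that are
// converged at the call site, so it is typically captured once before further
// divergence. Illustrative use:
//
//   __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
//   // ... divergent work ...
//   __kmpc_impl_syncwarp(Active); // reconverge the same set of lanes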

// In CUDA 9.0, the *_sync() versions take an extra 'mask' argument.

INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                     int32_t SrcLane) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(Mask, Var, SrcLane);
#else
  return __shfl(Var, SrcLane);
#endif // CUDA_VERSION
}

INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                          int32_t Var, uint32_t Delta,
                                          int32_t Width) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(Mask, Var, Delta, Width);
#else
  return __shfl_down(Var, Delta, Width);
#endif // CUDA_VERSION
}
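
// Sketch (illustrative, hypothetical helper; assumes all 32 lanes of the warp
// are active): a warp-wide tree sum built on __kmpc_impl_shfl_down_sync.
//
//   INLINE int32_t WarpSumExample(int32_t Val) {
//     for (uint32_t Delta = WARPSIZE / 2; Delta > 0; Delta /= 2)
//       Val += __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, Val, Delta,
//                                         WARPSIZE);
//     return Val; // lane 0 now holds the sum over all 32 lanes
//   }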

INLINE void __kmpc_impl_syncthreads() {
  // Use the original __syncthreads if compiled by nvcc or clang >= 9.0.
#if !defined(__clang__) || __clang_major__ >= 9
  __syncthreads();
#else
  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
#endif // __clang__
}

INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#if CUDA_VERSION >= 9000
  __syncwarp(Mask);
#else
  // In CUDA < 9.0 there is no need to sync threads within a warp.
#endif // CUDA_VERSION
}

// NVPTX-specific kernel initialization.
INLINE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
}

// Barrier until num_threads arrive.
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
  // The named barrier for active parallel threads of a team in an L1 parallel
  // region to synchronize with each other.
  int barrier = 1;
  asm volatile("bar.sync %0, %1;"
               :
               : "r"(barrier), "r"(num_threads)
               : "memory");
}
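
// Note: bar.sync with an explicit thread count requires num_threads to be a
// multiple of the warp size, and barrier id 1 is used here to avoid colliding
// with the default barrier (id 0) used by __syncthreads(). Illustrative call
// for two active warps:
//
//   __kmpc_impl_named_sync(2 * WARPSIZE);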

INLINE void __kmpc_impl_threadfence(void) { __threadfence(); }
INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); }
INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); }

// Calls to the NVPTX layer (assuming 1D layout).
INLINE int GetThreadIdInBlock() { return threadIdx.x; }
INLINE int GetBlockIdInKernel() { return blockIdx.x; }
INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
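
// Example (illustrative): with WARPSIZE == 32, thread 70 of a block has
// GetWarpId() == 70 / 32 == 2 and GetLaneId() == (70 & 31) == 6.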

// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);

// Memory
INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
INLINE void __kmpc_impl_free(void *x) { free(x); }
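
// Example (illustrative): these wrap the device-side heap, so memory
// allocated on the device must also be freed on the device.
//
//   void *Buf = __kmpc_impl_malloc(MAX_SHARED_ARGS * sizeof(void *));
//   // ... use Buf ...
//   __kmpc_impl_free(Buf);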

#endif