• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // REQUIRES: nvptx-registered-target
2 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
3 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
4 
5 #define __device__ __attribute__((device))
6 #define __global__ __attribute__((global))
7 #define __shared__ __attribute__((shared))
8 #define __constant__ __attribute__((constant))
9 
read_tid()10 __device__ int read_tid() {
11 
12 // CHECK: call i32 @llvm.ptx.read.tid.x()
13 // CHECK: call i32 @llvm.ptx.read.tid.y()
14 // CHECK: call i32 @llvm.ptx.read.tid.z()
15 // CHECK: call i32 @llvm.ptx.read.tid.w()
16 
17   int x = __builtin_ptx_read_tid_x();
18   int y = __builtin_ptx_read_tid_y();
19   int z = __builtin_ptx_read_tid_z();
20   int w = __builtin_ptx_read_tid_w();
21 
22   return x + y + z + w;
23 
24 }
25 
read_ntid()26 __device__ int read_ntid() {
27 
28 // CHECK: call i32 @llvm.ptx.read.ntid.x()
29 // CHECK: call i32 @llvm.ptx.read.ntid.y()
30 // CHECK: call i32 @llvm.ptx.read.ntid.z()
31 // CHECK: call i32 @llvm.ptx.read.ntid.w()
32 
33   int x = __builtin_ptx_read_ntid_x();
34   int y = __builtin_ptx_read_ntid_y();
35   int z = __builtin_ptx_read_ntid_z();
36   int w = __builtin_ptx_read_ntid_w();
37 
38   return x + y + z + w;
39 
40 }
41 
read_ctaid()42 __device__ int read_ctaid() {
43 
44 // CHECK: call i32 @llvm.ptx.read.ctaid.x()
45 // CHECK: call i32 @llvm.ptx.read.ctaid.y()
46 // CHECK: call i32 @llvm.ptx.read.ctaid.z()
47 // CHECK: call i32 @llvm.ptx.read.ctaid.w()
48 
49   int x = __builtin_ptx_read_ctaid_x();
50   int y = __builtin_ptx_read_ctaid_y();
51   int z = __builtin_ptx_read_ctaid_z();
52   int w = __builtin_ptx_read_ctaid_w();
53 
54   return x + y + z + w;
55 
56 }
57 
read_nctaid()58 __device__ int read_nctaid() {
59 
60 // CHECK: call i32 @llvm.ptx.read.nctaid.x()
61 // CHECK: call i32 @llvm.ptx.read.nctaid.y()
62 // CHECK: call i32 @llvm.ptx.read.nctaid.z()
63 // CHECK: call i32 @llvm.ptx.read.nctaid.w()
64 
65   int x = __builtin_ptx_read_nctaid_x();
66   int y = __builtin_ptx_read_nctaid_y();
67   int z = __builtin_ptx_read_nctaid_z();
68   int w = __builtin_ptx_read_nctaid_w();
69 
70   return x + y + z + w;
71 
72 }
73 
read_ids()74 __device__ int read_ids() {
75 
76 // CHECK: call i32 @llvm.ptx.read.laneid()
77 // CHECK: call i32 @llvm.ptx.read.warpid()
78 // CHECK: call i32 @llvm.ptx.read.nwarpid()
79 // CHECK: call i32 @llvm.ptx.read.smid()
80 // CHECK: call i32 @llvm.ptx.read.nsmid()
81 // CHECK: call i32 @llvm.ptx.read.gridid()
82 
83   int a = __builtin_ptx_read_laneid();
84   int b = __builtin_ptx_read_warpid();
85   int c = __builtin_ptx_read_nwarpid();
86   int d = __builtin_ptx_read_smid();
87   int e = __builtin_ptx_read_nsmid();
88   int f = __builtin_ptx_read_gridid();
89 
90   return a + b + c + d + e + f;
91 
92 }
93 
read_lanemasks()94 __device__ int read_lanemasks() {
95 
96 // CHECK: call i32 @llvm.ptx.read.lanemask.eq()
97 // CHECK: call i32 @llvm.ptx.read.lanemask.le()
98 // CHECK: call i32 @llvm.ptx.read.lanemask.lt()
99 // CHECK: call i32 @llvm.ptx.read.lanemask.ge()
100 // CHECK: call i32 @llvm.ptx.read.lanemask.gt()
101 
102   int a = __builtin_ptx_read_lanemask_eq();
103   int b = __builtin_ptx_read_lanemask_le();
104   int c = __builtin_ptx_read_lanemask_lt();
105   int d = __builtin_ptx_read_lanemask_ge();
106   int e = __builtin_ptx_read_lanemask_gt();
107 
108   return a + b + c + d + e;
109 
110 }
111 
read_clocks()112 __device__ long long read_clocks() {
113 
114 // CHECK: call i32 @llvm.ptx.read.clock()
115 // CHECK: call i64 @llvm.ptx.read.clock64()
116 
117   int a = __builtin_ptx_read_clock();
118   long long b = __builtin_ptx_read_clock64();
119 
120   return a + b;
121 }
122 
read_pms()123 __device__ int read_pms() {
124 
125 // CHECK: call i32 @llvm.ptx.read.pm0()
126 // CHECK: call i32 @llvm.ptx.read.pm1()
127 // CHECK: call i32 @llvm.ptx.read.pm2()
128 // CHECK: call i32 @llvm.ptx.read.pm3()
129 
130   int a = __builtin_ptx_read_pm0();
131   int b = __builtin_ptx_read_pm1();
132   int c = __builtin_ptx_read_pm2();
133   int d = __builtin_ptx_read_pm3();
134 
135   return a + b + c + d;
136 
137 }
138 
sync()139 __device__ void sync() {
140 
141 // CHECK: call void @llvm.ptx.bar.sync(i32 0)
142 
143   __builtin_ptx_bar_sync(0);
144 
145 }
146 
147 
148 // NVVM intrinsics
149 
150 // The idea is not to test all intrinsics, just that Clang is recognizing the
151 // builtins defined in BuiltinsNVPTX.def
nvvm_math(float f1,float f2,double d1,double d2)152 __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
153 // CHECK: call float @llvm.nvvm.fmax.f
154   float t1 = __nvvm_fmax_f(f1, f2);
155 // CHECK: call float @llvm.nvvm.fmin.f
156   float t2 = __nvvm_fmin_f(f1, f2);
157 // CHECK: call float @llvm.nvvm.sqrt.rn.f
158   float t3 = __nvvm_sqrt_rn_f(f1);
159 // CHECK: call float @llvm.nvvm.rcp.rn.f
160   float t4 = __nvvm_rcp_rn_f(f2);
161 // CHECK: call float @llvm.nvvm.add.rn.f
162   float t5 = __nvvm_add_rn_f(f1, f2);
163 
164 // CHECK: call double @llvm.nvvm.fmax.d
165   double td1 = __nvvm_fmax_d(d1, d2);
166 // CHECK: call double @llvm.nvvm.fmin.d
167   double td2 = __nvvm_fmin_d(d1, d2);
168 // CHECK: call double @llvm.nvvm.sqrt.rn.d
169   double td3 = __nvvm_sqrt_rn_d(d1);
170 // CHECK: call double @llvm.nvvm.rcp.rn.d
171   double td4 = __nvvm_rcp_rn_d(d2);
172 
173 // CHECK: call void @llvm.nvvm.membar.cta()
174   __nvvm_membar_cta();
175 // CHECK: call void @llvm.nvvm.membar.gl()
176   __nvvm_membar_gl();
177 // CHECK: call void @llvm.nvvm.membar.sys()
178   __nvvm_membar_sys();
179 // CHECK: call void @llvm.nvvm.barrier0()
180   __nvvm_bar0();
181 }
182 
183 __device__ int di;
184 __shared__ int si;
185 __device__ long dl;
186 __shared__ long sl;
187 __device__ long long dll;
188 __shared__ long long sll;
189 
190 // Check for atomic intrinsics
191 // CHECK-LABEL: nvvm_atom
nvvm_atom(float * fp,float f,int * ip,int i,long * lp,long l,long long * llp,long long ll)192 __device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
193                           long long *llp, long long ll) {
194   // CHECK: atomicrmw add
195   __nvvm_atom_add_gen_i(ip, i);
196   // CHECK: atomicrmw add
197   __nvvm_atom_add_gen_l(&dl, l);
198   // CHECK: atomicrmw add
199   __nvvm_atom_add_gen_ll(&sll, ll);
200 
201   // CHECK: atomicrmw sub
202   __nvvm_atom_sub_gen_i(ip, i);
203   // CHECK: atomicrmw sub
204   __nvvm_atom_sub_gen_l(&dl, l);
205   // CHECK: atomicrmw sub
206   __nvvm_atom_sub_gen_ll(&sll, ll);
207 
208   // CHECK: atomicrmw and
209   __nvvm_atom_and_gen_i(ip, i);
210   // CHECK: atomicrmw and
211   __nvvm_atom_and_gen_l(&dl, l);
212   // CHECK: atomicrmw and
213   __nvvm_atom_and_gen_ll(&sll, ll);
214 
215   // CHECK: atomicrmw or
216   __nvvm_atom_or_gen_i(ip, i);
217   // CHECK: atomicrmw or
218   __nvvm_atom_or_gen_l(&dl, l);
219   // CHECK: atomicrmw or
220   __nvvm_atom_or_gen_ll(&sll, ll);
221 
222   // CHECK: atomicrmw xor
223   __nvvm_atom_xor_gen_i(ip, i);
224   // CHECK: atomicrmw xor
225   __nvvm_atom_xor_gen_l(&dl, l);
226   // CHECK: atomicrmw xor
227   __nvvm_atom_xor_gen_ll(&sll, ll);
228 
229   // CHECK: atomicrmw xchg
230   __nvvm_atom_xchg_gen_i(ip, i);
231   // CHECK: atomicrmw xchg
232   __nvvm_atom_xchg_gen_l(&dl, l);
233   // CHECK: atomicrmw xchg
234   __nvvm_atom_xchg_gen_ll(&sll, ll);
235 
236   // CHECK: atomicrmw max i32*
237   __nvvm_atom_max_gen_i(ip, i);
238   // CHECK: atomicrmw umax i32*
239   __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
240   // CHECK: atomicrmw max
241   __nvvm_atom_max_gen_l(&dl, l);
242   // CHECK: atomicrmw umax
243   __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
244   // CHECK: atomicrmw max i64*
245   __nvvm_atom_max_gen_ll(&sll, ll);
246   // CHECK: atomicrmw umax i64*
247   __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);
248 
249   // CHECK: atomicrmw min i32*
250   __nvvm_atom_min_gen_i(ip, i);
251   // CHECK: atomicrmw umin i32*
252   __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
253   // CHECK: atomicrmw min
254   __nvvm_atom_min_gen_l(&dl, l);
255   // CHECK: atomicrmw umin
256   __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
257   // CHECK: atomicrmw min i64*
258   __nvvm_atom_min_gen_ll(&sll, ll);
259   // CHECK: atomicrmw umin i64*
260   __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
261 
262   // CHECK: cmpxchg
263   // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
264   __nvvm_atom_cas_gen_i(ip, 0, i);
265   // CHECK: cmpxchg
266   // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
267   __nvvm_atom_cas_gen_l(&dl, 0, l);
268   // CHECK: cmpxchg
269   // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
270   __nvvm_atom_cas_gen_ll(&sll, 0, ll);
271 
272   // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
273   __nvvm_atom_add_gen_f(fp, f);
274 
275   // CHECK: ret
276 }
277