• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
10 #define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
11 
12 #include "src/__support/common.h"
13 
14 #include <stdint.h>
15 
16 namespace LIBC_NAMESPACE {
17 namespace gpu {
18 
/// Type aliases to the address spaces used by the NVPTX backend.
///
/// Each alias applies one of clang's OpenCL address-space attributes to the
/// type \p T, so `Local<int>` spells "an int qualified with
/// [[clang::opencl_local]]" without repeating the attribute at every use.

/// \p T qualified with the [[clang::opencl_private]] address space.
template <typename T> using Private = [[clang::opencl_private]] T;
/// \p T qualified with the [[clang::opencl_constant]] address space.
template <typename T> using Constant = [[clang::opencl_constant]] T;
/// \p T qualified with the [[clang::opencl_local]] address space.
template <typename T> using Local = [[clang::opencl_local]] T;
/// \p T qualified with the [[clang::opencl_global]] address space.
template <typename T> using Global = [[clang::opencl_global]] T;
24 
25 /// Returns the number of CUDA blocks in the 'x' dimension.
get_num_blocks_x()26 LIBC_INLINE uint32_t get_num_blocks_x() {
27   return __nvvm_read_ptx_sreg_nctaid_x();
28 }
29 
30 /// Returns the number of CUDA blocks in the 'y' dimension.
get_num_blocks_y()31 LIBC_INLINE uint32_t get_num_blocks_y() {
32   return __nvvm_read_ptx_sreg_nctaid_y();
33 }
34 
35 /// Returns the number of CUDA blocks in the 'z' dimension.
get_num_blocks_z()36 LIBC_INLINE uint32_t get_num_blocks_z() {
37   return __nvvm_read_ptx_sreg_nctaid_z();
38 }
39 
40 /// Returns the total number of CUDA blocks.
get_num_blocks()41 LIBC_INLINE uint64_t get_num_blocks() {
42   return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
43 }
44 
45 /// Returns the 'x' dimension of the current CUDA block's id.
get_block_id_x()46 LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
47 
48 /// Returns the 'y' dimension of the current CUDA block's id.
get_block_id_y()49 LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
50 
51 /// Returns the 'z' dimension of the current CUDA block's id.
get_block_id_z()52 LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
53 
54 /// Returns the absolute id of the CUDA block.
get_block_id()55 LIBC_INLINE uint64_t get_block_id() {
56   return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
57          get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
58 }
59 
60 /// Returns the number of CUDA threads in the 'x' dimension.
get_num_threads_x()61 LIBC_INLINE uint32_t get_num_threads_x() {
62   return __nvvm_read_ptx_sreg_ntid_x();
63 }
64 
65 /// Returns the number of CUDA threads in the 'y' dimension.
get_num_threads_y()66 LIBC_INLINE uint32_t get_num_threads_y() {
67   return __nvvm_read_ptx_sreg_ntid_y();
68 }
69 
70 /// Returns the number of CUDA threads in the 'z' dimension.
get_num_threads_z()71 LIBC_INLINE uint32_t get_num_threads_z() {
72   return __nvvm_read_ptx_sreg_ntid_z();
73 }
74 
75 /// Returns the total number of threads in the block.
get_num_threads()76 LIBC_INLINE uint64_t get_num_threads() {
77   return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
78 }
79 
80 /// Returns the 'x' dimension id of the thread in the current CUDA block.
get_thread_id_x()81 LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
82 
83 /// Returns the 'y' dimension id of the thread in the current CUDA block.
get_thread_id_y()84 LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
85 
86 /// Returns the 'z' dimension id of the thread in the current CUDA block.
get_thread_id_z()87 LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
88 
89 /// Returns the absolute id of the thread in the current CUDA block.
get_thread_id()90 LIBC_INLINE uint64_t get_thread_id() {
91   return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
92          get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
93 }
94 
95 /// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
get_lane_size()96 LIBC_INLINE uint32_t get_lane_size() { return 32; }
97 
98 /// Returns the id of the thread inside of a CUDA warp executing together.
get_lane_id()99 [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
100   return __nvvm_read_ptx_sreg_laneid();
101 }
102 
103 /// Returns the bit-mask of active threads in the current warp.
get_lane_mask()104 [[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
105   return __nvvm_activemask();
106 }
107 
108 /// Copies the value from the first active thread in the warp to the rest.
broadcast_value(uint64_t lane_mask,uint32_t x)109 [[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
110                                                            uint32_t x) {
111   uint32_t mask = static_cast<uint32_t>(lane_mask);
112   uint32_t id = __builtin_ffs(mask) - 1;
113   return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
114 }
115 
116 /// Returns a bitmask of threads in the current lane for which \p x is true.
ballot(uint64_t lane_mask,bool x)117 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
118   uint32_t mask = static_cast<uint32_t>(lane_mask);
119   return __nvvm_vote_ballot_sync(mask, x);
120 }
121 
122 /// Waits for all the threads in the block to converge and issues a fence.
sync_threads()123 [[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
124 
125 /// Waits for all pending memory operations to complete in program order.
memory_fence()126 [[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
127 
128 /// Waits for all threads in the warp to reconverge for independent scheduling.
sync_lane(uint64_t mask)129 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
130   __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
131 }
132 
133 /// Shuffles the the lanes inside the warp according to the given index.
shuffle(uint64_t lane_mask,uint32_t idx,uint32_t x)134 [[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
135                                                    uint32_t idx, uint32_t x) {
136   uint32_t mask = static_cast<uint32_t>(lane_mask);
137   uint32_t bitmask = (mask >> idx) & 1;
138   return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
139 }
140 
141 /// Returns the current value of the GPU's processor clock.
processor_clock()142 LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
143 
144 /// Returns a global fixed-frequency timer at nanosecond frequency.
fixed_frequency_clock()145 LIBC_INLINE uint64_t fixed_frequency_clock() {
146   return __builtin_readsteadycounter();
147 }
148 
149 /// Terminates execution of the calling thread.
end_program()150 [[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
151 
152 /// Returns a unique identifier for the process cluster the current warp is
153 /// executing on. Here we use the identifier for the symmetric multiprocessor.
get_cluster_id()154 LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); }
155 
156 } // namespace gpu
157 } // namespace LIBC_NAMESPACE
158 
159 #endif
160