1 /*===--------------------------------------------------------------------------
2 * ATMI (Asynchronous Task and Memory Interface)
3 *
4 * This file is distributed under the MIT License. See LICENSE.txt for details.
5 *===------------------------------------------------------------------------*/
6 #ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_
7 #define SRC_RUNTIME_INCLUDE_INTERNAL_H_
8 #include <inttypes.h>
9 #include <pthread.h>
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14
15 #include <atomic>
16 #include <cstring>
17 #include <deque>
18 #include <map>
19 #include <queue>
20 #include <string>
21 #include <utility>
22 #include <vector>
23
24 #include "hsa.h"
25 #include "hsa_ext_amd.h"
26 #include "hsa_ext_finalize.h"
27
28 #include "atmi.h"
29 #include "atmi_runtime.h"
30 #include "rt.h"
31
32 #define MAX_NUM_KERNELS (1024 * 16)
33
/*
 * Block of implicit arguments used by the runtime.
 *
 * NOTE(review): the field order, widths and compiler-inserted padding are
 * presumably part of a layout shared with device-side code -- confirm before
 * reordering or retyping anything here.
 * NOTE(review): the *_ptr fields and cpu_worker_signals look like addresses
 * stored as integers; `unsigned long` is only 64 bits wide on LP64 targets.
 */
typedef struct atmi_implicit_args_s {
  unsigned long offset_x;
  unsigned long offset_y;
  unsigned long offset_z;
  unsigned long hostcall_ptr;
  char num_gpu_queues;
  unsigned long gpu_queue_ptr;
  char num_cpu_queues;
  unsigned long cpu_worker_signals;
  unsigned long cpu_queue_ptr;
  unsigned long kernarg_template_ptr;
} atmi_implicit_args_t;
46
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
50
/*
 * check(msg, status): if `status` is not HSA_STATUS_SUCCESS, print
 * "<msg> failed." to stdout and terminate the process with exit code 1.
 *
 * `msg` is stringified, so it is written without quotes at the call site.
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe inside an unbraced if/else), and `status` is parenthesized
 * so that low-precedence expressions such as `a & b` compare as a whole.
 */
#define check(msg, status)                                                     \
  do {                                                                         \
    if ((status) != HSA_STATUS_SUCCESS) {                                      \
      printf("%s failed.\n", #msg);                                            \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)
56
/*
 * DEBUG_PRINT(fmt, ...): printf-style diagnostic.
 *
 * With DEBUG defined, the message is written to stderr prefixed with
 * "[file:line] ", but only when core::Runtime::getInstance().getDebugMode()
 * is true. Without DEBUG the macro expands to an empty statement and its
 * arguments are not evaluated.
 *
 * Both variants expand to a do { } while (false) so that the macro is a
 * single statement in either configuration (e.g. usable as the body of an
 * unbraced if that has an else).
 */
#ifdef DEBUG
#define DEBUG_PRINT(fmt, ...)                                                  \
  do {                                                                         \
    if (core::Runtime::getInstance().getDebugMode()) {                         \
      fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__);      \
    }                                                                          \
  } while (false)
#else
#define DEBUG_PRINT(...)                                                       \
  do {                                                                         \
  } while (false)
#endif
67
/*
 * Fallback definition of hsa_signal_t, compiled only when the macro
 * HSA_RUNTIME_INC_HSA_H_ is not defined.
 * NOTE(review): that macro is presumably the include guard of hsa.h, in which
 * case this definition is inactive whenever hsa.h has been included -- confirm.
 */
#ifndef HSA_RUNTIME_INC_HSA_H_
typedef struct hsa_signal_s {
  uint64_t handle; /* opaque signal handle */
} hsa_signal_t;
#endif
73
/* All global values go in this global structure */
typedef struct atl_context_s {
  /* NOTE(review): each flag presumably records that the named subsystem has
   * been initialized -- confirm against the code that sets them. */
  bool struct_initialized;
  bool g_hsa_initialized;
  bool g_gpu_initialized;
  bool g_tasks_initialized;
} atl_context_t;
/* The context object and a pointer to a context; both are only declared here
 * and must be defined in a source file. */
extern atl_context_t atlc;
extern atl_context_t *atlc_p;
83
84 #ifdef __cplusplus
85 }
86 #endif
87
/* ---------------------------------------------------------------------------------
 * Simulated CPU Data Structures and API
 * ---------------------------------------------------------------------------------
 */

/* Wait state used by the runtime; an alias for HSA_WAIT_STATE_BLOCKED. */
#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED
94
// ---------------------- Kernel Start -------------
/*
 * Per-kernel record: a kernel object handle, segment sizes, the argument
 * count and three per-argument vectors (alignment, offset, size).
 * NOTE(review): the three vectors presumably each hold num_args entries in
 * argument order -- confirm against the code that fills them.
 */
typedef struct atl_kernel_info_s {
  uint64_t kernel_object;
  uint32_t group_segment_size;
  uint32_t private_segment_size;
  uint32_t kernel_segment_size;
  uint32_t num_args;
  std::vector<uint64_t> arg_alignments;
  std::vector<uint64_t> arg_offsets;
  std::vector<uint64_t> arg_sizes;
} atl_kernel_info_t;
106
/* Per-symbol record: an address and a size.
 * NOTE(review): presumably the device address and byte size of a global
 * symbol in a loaded code object -- confirm. */
typedef struct atl_symbol_info_s {
  uint64_t addr;
  uint32_t size;
} atl_symbol_info_t;
111
112 extern std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
113 extern std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
114
115 // ---------------------- Kernel End -------------
116
117 extern struct timespec context_init_time;
118
// Forward declarations of runtime classes so this header can refer to them
// by pointer or reference without their full definitions.
namespace core {
class TaskgroupImpl;
class TaskImpl;
class Kernel;
class KernelImpl;
} // namespace core
125
126 struct SignalPoolT {
SignalPoolTSignalPoolT127 SignalPoolT() {
128 // If no signals are created, and none can be created later,
129 // will ultimately fail at pop()
130
131 unsigned N = 1024; // default max pool size from atmi
132 for (unsigned i = 0; i < N; i++) {
133 hsa_signal_t new_signal;
134 hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
135 if (err != HSA_STATUS_SUCCESS) {
136 break;
137 }
138 state.push(new_signal);
139 }
140 DEBUG_PRINT("Signal Pool Initial Size: %lu\n", state.size());
141 }
142 SignalPoolT(const SignalPoolT &) = delete;
143 SignalPoolT(SignalPoolT &&) = delete;
~SignalPoolTSignalPoolT144 ~SignalPoolT() {
145 size_t N = state.size();
146 for (size_t i = 0; i < N; i++) {
147 hsa_signal_t signal = state.front();
148 state.pop();
149 hsa_status_t rc = hsa_signal_destroy(signal);
150 if (rc != HSA_STATUS_SUCCESS) {
151 DEBUG_PRINT("Signal pool destruction failed\n");
152 }
153 }
154 }
sizeSignalPoolT155 size_t size() {
156 lock l(&mutex);
157 return state.size();
158 }
pushSignalPoolT159 void push(hsa_signal_t s) {
160 lock l(&mutex);
161 state.push(s);
162 }
popSignalPoolT163 hsa_signal_t pop(void) {
164 lock l(&mutex);
165 if (!state.empty()) {
166 hsa_signal_t res = state.front();
167 state.pop();
168 return res;
169 }
170
171 // Pool empty, attempt to create another signal
172 hsa_signal_t new_signal;
173 hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
174 if (err == HSA_STATUS_SUCCESS) {
175 return new_signal;
176 }
177
178 // Fail
179 return {0};
180 }
181
182 private:
183 static pthread_mutex_t mutex;
184 std::queue<hsa_signal_t> state;
185 struct lock {
lockSignalPoolT::lock186 lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
~lockSignalPoolT::lock187 ~lock() { pthread_mutex_unlock(m); }
188 pthread_mutex_t *m;
189 };
190 };
191
192 extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
193
194 namespace core {
195 atmi_status_t atl_init_gpu_context();
196
197 hsa_status_t init_hsa();
198 hsa_status_t finalize_hsa();
/*
 * Generic utils
 *
 * The alignment helpers below mask with ~(alignment - 1), so `alignment`
 * must be a non-zero power of two for the result to be meaningful.
 */

/* Rounds an integer value down to the nearest multiple of alignment. */
template <typename T> inline T alignDown(T value, size_t alignment) {
  return static_cast<T>(value & ~(alignment - 1));
}

/* Rounds a pointer's address down to the nearest multiple of alignment
 * (alignment is in bytes). */
template <typename T> inline T *alignDown(T *value, size_t alignment) {
  return reinterpret_cast<T *>(
      alignDown(reinterpret_cast<intptr_t>(value), alignment));
}

/* Rounds an integer value up to the nearest multiple of alignment. */
template <typename T> inline T alignUp(T value, size_t alignment) {
  return alignDown(static_cast<T>(value + alignment - 1), alignment);
}

/* Rounds a pointer's address up to the nearest multiple of alignment
 * (alignment is in bytes). */
template <typename T> inline T *alignUp(T *value, size_t alignment) {
  // Bump the integer address, not the typed pointer: adding to a T* would
  // advance by (alignment - 1) * sizeof(T) bytes instead of alignment - 1.
  const uintptr_t bumped = reinterpret_cast<uintptr_t>(value) + (alignment - 1);
  return reinterpret_cast<T *>(alignDown(bumped, alignment));
}
218
219 extern void register_allocation(void *addr, size_t size,
220 atmi_mem_place_t place);
221 extern hsa_amd_memory_pool_t
222 get_memory_pool_by_mem_place(atmi_mem_place_t place);
223 extern bool atl_is_atmi_initialized();
224
225 bool handle_group_signal(hsa_signal_value_t value, void *arg);
226
227 void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest);
228 uint16_t
229 create_header(hsa_packet_type_t type, int barrier,
230 atmi_task_fence_scope_t acq_fence = ATMI_FENCE_SCOPE_SYSTEM,
231 atmi_task_fence_scope_t rel_fence = ATMI_FENCE_SCOPE_SYSTEM);
232
233 void allow_access_to_all_gpu_agents(void *ptr);
234 } // namespace core
235
236 const char *get_error_string(hsa_status_t err);
237 const char *get_atmi_error_string(atmi_status_t err);
238
/*
 * ATMIErrorCheck(msg, status): if `status` is not ATMI_STATUS_SUCCESS, print
 * "[file:line] <msg> failed: <error string>" to stdout and terminate the
 * process with exit code 1. `msg` is stringified (write it without quotes).
 *
 * `status` is evaluated exactly once and as a whole expression, and the
 * do { } while (0) wrapper makes the macro a single statement.
 */
#define ATMIErrorCheck(msg, status)                                            \
  do {                                                                         \
    const auto atmi_errcheck_status_ = (status);                               \
    if (atmi_errcheck_status_ != ATMI_STATUS_SUCCESS) {                        \
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg,              \
             get_atmi_error_string(atmi_errcheck_status_));                    \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)
247
/*
 * ErrorCheck(msg, status): if `status` is not HSA_STATUS_SUCCESS, print
 * "[file:line] <msg> failed: <error string>" to stdout and terminate the
 * process with exit code 1. `msg` is stringified (write it without quotes).
 *
 * `status` is evaluated exactly once and as a whole expression, and the
 * do { } while (0) wrapper makes the macro a single statement.
 */
#define ErrorCheck(msg, status)                                                \
  do {                                                                         \
    const auto atmi_errcheck_status_ = (status);                               \
    if (atmi_errcheck_status_ != HSA_STATUS_SUCCESS) {                         \
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg,              \
             get_error_string(atmi_errcheck_status_));                         \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)
256
/*
 * ErrorCheckAndContinue(msg, status): for use inside a loop body. If `status`
 * is not HSA_STATUS_SUCCESS, emit a DEBUG_PRINT diagnostic and `continue`
 * with the next iteration of the enclosing loop.
 *
 * This macro deliberately stays an if/else rather than do { } while (0):
 * inside a do-while the `continue` would bind to the macro's own loop and
 * never reach the caller's. `status` is parenthesized so that low-precedence
 * expressions such as `a & b` compare as a whole; note that it is evaluated
 * a second time when the diagnostic is printed.
 */
#define ErrorCheckAndContinue(msg, status)                                     \
  if ((status) != HSA_STATUS_SUCCESS) {                                        \
    DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg,           \
                get_error_string(status));                                     \
    continue;                                                                  \
  } else {                                                                     \
    /* printf("%s succeeded.\n", #msg);*/                                      \
  }
265
266 #endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_
267